Python cat Examples

Programming Language: Python

Namespace/Package Name: pysam

Method/Function: cat

Examples at hotexamples.com: 10

Python cat - 10 examples found. These are the top rated real world Python examples of pysam.cat extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: scatac.py Project: mfiers/rat

def makeAggregate(cells, directory, suffix, output):
    """
    Create aggregate sample.

    Make an aggregate bam file from a list of cells, sorts and indexes
    the file for easy use in IGV. Suffix is required to prevent non
    0-padded numbers matching the wrong files. Return final file name.

    Parameters
    ----------
    cells : list
        List of cell names to create aggregate from.
    directory : string
        Directory path with the bam files from each cell.
    suffix : string
        String to match the end of the bam file, use to add file extension
        and to anchor the extension after file numbers - this will prevent
        cell_4 matching cell_4*.
    output : string
        String containing output file location.
    """
    from glob import glob
    cells = set(cells)
    fileList = []
    for cell in cells:
        fileList.append(glob(os.path.join(directory, "*" + cell + suffix))[0])
    pysam.cat("-o", output + ".bam", *fileList, catch_stdout=False)
    pysam.sort(output + ".bam", output + ".sorted", catch_stdout=False)
    pysam.index(output + ".sorted.bam", catch_stdout=False)

    return output + ".sorted.bam"

Example #2

Show file

def makeAggregate(cells, directory, suffix, output):
    """
    Create aggregate sample.

    Make an aggregate bam file from a list of cells, sorts and indexes
    the file for easy use in IGV. Suffix is required to prevent non
    0-padded numbers matching the wrong files. Return final file name.

    Parameters
    ----------
    cells : list
        List of cell names to create aggregate from.
    directory : string
        Directory path with the bam files from each cell.
    suffix : string
        String to match the end of the bam file, use to add file extension
        and to anchor the extension after file numbers - this will prevent
        cell_4 matching cell_4*.
    output : string
        String containing output file location.
    """
    from glob import glob
    cells = set(cells)
    fileList = []
    for cell in cells:
        fileList.append(glob(os.path.join(directory, "*" + cell + suffix))[0])
    pysam.cat("-o", output + ".bam", *fileList, catch_stdout=False)
    pysam.sort("-T", output + "_tmp", output + ".bam", "-o", output + ".sorted.bam", catch_stdout=False)
    pysam.index(output + ".sorted.bam", catch_stdout=False)

    return output + ".sorted.bam"

Example #3

Show file

File: BAMboozle.py Project: sandberg-lab/dataprivacy

def collect_bam_chunks(inpath, chrs, outpath, unmapped):
    allpaths = [inpath + ".tmp." + c + ".bam" for c in chrs]
    if unmapped:
        allpaths = [inpath + ".tmp." + c + ".bam" for c in chrs[:-1]]
        allpaths.append(inpath + ".tmp." + "unmapped" + ".bam")
    cat_args = ['-o', outpath] + allpaths
    pysam.cat(*cat_args)
    x = [os.remove(f) for f in allpaths]

Example #4

Show file

def merge_bam(bam_collection,
              output_path,
              template_bam=None,
              sort_order=None,
              threads=1):
    """Merge a readname sorted collection of BAM files."""
    bam_collection = [bam for bam in bam_collection if os.path.exists(bam)]
    if not template_bam:
        template_bam = bam_collection[0]
    pysam.cat('-h', template_bam, '-o', output_path, *bam_collection)
    for bam in bam_collection:
        try:
            os.remove(bam)
        except OSError:
            pass
    if sort_order:
        sort_bam(inpath=output_path,
                 output=output_path,
                 sort_order=sort_order,
                 threads=threads)
    return output_path

Example #5

Show file

File: merge_test.py Project: jhrf/Playground

def run_merge_types():
	bam_paths = sys.argv[1:]

	dum_obj 	= pysam.Samfile(sys.argv[1],"rb")
	unique  	= int(time.time())
	multi_path  = "merge_multi_%d.bam" % (unique,)
	multi_out   = pysam.Samfile(multi_path,"wb",template=dum_obj) 

	start = time.time()
	print "Starting Multi File Merge"
	for cur_path in bam_paths:
		cur_obj = pysam.Samfile(cur_path,"rb")
		for alig in cur_obj.fetch(until_eof=True):
			multi_out.write(alig)
		cur_obj.close()
	multi_out.close()

	print "Multi took %d" % (int(time.time() - start))

	single_path  = "merge_single_%d.bam" % (unique,)
	single_out = pysam.Samfile(single_path,"wb",template=dum_obj)

	start = time.time()
	single_obj = pysam.Samfile(multi_path,"rb")
	print "Starting Single File Merge"
	for alig in single_obj.fetch(until_eof=True):
		single_out.write(alig)
	print "Single took %d" % (int(time.time() - start))
	
	os.remove(multi_path)
	os.remove(single_path)

	print "Starting Samtools Cat"
	start = time.time()
	cat_path = "merge_cat_%d.bam" % (unique,)
	pysam.cat("-o",cat_path,*bam_paths)
	print "Samtools took %d" % (int(time.time() - start))

	dum_obj.close()
	os.remove(cat_path)

Example #6

Show file

File: premap_to_reference.py Project: 5l1v3r1/hivwholeseq

def premap_stampy(data_folder,
                  adaID,
                  VERBOSE=0,
                  threads=1,
                  summary=True,
                  maxreads=-1,
                  subsrate=0.05,
                  gapopen=40,
                  gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' +
                      input_filenames[0])

    # parallelize if requested
    if threads == 1:
        call_list = [
            stampy_bin,
            '--overwrite',
            '-g',
            get_reference_premap_index_filename(data_folder, adaID, ext=False),
            '-h',
            get_reference_premap_hash_filename(data_folder, adaID, ext=False),
            '-o',
            get_premapped_filename(data_folder, adaID, type='sam'),
            '--insertsize=450',
            '--insertsd=100',
            '--substitutionrate=' + str(subsrate),
            '--gapopen=' + str(gapopen),
            '--gapextend=' + str(gapextend),
        ]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(
            get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:

        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [
            get_premapped_filename(data_folder,
                                   adaID,
                                   type='bam',
                                   part=(j + 1)) for j in xrange(threads)
        ]

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l',
                'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem,
                stampy_bin, '--overwrite', '-g',
                get_reference_premap_index_filename(
                    data_folder, adaID, ext=False), '-h',
                get_reference_premap_hash_filename(
                    data_folder, adaID, ext=False), '-o',
                get_premapped_filename(
                    data_folder, adaID, type='sam', part=(j + 1)),
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--insertsize=450', '--insertsd=100', '--substitutionrate=' +
                str(subsrate), '--gapopen=' + str(gapopen),
                '--gapextend=' + str(gapextend), '-M'
            ] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                               adaID+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(data_folder,
                                                 adaID,
                                                 type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(data_folder,
                                                        adaID,
                                                        type='bam',
                                                        unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(data_folder,
                                                 adaID,
                                                 type='sam',
                                                 part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')

Example #7

Show file

File: map_to_consensus.py Project: iosonofabio/hivwholeseq

def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID, frag_gen,
                                                    rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5	    # Default: 40
        stampy_gapextend = 1 	    # Default: 3

    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60	    # Default: 40
        stampy_gapextend = 5 	    # Default: 3
    else:
        stampy_gapopen = 30	    # Default: 40
        stampy_gapextend = 2 	    # Default: 3

    if VERBOSE:
        print 'Map via stampy: '+adaID+' '+frag_gen

    if not rescue: 
        input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')

        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')

    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped', type='bam')

    # Check existance of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')

        raise ValueError(samplename+', fragment '+fragment+': input file not found.')

    # parallelize if requested
    if threads == 1:

        output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                              rescue=rescue)

        # Map
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False), 
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # intead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >=2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                                  rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (single thread)'

            return

    else:

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):
        
            # Get output filename
            output_filename =  get_mapped_filename(data_folder, adaID, frag_gen,
                                               type='sam', part=(j+1), rescue=rescue)
            # Map
            call_list = ['qsub','-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm'+adaID.replace('-', '')+frag_gen+str(j+1),
                         '-l', 'h_rt='+cluster_time,
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False), 
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--substitutionrate='+subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)

            if not dry:
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j+1), rescue=rescue)
                              for j in xrange(threads)]
        time_wait = 10 # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10 # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                               adaID+', fragment '+frag_gen+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                              unsorted=True, rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder, adaID, frag_gen,
                                                     type='bam',
                                                     unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID '+adaID+', fragment '+frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID+', fragment '+frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE, rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')

Example #8

Show file

def map_stampy(data_folder,
               adaID,
               fragment,
               VERBOSE=0,
               threads=1,
               cluster_time='23:59:59',
               maxreads=-1,
               summary=True,
               rescue=False,
               dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder,
                                                    adaID,
                                                    frag_gen,
                                                    rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5  # Default: 40
        stampy_gapextend = 1  # Default: 3

    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60  # Default: 40
        stampy_gapextend = 5  # Default: 3
    else:
        stampy_gapopen = 30  # Default: 40
        stampy_gapextend = 2  # Default: 3

    if VERBOSE:
        print 'Map via stampy: ' + adaID + ' ' + frag_gen

    if not rescue:
        input_filename = get_divided_filename(data_folder,
                                              adaID,
                                              fragment,
                                              type='bam')

        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')

    else:
        input_filename = get_divided_filename(data_folder,
                                              adaID,
                                              'unmapped',
                                              type='bam')

    # Check existance of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')

        raise ValueError(samplename + ', fragment ' + fragment +
                         ': input file not found.')

    # parallelize if requested
    if threads == 1:

        output_filename = get_mapped_filename(data_folder,
                                              adaID,
                                              frag_gen,
                                              type='sam',
                                              rescue=rescue)

        # Map
        call_list = [
            stampy_bin, '-g',
            get_index_file(data_folder, adaID, frag_gen, ext=False), '-h',
            get_hash_file(data_folder, adaID, frag_gen, ext=False), '-o',
            output_filename, '--overwrite', '--substitutionrate=' + subsrate,
            '--gapopen', stampy_gapopen, '--gapextend', stampy_gapextend
        ]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # intead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])

            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder,
                                                  adaID,
                                                  frag_gen,
                                                  type='bam',
                                                  rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (single thread)'

            return

    else:

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):

            # Get output filename
            output_filename = get_mapped_filename(data_folder,
                                                  adaID,
                                                  frag_gen,
                                                  type='sam',
                                                  part=(j + 1),
                                                  rescue=rescue)
            # Map
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N',
                'm' + adaID.replace('-', '') + frag_gen + str(j + 1), '-l',
                'h_rt=' + cluster_time, '-l', 'h_vmem=' + vmem, stampy_bin,
                '-g',
                get_index_file(data_folder, adaID, frag_gen, ext=False), '-h',
                get_hash_file(data_folder, adaID, frag_gen,
                              ext=False), '-o', output_filename, '--overwrite',
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--substitutionrate=' + subsrate, '--gapopen', stampy_gapopen,
                '--gapextend', stampy_gapextend
            ]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)

            if not dry:
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [
            get_mapped_filename(data_folder,
                                adaID,
                                frag_gen,
                                type='bam',
                                part=(j + 1),
                                rescue=rescue) for j in xrange(threads)
        ]
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                               adaID+', fragment '+frag_gen+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder,
                                              adaID,
                                              frag_gen,
                                              type='bam',
                                              unsorted=True,
                                              rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder,
                                                     adaID,
                                                     frag_gen,
                                                     type='bam',
                                                     unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID ' + adaID + ', fragment ' + frag_gen
        header_filename = get_mapped_filename(data_folder,
                                              adaID,
                                              frag_gen,
                                              type='sam',
                                              part=1,
                                              rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID + ', fragment ' + frag_gen
    remove_mapped_tempfiles(data_folder,
                            adaID,
                            frag_gen,
                            VERBOSE=VERBOSE,
                            rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')

Example #9

Show file

File: premap_to_reference.py Project: iosonofabio/hivwholeseq

def premap_stampy(data_folder,
                  adaID,
                  VERBOSE=0,
                  threads=1,
                  summary=True,
                  maxreads=-1,
                  subsrate=0.05,
                  gapopen=40,
                  gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' +
                      input_filenames[0])

    # parallelize if requested
    if threads == 1:
        call_list = [
            stampy_bin,
            '--overwrite',
            '-g',
            get_reference_premap_index_filename(data_folder, adaID, ext=False),
            '-h',
            get_reference_premap_hash_filename(data_folder, adaID, ext=False),
            '-o',
            get_premapped_filename(data_folder, adaID, type='sam'),
            '--insertsize=450',
            '--insertsd=100',
            '--substitutionrate=' + str(subsrate),
            '--gapopen=' + str(gapopen),
            '--gapextend=' + str(gapextend),
        ]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(
            get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:

        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [
            get_premapped_filename(
                data_folder, adaID, type='bam', part=(j + 1))
            for j in xrange(threads)
        ]

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l',
                'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem,
                stampy_bin, '--overwrite', '-g',
                get_reference_premap_index_filename(
                    data_folder, adaID, ext=False), '-h',
                get_reference_premap_hash_filename(
                    data_folder, adaID, ext=False), '-o',
                get_premapped_filename(
                    data_folder, adaID, type='sam', part=(j + 1)),
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--insertsize=450', '--insertsd=100', '--substitutionrate=' +
                str(subsrate), '--gapopen=' + str(gapopen),
                '--gapextend=' + str(gapextend), '-M'
            ] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                               adaID+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(
            data_folder, adaID, type='bam', unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(
            data_folder, adaID, type='bam', unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(
            data_folder, adaID, type='sam', part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')

Example #10

Show file

File: map_to_initial_reference.py Project: 5l1v3r1/hivwholeseq

def map_stampy_multithread(sample, fragment, VERBOSE=0, threads=2, summary=True,
                           filtered=True):
    '''Map using stampy, multithread (via cluster requests, queueing race conditions possible)'''
    import hivwholeseq
    JOBDIR = hivwholeseq.__path__[0].rstrip('/')+'/'
    JOBLOGOUT = JOBDIR+'logout/'
    JOBLOGERR = JOBDIR+'logerr/'
    cluster_time = ['23:59:59', '0:59:59']
    vmem = '8G'

    pname = patient.id
    sample = patient.sample_table.loc[samplename]
    seq_run = sample['run']
    data_folder = MiSeq_runs[seq_run]['folder']
    adaID = sample['adaID']

    if VERBOSE:
        print 'Map via stampy: '+pname+' '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename, fragment)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample['fragments'])
    if not len(frag_spec):
        raise ValueError(str(patient)+', '+samplename+': fragment '+fragment+' not found.')
    frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam')

    # Submit map scripts in parallel to the cluster
    jobs_done = np.zeros(threads, bool)
    job_IDs = np.zeros(threads, 'S30')
    for j in xrange(threads):
    
        output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                         fragment,
                                                         type='sam', part=(j+1))
        # Map
        call_list = ['qsub','-cwd',
                     '-b', 'y',
                     '-S', '/bin/bash',
                     '-o', JOBLOGOUT,
                     '-e', JOBLOGERR,
                     '-N', 'm '+samplename+fragment+' p'+str(j+1),
                     '-l', 'h_rt='+cluster_time[threads >= 10],
                     '-l', 'h_vmem='+vmem,
                     stampy_bin,
                     '--overwrite',
                     '-g', get_initial_index_filename(pname, fragment, ext=False),
                     '-h', get_initial_hash_filename(pname, fragment, ext=False),
                     '-o', output_filename,
                     '--processpart='+str(j+1)+'/'+str(threads),
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')
        call_list = call_list + ['-M', input_filename]

        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        job_ID = sp.check_output(call_list)
        job_ID = job_ID.split()[2]
        job_IDs[j] = job_ID

    # Monitor output
    output_file_parts = [get_mapped_to_initial_filename(pname, samplename,
                                                        fragment,
                                                        type='bam', part=(j+1))
                         for j in xrange(threads)]
    time_wait = 10 # secs
    while not jobs_done.all():

        # Sleep some time
        time.sleep(time_wait)

        # Get the output of qstat to check the status of jobs
        qstat_output = sp.check_output(['qstat'])
        qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
        if len(qstat_output) < 3:
            jobs_done[:] = True
            break
        else:
            qstat_output = [line.split()[0] for line in qstat_output[2:]]

        time_wait = 10 # secs
        for j in xrange(threads):
            if jobs_done[j]:
                continue

            if job_IDs[j] not in qstat_output:
                # Convert to BAM for merging
                if VERBOSE >= 1:
                    print 'Convert mapped reads to BAM for merging: sample '+\
                           samplename+', part '+str(j+1)+ ' of '+ \
                           str(threads)
                convert_sam_to_bam(output_file_parts[j])
                # We do not need to wait if we did the conversion (it takes
                # longer than some secs)
                time_wait = 0
                jobs_done[j] = True

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped ('+str(threads)+' threads).\n')

    # Concatenate output files
    output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                     fragment,
                                                     type='bam', unsorted=True)
    if VERBOSE >= 1:
        print 'Concatenate premapped reads: sample '+samplename
    pysam.cat('-o', output_filename, *output_file_parts)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('BAM files concatenated (unsorted).\n')

    # Sort the file by read names (to ensure the pair_generator)
    output_filename_sorted = get_mapped_to_initial_filename(pname, samplename,
                                                            fragment,
                                                            type='bam')
    # NOTE: we exclude the extension and the option -f because of a bug in samtools
    if VERBOSE >= 1:
        print 'Sort mapped reads: sample '+samplename
    pysam.sort('-n', output_filename, output_filename_sorted[:-4])
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file sorted.\n')

    # Reheader the file without BAM -> SAM -> BAM
    if VERBOSE >= 1:
        print 'Reheader mapped reads: sample '+samplename
    header_filename = get_mapped_to_initial_filename(pname, samplename,
                                                     fragment,
                                                     type='sam', part=1)
    pysam.reheader(header_filename, output_filename_sorted)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: sample '+samplename
    remove_mapped_init_tempfiles(pname, samplename, fragment, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')