# Imports assumed by these snippets; stampy_bin, subsrate, vmem, JOBLOGOUT,
# JOBLOGERR, stampy_sensitive and the various helper functions (get_*,
# convert_sam_to_bam, remove_mapped_tempfiles, ...) come from the surrounding
# pipeline module.
import os
import time
import subprocess as sp

import numpy as np
import pysam


def make_index_and_hash(data_folder, adaID, fragment, VERBOSE=0, summary=True):
    '''Make index and hash files for consensus'''
    frag_gen = fragment[:2]

    # NOTE: we can use --overwrite here, because there is no concurrency (every
    # job has its own hash)
    # 1. Make genome index file
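    # stampy -G <prefix> <fasta> writes the genome index (<prefix>.stidx)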
    sp.call([stampy_bin,
             '--species="HIV fragment '+frag_gen+'"',
             '--overwrite',
             '-G', get_index_file(data_folder, adaID, frag_gen, ext=False),
             get_consensus_filename(data_folder, adaID, frag_gen, trim_primers=True),
             ])
    if VERBOSE:
        print 'Built index: '+adaID+' '+frag_gen
    
    # 2. Build a hash file
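    # stampy -g <prefix> -H <prefix> writes the hash table (<prefix>.sthash)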
    sp.call([stampy_bin,
             '--overwrite',
             '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
             '-H', get_hash_file(data_folder, adaID, frag_gen, ext=False),
             ])
    if VERBOSE:
        print 'Built hash: '+adaID+' '+frag_gen

    if summary:
        with open(get_map_summary_filename(data_folder, adaID, frag_gen), 'a') as f:
            f.write('\n')
            f.write('Stampy index and hash written.')
            f.write('\n')
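
A minimal usage sketch (hypothetical folder and adapter ID; stampy_bin and the
get_* helpers are assumed to come from the surrounding module):

data_folder = '/data/seq_run_01/'   # hypothetical path
adaID = 'TS2'                       # hypothetical adapter ID
for fragment in ('F1', 'F2', 'F3a'):
    make_index_and_hash(data_folder, adaID, fragment, VERBOSE=1)
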
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID, frag_gen,
                                                    rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5      # Default: 40
        stampy_gapextend = 1    # Default: 3

    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60     # Default: 40
        stampy_gapextend = 5    # Default: 3
    else:
        stampy_gapopen = 30     # Default: 40
        stampy_gapextend = 2    # Default: 3

    if VERBOSE:
        print 'Map via stampy: '+adaID+' '+frag_gen

    if not rescue:
        input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')

        # NOTE: the finer fragment nomenclature (e.g. F3a) was introduced late;
        # fall back to the old filename if the new one does not exist
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')

    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped', type='bam')

    # Check existence of the input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')

        raise ValueError(adaID+', fragment '+fragment+': input file not found.')

    # parallelize if requested
    if threads == 1:

        output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                              rescue=rescue)

        # Map
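        # -g/-h point at the index and hash built by make_index_and_hash;
        # the -M argument appended below supplies the reads to map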
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False), 
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy expects the fraction of reads
        # instead of the absolute number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                                  rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (single thread)'

            return

    else:

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')
        for j in xrange(threads):
        
            # Get output filename
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j+1), rescue=rescue)
            # Map
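            # Same stampy call as above, wrapped in an SGE qsub submission;
            # --processpart=j/threads makes each job map only its share of reads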
            call_list = ['qsub','-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm'+adaID.replace('-', '')+frag_gen+str(j+1),
                         '-l', 'h_rt='+cluster_time,
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False), 
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--substitutionrate='+subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)

            if not dry:
                # qsub prints e.g. 'Your job 123456 ("name") has been submitted';
                # the job ID is the third whitespace-separated token
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')

            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor the cluster jobs: poll qstat until every part has left the queue
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j+1), rescue=rescue)
                              for j in xrange(threads)]
        time_wait = 10 # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
            if len(qstat_output) < 3:
                # Only the two header lines are left: the queue is empty
                jobs_done[:] = True
                break
            else:
                # Skip the two header lines; the job ID is the first column
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10 # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                               adaID+', fragment '+frag_gen+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # No need to sleep again after a conversion: it already
                    # takes longer than the wait interval
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                              unsorted=True, rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read name (so that mates are adjacent, as
        # pair_generator expects)
        output_filename_sorted = get_mapped_filename(data_folder, adaID, frag_gen,
                                                     type='bam',
                                                     unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file in place, avoiding a full BAM -> SAM -> BAM round trip
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID '+adaID+', fragment '+frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheadered.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID+', fragment '+frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE, rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
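
A minimal usage sketch for map_stampy (hypothetical folder, adapter ID and
fragment; a local single-threaded run versus a dry run of a four-part cluster
submission):

# Hypothetical sample: map fragment F3a of adapter TS2 locally...
map_stampy('/data/seq_run_01/', 'TS2', 'F3a', VERBOSE=1, threads=1)
# ...or sanity-check a four-part SGE submission without running it
map_stampy('/data/seq_run_01/', 'TS2', 'F3a', VERBOSE=1, threads=4, dry=True)
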
# Excerpt from the calling script (map_to_consensus.py): inside the loop over
# samples, iterate over the fragments of each sample
        for fragment in fragments_sample:

            frag_gen = fragment[:2]

            # Submit self to the cluster if requested
            if submit:
                fork_self(seq_run, adaID, frag_gen,
                          VERBOSE=VERBOSE,
                          threads=threads, maxreads=maxreads,
                          filter_reads=filter_reads,
                          summary=summary,
                          rescue=use_rescue)
                continue

            if summary:
                sfn = get_map_summary_filename(data_folder, adaID, frag_gen,
                                               rescue=use_rescue)
                with open(sfn, 'w') as f:
                    f.write('Call: python map_to_consensus.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+frag_gen+\
                            ' --threads '+str(threads)+\
                            ' --verbose '+str(VERBOSE))
                    if maxreads != -1:
                        f.write(' --maxreads '+str(maxreads))
                    if filter_reads:
                        f.write(' --filter')
                    f.write('\n')

            if not check_consensus_length(data_folder, adaID, fragment, VERBOSE=VERBOSE):