Example #1
0
def trim_reads(data_folder,
               adaID,
               VERBOSE=0,
               summary=True,
               quality=25,
               blocksize=10,
               minlen_read1=100,
               minlen_read2=50):
    '''Trim low quality at the end of reads'''
    fn_in = get_read_filenames(data_folder, adaID, gzip=True)
    fn_out = get_read_filenames(data_folder, adaID, gzip=True, trimmed=True)

    n_good = 0
    n_discarded = 0

    with gzip.open(fn_in[0], 'rb') as fin1, \
         gzip.open(fn_in[1], 'rb') as fin2, \
         gzip.open(fn_out[0], 'wb') as fout1, \
         gzip.open(fn_out[1], 'wb') as fout2:

        it1 = SeqIO.parse(fin1, 'fastq')
        it2 = SeqIO.parse(fin2, 'fastq')
        for irp, reads in enumerate(izip(it1, it2)):

            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    print irp + 1

            # Trim both reads
            trims = [
                trim_read(read, quality=quality, blocksize=blocksize)
                for read in reads
            ]

            lrs = map(len, trims)
            if (lrs[0] > minlen_read1) and (lrs[1] > minlen_read2):
                SeqIO.write(trims[0], fout1, 'fastq')
                SeqIO.write(trims[1], fout2, 'fastq')
                n_good += 1
            else:
                n_discarded += 1

    if VERBOSE:
        print 'Trim lowq ends of reads:'
        print 'Good:', n_good
        print 'Discarded:', n_discarded

    # Write summary to file
    if summary:
        with open(get_trim_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim low quality ends results: adaID ' + adaID + '\n')
            f.write('Total:\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t' + str(n_good) + '\n')
            f.write('Discarded:\t' + str(n_discarded) + '\n')
def trim_reads(data_folder, adaID, VERBOSE=0, summary=True, quality=25, blocksize=10,
               minlen_read1=100, minlen_read2=50):
    '''Trim low quality at the end of reads'''
    fn_in = get_read_filenames(data_folder, adaID, gzip=True)
    fn_out = get_read_filenames(data_folder, adaID, gzip=True, trimmed=True)

    n_good = 0
    n_discarded = 0

    with gzip.open(fn_in[0], 'rb') as fin1, \
         gzip.open(fn_in[1], 'rb') as fin2, \
         gzip.open(fn_out[0], 'wb') as fout1, \
         gzip.open(fn_out[1], 'wb') as fout2:

        it1 = SeqIO.parse(fin1, 'fastq')
        it2 = SeqIO.parse(fin2, 'fastq')
        for irp, reads in enumerate(izip(it1, it2)):

            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    print irp + 1

            # Trim both reads
            trims = [trim_read(read, quality=quality, blocksize=blocksize)
                     for read in reads]

            lrs = map(len, trims)
            if (lrs[0] > minlen_read1) and (lrs[1] > minlen_read2):
                SeqIO.write(trims[0], fout1, 'fastq')
                SeqIO.write(trims[1], fout2, 'fastq')
                n_good += 1
            else:
                n_discarded += 1

    if VERBOSE:
        print 'Trim lowq ends of reads:'
        print 'Good:', n_good
        print 'Discarded:', n_discarded

    # Write summary to file
    if summary:
        with open(get_trim_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim low quality ends results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t'+str(n_good)+'\n')
            f.write('Discarded:\t'+str(n_discarded)+'\n')
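
A minimal usage sketch for trim_reads follows (hypothetical: the run folder and adapter ID are placeholders, and get_read_filenames plus the summary helpers are assumed to be importable from hivwholeseq):

# Hypothetical usage sketch, not part of the original module; the folder path
# and adapter ID below are placeholders.
data_folder = '/path/to/MiSeq_run_folder/'   # placeholder run folder
adaID = 'TS2'                                # placeholder adapter ID
# Keep pairs whose trimmed reads exceed 100 bp (read 1) and 50 bp (read 2)
trim_reads(data_folder, adaID, VERBOSE=1, quality=25,
           minlen_read1=100, minlen_read2=50)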
Example #3
0
def gunzip_demultiplexed_reads(data_folder, adaID, VERBOSE=0):
    '''Gunzip FastQ.gz demultiplexed files'''
    from hivwholeseq.sequencing.filenames import get_read_filenames

    fns = get_read_filenames(data_folder, adaID, gzip=True)
    for fn in fns:
        if not os.path.isfile(fn):
            continue
        sp.call(['gunzip', fn])
        if VERBOSE >= 2:
            print 'Gunzipped:', fn
def premap_stampy(data_folder,
                  adaID,
                  VERBOSE=0,
                  threads=1,
                  summary=True,
                  maxreads=-1,
                  subsrate=0.05,
                  gapopen=40,
                  gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' +
                      input_filenames[0])

    # Call stampy directly for a single thread, or parallelize via qsub if requested
    if threads == 1:
        call_list = [
            stampy_bin,
            '--overwrite',
            '-g',
            get_reference_premap_index_filename(data_folder, adaID, ext=False),
            '-h',
            get_reference_premap_hash_filename(data_folder, adaID, ext=False),
            '-o',
            get_premapped_filename(data_folder, adaID, type='sam'),
            '--insertsize=450',
            '--insertsd=100',
            '--substitutionrate=' + str(subsrate),
            '--gapopen=' + str(gapopen),
            '--gapextend=' + str(gapextend),
        ]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(
            get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:

        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [
            get_premapped_filename(data_folder,
                                   adaID,
                                   type='bam',
                                   part=(j + 1)) for j in xrange(threads)
        ]

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l',
                'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem,
                stampy_bin, '--overwrite', '-g',
                get_reference_premap_index_filename(
                    data_folder, adaID, ext=False), '-h',
                get_reference_premap_hash_filename(
                    data_folder, adaID, ext=False), '-o',
                get_premapped_filename(
                    data_folder, adaID, type='sam', part=(j + 1)),
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--insertsize=450', '--insertsd=100', '--substitutionrate=' +
                str(subsrate), '--gapopen=' + str(gapopen),
                '--gapextend=' + str(gapextend), '-M'
            ] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                               adaID+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(data_folder,
                                                 adaID,
                                                 type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (so that mates are adjacent for pair_generator)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(data_folder,
                                                        adaID,
                                                        type='bam',
                                                        unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file in place, avoiding a BAM -> SAM -> BAM round trip
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(data_folder,
                                                 adaID,
                                                 type='sam',
                                                 part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')
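
A hedged invocation sketch for premap_stampy (hypothetical values: the run folder and adapter ID are placeholders; with threads=1 stampy runs in the current process, while threads > 1 submits one qsub job per part and merges the resulting BAM files):

# Hypothetical usage sketch, not part of the original module; the path and
# adapter ID below are placeholders.
data_folder = '/path/to/MiSeq_run_folder/'   # placeholder run folder
adaID = 'TS2'                                # placeholder adapter ID
# Single-threaded premap; threads > 1 requires an SGE cluster with qsub/qstat
premap_stampy(data_folder, adaID, VERBOSE=1, threads=1,
              subsrate=0.05, gapopen=40, gapextend=3)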
Example #5
0
    parser.add_argument('--adaID',
                        required=True,
                        help='Adapter ID to analyze (e.g. TS2)')

    args = parser.parse_args()
    seq_run = args.run
    VERBOSE = args.verbose
    maxreads = args.maxreads
    adaID = args.adaID

    # Specify the dataset
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']

    # Get some reads
    fns = get_read_filenames(data_folder, adaID, gzip=True)
    with gzip.open(fns[0], 'rb') as fh1, gzip.open(fns[1], 'rb') as fh2:
        reads_iter1 = SeqIO.parse(fh1, 'fastq')
        reads_iter2 = SeqIO.parse(fh2, 'fastq')

        read_pairs = []
        inds = 20000 + np.arange(100000)
        np.random.shuffle(inds)
        inds = np.sort(inds[:10])
        ii = 0
        for irp, reads in enumerate(izip(reads_iter1, reads_iter2)):
            if irp == inds[ii]:
                read_pairs.append(reads)
                ii += 1

                if ii == len(inds):
                    break
Example #6
0
def demultiplex_reads_single_index(data_folder,
                                   data_filenames,
                                   adapters_designed,
                                   maxreads=-1,
                                   VERBOSE=0,
                                   summary=True):
    '''Demultiplex reads with single index adapters'''

    # Get the read filenames
    datafile_read1 = data_filenames['read1']
    datafile_read2 = data_filenames['read2']
    datafile_adapter = data_filenames['adapter']

    # Open output files (compressed)
    fouts = {
        adaID: [
            gzip.open(fn, 'wb', compresslevel=9)
            for fn in get_read_filenames(data_folder, adaID, gzip=True)
        ]
        for adaID, _ in adapters_designed
    }

    fouts['unclassified'] = [
        gzip.open(fn, 'wb', compresslevel=9)
        for fn in get_unclassified_reads_filenames(data_folder, gzip=True)
    ]

    adapters_designed_inv = dict(map(reversed, adapters_designed))
    adapters_strings = map(itemgetter(1), adapters_designed)

    # Make sure you close the files
    try:

        # Iterate over all reads (using fast iterators)
        with gzip.open(datafile_read1, 'rb') as fh1,\
             gzip.open(datafile_read2, 'rb') as fh2,\
             gzip.open(datafile_adapter, 'rb') as fha:

            if VERBOSE >= 3:
                print 'adaID'
                print '--------------------'

            adapters_found = Counter()
            for i, (read1, read2, adapter) in enumerate(
                    izip(FGI(fh1), FGI(fh2), SeqIO.parse(fha, 'fastq'))):

                if i == maxreads:
                    if VERBOSE:
                        print 'Maxreads reached.'
                    break

                # Print some output
                if VERBOSE and (not ((i + 1) % 10000)):
                    print i + 1

                # Keep a tally of every adapter sequence seen
                adapter_string = str(adapter.seq)
                adapters_found[adapter_string] += 1

                # If the adapter does not match any known one,
                # throw the reads into the unclassified bin
                if adapter_string not in adapters_strings:
                    adaID = 'unclassified'
                else:
                    adaID = adapters_designed_inv[adapter_string]

                if VERBOSE >= 3:
                    print adaID

                # Write sequences (append to file, manual but fast)
                fouts[adaID][0].write("@%s\n%s\n+\n%s\n" % read1)
                fouts[adaID][1].write("@%s\n%s\n+\n%s\n" % read2)
                if adapter_string not in adapters_strings:
                    SeqIO.write(adapter, fouts['unclassified'][2], 'fastq')

    finally:
        # Close all adaIDs
        for fout in fouts.itervalues():
            # Close both read 1 and read 2 (and barcode for unclassified)
            for fou in fout:
                fou.close()

        if summary:
            with open(get_demultiplex_summary_filename(data_folder), 'a') as f:
                f.write('\n')
                f.write('Total number of reads demultiplexed: ' + str(i + 1) +
                        '\n')
                f.write('Adapters found across all reads:\n')
                for e in adapters_found.most_common():
                    f.write('\t'.join(map(str, e)) + '\n')
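
A sketch of how demultiplex_reads_single_index might be invoked (hypothetical: all filenames and adapter sequences are placeholders; adapters_designed is iterated as (adaID, adapter sequence) pairs, so a list of tuples works):

# Hypothetical usage sketch, not part of the original module; all paths and
# adapter sequences below are placeholders.
data_folder = '/path/to/MiSeq_run_folder/'                     # placeholder
data_filenames = {'read1': data_folder + 'lane1_R1.fastq.gz',  # placeholder
                  'read2': data_folder + 'lane1_R3.fastq.gz',
                  'adapter': data_folder + 'lane1_R2.fastq.gz'}
adapters_designed = [('TS2', 'CGATGT'), ('TS4', 'TGACCA')]     # placeholder pairs
demultiplex_reads_single_index(data_folder, data_filenames, adapters_designed,
                               maxreads=10000, VERBOSE=1)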
    seq_run = args.run
    VERBOSE = args.verbose
    submit = args.submit
    maxreads = args.maxreads
    adaID = args.adaID
    savefig = args.savefig

    if submit:
        fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, savefig=savefig)
        sys.exit()

    dataset = load_sequencing_run(seq_run)
    data_folder = dataset.folder
    read_len = dataset.cycles // 2

    reads_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(reads_filenames[0]):
        reads_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    title = seq_run + ", " + adaID

    quality = quality_score_along_reads(
        read_len, reads_filenames, randomreads=(maxreads >= 1), maxreads=maxreads, VERBOSE=VERBOSE
    )

    plot_cuts_quality_along_reads(data_folder, adaID, quality, title=title, VERBOSE=VERBOSE, savefig=savefig)

    # if plotfull:
    #    plot_quality_along_reads(data_folder, adaID, title,
    #                             quality, VERBOSE=VERBOSE,
    #                             savefig=savefig)
Example #8
0
                        help='Maximal number of reads to analyze')
    parser.add_argument('--adaID', required=True,
                        help='Adapter ID to analyze (e.g. TS2)')

    args = parser.parse_args()
    seq_run = args.run
    VERBOSE = args.verbose
    maxreads = args.maxreads
    adaID = args.adaID

    # Specify the dataset
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']

    # Get some reads
    fns = get_read_filenames(data_folder, adaID, gzip=True)
    with gzip.open(fns[0], 'rb') as fh1, gzip.open(fns[1], 'rb') as fh2:
        reads_iter1 = SeqIO.parse(fh1, 'fastq')
        reads_iter2 = SeqIO.parse(fh2, 'fastq')

        read_pairs = []
        inds = 20000 + np.arange(100000)
        np.random.shuffle(inds)
        inds = np.sort(inds[:10])
        ii = 0
        for irp, reads in enumerate(izip(reads_iter1, reads_iter2)):
            if irp == inds[ii]:
                read_pairs.append(reads)
                ii += 1

                if ii == len(inds):
                    break
def premap_stampy(data_folder,
                  adaID,
                  VERBOSE=0,
                  threads=1,
                  summary=True,
                  maxreads=-1,
                  subsrate=0.05,
                  gapopen=40,
                  gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' +
                      input_filenames[0])

    # Call stampy directly for a single thread, or parallelize via qsub if requested
    if threads == 1:
        call_list = [
            stampy_bin,
            '--overwrite',
            '-g',
            get_reference_premap_index_filename(data_folder, adaID, ext=False),
            '-h',
            get_reference_premap_hash_filename(data_folder, adaID, ext=False),
            '-o',
            get_premapped_filename(data_folder, adaID, type='sam'),
            '--insertsize=450',
            '--insertsd=100',
            '--substitutionrate=' + str(subsrate),
            '--gapopen=' + str(gapopen),
            '--gapextend=' + str(gapextend),
        ]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(
            get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:

        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [
            get_premapped_filename(
                data_folder, adaID, type='bam', part=(j + 1))
            for j in xrange(threads)
        ]

        # Submit map script
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = [
                'qsub', '-cwd', '-b', 'y', '-S', '/bin/bash', '-o', JOBLOGOUT,
                '-e', JOBLOGERR, '-N', adaID + ' p' + str(j + 1), '-l',
                'h_rt=' + cluster_time[threads >= 30], '-l', 'h_vmem=' + vmem,
                stampy_bin, '--overwrite', '-g',
                get_reference_premap_index_filename(
                    data_folder, adaID, ext=False), '-h',
                get_reference_premap_hash_filename(
                    data_folder, adaID, ext=False), '-o',
                get_premapped_filename(
                    data_folder, adaID, type='sam', part=(j + 1)),
                '--processpart=' + str(j + 1) + '/' + str(threads),
                '--insertsize=450', '--insertsd=100', '--substitutionrate=' +
                str(subsrate), '--gapopen=' + str(gapopen),
                '--gapextend=' + str(gapextend), '-M'
            ] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split(
                '\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                               adaID+', part '+str(j+1)+ ' of '+ \
                               str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(
            data_folder, adaID, type='bam', unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (so that mates are adjacent for pair_generator)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(
            data_folder, adaID, type='bam', unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file in place, avoiding a BAM -> SAM -> BAM round trip
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(
            data_folder, adaID, type='sam', part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID ' + adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')
    if VERBOSE >= 3:
        print 'adaIDs', adaIDs

    # Iterate over adaIDs
    for adaID in adaIDs:

        outdir = os.getcwd()+'/q_control_'+adaID
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except:
                print "cannot make directory:",outdir
                sys.exit(1)
    
        # Repeat both for read 1 and read 2
        datafiles = get_read_filenames(data_folder, adaID, filtered=False)
        datafiles = {'read 1': datafiles[0],
                     'read 2': datafiles[1]}
        histograms = {}
    
        for readname, datafile in datafiles.iteritems():
        
            # Result data structures
            longest_good_block = np.zeros(L+1, int)
            first_bad_nucleotide = np.zeros(L+1, int)
            phred_score_dis = np.zeros((50,L))
        
            # Read data
            with open(datafile, 'r') as f: 
                seq_iter = SeqIO.parse(f, 'fastq')
        
Example #11
0
def demultiplex_reads_single_index(data_folder, data_filenames, adapters_designed,
                                   maxreads=-1, VERBOSE=0, summary=True):
    '''Demultiplex reads with single index adapters'''

    # Get the read filenames
    datafile_read1 = data_filenames['read1']
    datafile_read2 = data_filenames['read2']
    datafile_adapter = data_filenames['adapter']

    # Open output files (compressed)
    fouts = {adaID: [gzip.open(fn, 'wb', compresslevel=9)
             for fn in get_read_filenames(data_folder, adaID, gzip=True)]
             for adaID, _ in adapters_designed}

    fouts['unclassified'] = [gzip.open(fn, 'wb', compresslevel=9)
                             for fn in get_unclassified_reads_filenames(data_folder, gzip=True)]

    adapters_designed_inv = dict(map(reversed, adapters_designed))
    adapters_strings = map(itemgetter(1), adapters_designed)

    # Make sure you close the files
    try:

        # Iterate over all reads (using fast iterators)
        with gzip.open(datafile_read1, 'rb') as fh1,\
             gzip.open(datafile_read2, 'rb') as fh2,\
             gzip.open(datafile_adapter, 'rb') as fha:

            if VERBOSE >= 3:
                print 'adaID'
                print '--------------------'

            adapters_found = Counter()
            for i, (read1, read2, adapter) in enumerate(izip(FGI(fh1), FGI(fh2),
                                                             SeqIO.parse(fha, 'fastq'))):

                if i == maxreads:
                    if VERBOSE:
                        print 'Maxreads reached.'
                    break

                # Print some output
                if VERBOSE and (not ((i + 1) % 10000)):
                    print i + 1

                # Keep a tally of every adapter sequence seen
                adapter_string = str(adapter.seq)
                adapters_found[adapter_string] += 1

                # If the adapter does not match any known one,
                # throw the reads into the unclassified bin
                if adapter_string not in adapters_strings:
                    adaID = 'unclassified'
                else:
                    adaID = adapters_designed_inv[adapter_string]
            
                if VERBOSE >= 3:
                    print adaID

                # Write sequences (append to file, manual but fast)
                fouts[adaID][0].write("@%s\n%s\n+\n%s\n" % read1)
                fouts[adaID][1].write("@%s\n%s\n+\n%s\n" % read2)
                if adapter_string not in adapters_strings:
                    SeqIO.write(adapter, fouts['unclassified'][2], 'fastq')

    finally:
        # Close all adaIDs
        for fout in fouts.itervalues():
            # Close both read 1 and read 2 (and barcode for unclassified)
            for fou in fout:
                fou.close()

        if summary:
            with open(get_demultiplex_summary_filename(data_folder), 'a') as f:
                f.write('\n')
                f.write('Total number of reads demultiplexed: '+str(i+1)+'\n')
                f.write('Adapters found across all reads:\n')
                for e in adapters_found.most_common():
                    f.write('\t'.join(map(str, e))+'\n')
    seq_run = args.run
    VERBOSE = args.verbose
    submit = args.submit
    maxreads = args.maxreads
    adaID = args.adaID
    savefig = args.savefig

    if submit:
        fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, savefig=savefig)
        sys.exit()

    dataset = load_sequencing_run(seq_run)
    data_folder = dataset.folder
    read_len = dataset.cycles // 2

    reads_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(reads_filenames[0]):
        reads_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    title = seq_run + ', ' + adaID

    quality = quality_score_along_reads(read_len,
                                        reads_filenames,
                                        randomreads=(maxreads >= 1),
                                        maxreads=maxreads,
                                        VERBOSE=VERBOSE)

    plot_cuts_quality_along_reads(data_folder,
                                  adaID,
                                  quality,
                                  title=title,
                                  VERBOSE=VERBOSE,