Example #1
def merge_pair_libs(dataset, run_id):
    """Merge read pairs from Illumina sample libs and output FastA."""
    # identify inputs and outputs
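    # NOTE: root_dir, dirs, ensure_dir and datetime (via 'from datetime
    # import datetime') are assumed to be module-level config and imports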
    set_id = dataset['set_id']
    print " ", set_id
    run_root = root_dir+set_id+"/"+run_id+"/"
    dmx_root = run_root+dirs['demux']
    merged_root = run_root+dirs['merged']
    report_root = run_root+dirs['reports']
    master_file = merged_root+run_id+".fas"
    ensure_dir(merged_root)
    ensure_dir(report_root)
    merger_file = report_root+"merged_pairs.html"
    cntsplt = report_root+"merge_counts"  # chart file root ('.png' added on plotting)
    samples = dataset['samples']
    # set up files for reporting
    html_comps = ["<p><b>Read pairs merged for run ", run_id, "</b></p>",
                  "<p><img src='merge_counts.png' alt='merge_counts'/></p>",
                  "<p><table border='1'><tr>",
                  "<th>Sample</th>",
                  "<th>Accepted</th>",
                  "<th>Rejected</th>",
                  "<th>Total</th>",
                  "<th>% OK</th></tr>"]
    html_block = "".join(html_comps)
    open(merger_file, 'w').write(html_block)
    # initialize master file
    open(master_file, 'w').write('')
    # merge per sample (demuxed)
    merge_countA = []
    merge_countR = []
    sample_ids = samples.keys()
    for sample_id in sample_ids:
        print "\t", sample_id,
        lib_file = dmx_root+sample_id+"_readpairs.txt"
        merge_out = merged_root+sample_id+"_merged.fas"
        open(merge_out, 'w').write('')
        # prepare container and files for output batching and reporting
        buffer = []
        countY = 0
        countF = 0
        countN = 0
        # iterate through the read pairs
        count = 0
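        # FastqGGIterator (not defined here) is assumed to yield one record
        # per read pair, with titles, seqs and quals each holding [fwd, rev]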
        for titles, seqs, quals in FastqGGIterator(open(lib_file)):
            count += 1
            seq1 = seqs[0]
            seq2 = seqs[1]
            qual1 = quals[0]
            qual2 = quals[1]
            # merge reads   TODO: better safeguard against merge failure
            try:
                merged = merge_overlaps(seq1, qual1, seq2, qual2)
            except Exception:
                countF += 1
            else:
                if 'N' in merged:
                    countN += 1  # residual Ns mean quality was too low
                else:
                    countY += 1
                    # compose string for output
                    mcomps = [">",sample_id,"_",str(count),"\n",merged,"\n"]
                    mstring = "".join(mcomps)
                    # output to buffer
                    buffer.append(mstring)
            # flush the buffer to both output files once it fills up
            if len(buffer) >= 10000:
                dump_buffer(merge_out, buffer)
                dump_buffer(master_file, buffer)
                buffer = []
        # write out whatever remains in the buffer
        dump_buffer(merge_out, buffer)
        dump_buffer(master_file, buffer)
        # sum up
        assert countY+countF+countN == count
        print count, "pairs", datetime.now()
        print "\t\t", str(countY), "merged and accepted"
        print "\t\t", str(countN), "merged but rejected due to residual Ns"
        print "\t\t", str(countF), "failed to merge"
        # add line in QC file
        html_comps = ["<tr>",
                      "<th>", sample_id, "</b></th>",
                      "<td>", str(countY), "</td>",
                      "<td>", str(countN + countF), "</td>",
                      "<td>", str(count), "</td>",
                      "<td>", str(int((float(countY)/count)*100)),
                      "</td></tr>"]
        html_block = "".join(html_comps)
        open(merger_file, 'a').write(html_block)
        # pass values
        merge_countA.append(countY)
        merge_countR.append(countN+countF)
    # close table and add notes
    line_N = "either failed to merge or still contained Ns after merging"
    html_comps = ["</table></p>",
                  "<p><b>", "Rejected", ":</b> ", line_N, "</p>"]
    html_block = "".join(html_comps)
    open(merger_file, 'a').write(html_block)
    # plot the read counts per sample
    series = merge_countA, merge_countR
    legend = 'Accepted', 'Rejected'
    colors = 'g', 'r'
    titles = 'Number of read pairs', 'Read pairs merged per sample'
    two_storey_bar_chart(series, sample_ids, legend, colors, cntsplt, titles)
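
# For reference: dump_buffer and two_storey_bar_chart are not defined in this
# example. The sketches below are plausible implementations inferred from how
# the helpers are called above; the real versions may differ.
def dump_buffer(file_name, buffer):
    """Append the accumulated strings in buffer to file_name (assumed)."""
    handle = open(file_name, 'a')
    handle.write(''.join(buffer))
    handle.close()

def two_storey_bar_chart(series, labels, legend, colors, file_root, titles):
    """Save a stacked ('two-storey') bar chart to file_root+'.png' (assumed)."""
    import matplotlib.pyplot as plt
    bottom, top = series
    positions = range(len(labels))
    plt.bar(positions, bottom, color=colors[0], label=legend[0])
    plt.bar(positions, top, bottom=bottom, color=colors[1], label=legend[1])
    plt.xticks(positions, labels, rotation=90)
    plt.ylabel(titles[0])
    plt.title(titles[1])
    plt.legend()
    plt.tight_layout()
    plt.savefig(file_root+".png")
    plt.close()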
Example #2
def demux_illumina(dataset, max_pairs, run_id):
    """Demultiplex Illumina dataset.

    From separate forward/reverse read sets, combine read pairs and output
    to separate files for each sample based on barcode tags. As part of the
    process, reject read pairs that have mismatching tags or primers and trim
    the rest, removing primer+tag and low-quality sequences.
    """
    # identify inputs and outputs
    set_id = dataset['set_id']
    print " ", set_id
    run_root = root_dir+set_id+"/"+run_id+"/"
    ori_root = root_dir+set_id+"/"+dirs['master']
    fwd_file = ori_root+dataset['source_fwd']
    rev_file = ori_root+dataset['source_rev']
    demux_root = run_root+dirs['demux']
    report_root = run_root+dirs['reports']
    qc_dir = "qc_details/"
    qc_main_file = report_root+"quality_control.html"
    cntsplt = report_root+"sample_counts"
    ensure_dir(ori_root)
    ensure_dir(demux_root)
    ensure_dir(report_root)
    ensure_dir(report_root+qc_dir)
    # set up files for reporting
    html_comps = ["<p><b>Quality control for run "+run_id+"</b></p>",
                  "<p><img src='sample_counts.png' alt='sample_counts'/></p>",
                  "<p><table border='1'><tr>",
                  "<th>Sample</th>",
                  "<th>Accepted</th>",
                  "<th>Rejected</th>",
                  "<th>Total</th>",
                  "<th>% OK</th></tr>"]
    html_block = "".join(html_comps)
    open(qc_main_file, 'w').write(html_block)
    # prepare primers and barcodes info
    primers = dataset['primers']
    samples = dataset['samples']
    tag_pairs = samples.values()
    assert len(primers) >= 2
    assert len(samples) >= 1
    assert len(tag_pairs) >= 1
    # prepare container and files for output batching and reporting
    hits_dict = {}
    for sample_id in samples:
        hits_dict[sample_id] = {'buffer': [], 'countY': 0, 'countN': 0}
    # add containers for rejected read pairs
    hits_dict['bad_tags'] = {'buffer': [], 'countY': 0, 'countN': 0}
    hits_dict['bad_qual'] = {'buffer': [], 'countY': 0, 'countN': 0}
    # initialize files
    for sample_id in samples:
        dmx_out = demux_root+sample_id+"_readpairs.txt"
        open(dmx_out, 'w').write('')
    open(demux_root+"bad_tags"+"_readpairs.txt", 'w').write('')
    open(demux_root+"bad_qual"+"_readpairs.txt", 'w').write('')
    # iterate through reads
    pair_count = 0
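    # FastqJointIterator (not defined here) is assumed to yield one nested
    # record per pair: titles[0] is (F_title, R_title); seqs and quals likewise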
    for titles, seqs, quals in FastqJointIterator(open(fwd_file),
                                                  open(rev_file)):
        F_title = titles[0][0]
        R_title = titles[0][1]
        F_seq = seqs[0][0].upper()
        R_seq = seqs[0][1].upper()
        F_qual = quals[0][0]
        R_qual = quals[0][1]
        flip = False
        sample_id = False
        # iterate through barcode tags
        # TODO: implement more robust solution to ambiguous base problem
        for tag_pair in tag_pairs:
            L_tag1 = (tag_pair[0]+primers['fwdRA']).upper()
            L_tag2 = (tag_pair[0]+primers['fwdRG']).upper()
            R_tag = (tag_pair[1]+primers['rev']).upper()
            tag_hit = False
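            # the while True blocks below each run at most once: the pattern
            # emulates an early-exit goto for the nested tag/primer checks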
            while True:
                # start by checking for R_tag since there's only one
                if R_seq.find(R_tag, 0, len(R_tag)) != 0:
                    if F_seq.find(R_tag, 0, len(R_tag)) != 0:
                        # no R_tag match -> reject
                        break
                    else: # reads look flipped; is there an L_tag in R_seq?
                        while True:
                            if R_seq.find(L_tag1, 0, len(L_tag1)) != 0:
                                if R_seq.find(L_tag2, 0, len(L_tag2)) != 0:
                                    # no L_tag match -> reject
                                    break
                                else:
                                    R_clip = len(L_tag2)
                            else:
                                R_clip = len(L_tag1)
                            tag_hit = True
                            flip = True
                            F_clip = len(R_tag)
                            break
                else: # is there an L_tag in F_seq?
                    while True:
                        if F_seq.find(L_tag1, 0, len(L_tag1)) != 0:
                            if F_seq.find(L_tag2, 0, len(L_tag2)) != 0:
                                # no L_tag match -> reject
                                break
                            else:
                                F_clip = len(L_tag2)
                        else:
                            F_clip = len(L_tag1)
                        tag_hit = True
                        R_clip = len(R_tag)
                        break
                break
            if not tag_hit:     # continue iterating
                sample_id = False
            else:               # got it, stop iterating
                sample_id = key_by_value(samples, tag_pair)[0]
                break
        # in case no matches were found with any of the tags
        if not sample_id:
            sample_id = 'bad_tags'
        # for matched read pairs, clip off tag+primer and strip low qual runs
        else:
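            # assumption: quality strings use Sanger/Phred+33 scaling, where
            # '#' is Q2, so a '##' run marks a low-quality tail to trim away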
            F_trim = F_qual[F_clip:].find('##')
            if F_trim > -1:
                F_seq = F_seq[F_clip:F_clip+F_trim]
                F_qual = F_qual[F_clip:F_clip+F_trim]
            else:
                F_seq = F_seq[F_clip:]
                F_qual = F_qual[F_clip:]
            R_trim = R_qual[R_clip:].find('##')
            if R_trim > -1:
                R_seq = R_seq[R_clip:R_clip+R_trim]
                R_qual = R_qual[R_clip:R_clip+R_trim]
            else:
                R_seq = R_seq[R_clip:]
                R_qual = R_qual[R_clip:]
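            # rp_min_len is assumed to be a module-level threshold for the
            # minimum combined length of a trimmed read pair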
            if len(F_seq)+len(R_seq) < rp_min_len:
                # increment sample hit 'No' counter
                hits_dict[sample_id]['countN'] +=1
                sample_id = 'bad_qual'
        # bundle read data in ordered string
        readF = "@%s\n%s\n+\n%s\n" % (F_title, F_seq, F_qual)
        readR = "@%s\n%s\n+\n%s\n" % (R_title, R_seq, R_qual)
        if flip:
            read_pair = readR+readF
        else:
            read_pair = readF+readR
        # output to the appropriate buffer
        hits_dict[sample_id]['buffer'].append(read_pair)
        # increment sample 'Yes' hit counter
        hits_dict[sample_id]['countY'] +=1
        # when buffer capacity is reached, output to file and reset buffer
        if hits_dict[sample_id]['countY'] % 100000 == 0:
            dmx_out = demux_root+sample_id+"_readpairs.txt"
            dump_buffer(dmx_out, hits_dict[sample_id]['buffer'])
            hits_dict[sample_id]['buffer'] = []
        # increment counter
        pair_count += 1
        # report on the progress
        if pair_count % 1000000 == 0:
            print "\t", pair_count, "reads processed", datetime.now()
        if pair_count == max_pairs: # for testing purposes
            break
    print "\t", "Total", pair_count, "read pairs processed"
    print "\t", "Counts per sample:"
    # prepare graphing data containers
    pcntY = []
    pcntN = []
    sample_ids = []
    # write out whatever remains in each of the samples buffers
    for sample_id in samples:
        dmx_out = demux_root+sample_id+"_readpairs.txt"
        dump_buffer(dmx_out, hits_dict[sample_id]['buffer'])
        hits_dict[sample_id]['buffer'] = []
        acc = hits_dict[sample_id]['countY']
        rej = hits_dict[sample_id]['countN']
        print "\t\t", sample_id, acc, "pairs", datetime.now()
        pcntY.append(acc)
        pcntN.append(rej)
        sample_ids.append(sample_id)
        # generate FastQC report (use --noextract to not open zipped reports)
        run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ')
        #print "see QC report"
        # add line in QC file
        link = qc_dir+sample_id+"_readpairs_fastqc/fastqc_report.html"
        html_comps = ["<tr>",
                      "<th><a href='"+link+"'>"+sample_id+"</a></th>",
                      "<td>", str(acc), "</td>",
                      "<td>", str(rej), "</td>",
                      "<td>", str(acc+rej), "</td>",
                      "<td>", str(int((float(acc)/(acc+rej))*100)),
                      "</td></tr>"]
        html_block = "".join(html_comps)
        open(qc_main_file, 'a').write(html_block)
    # write out whatever remains in the bad_qual buffer
    dmx_out = demux_root+"bad_qual_readpairs.txt"
    dump_buffer(dmx_out, hits_dict['bad_qual']['buffer'])
    hits_dict['bad_qual']['buffer'] = []
    print "\t\t", "rejected (low quality)", hits_dict['bad_qual']['countY'],\
    datetime.now()
    # generate FastQC report (use --noextract to not open zipped reports)
    run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ')
    #print "see QC report"
    # add line in QC file
    link = qc_dir+"bad_qual_readpairs_fastqc/fastqc_report.html"
    html_comps = ["<tr>",
                  "<th><a href='"+link+"'>"+"bad_qual"+"</a></th>",
                  "<td>", '0', "</td>",
                  "<td>", str(hits_dict['bad_qual']['countY']), "</td>",
                  "<td>", str(hits_dict['bad_qual']['countY']), "</td>",
                  "<td>", '0',"</td></tr>"]
    html_block = "".join(html_comps)
    open(qc_main_file, 'a').write(html_block)
    # write out whatever remains in the bad_tags buffer
    dmx_out = demux_root+"bad_tags_readpairs.txt"
    dump_buffer(dmx_out, hits_dict['bad_tags']['buffer'])
    hits_dict['bad_tags']['buffer'] = []
    print "\t\t", "rejected (bad tags)", hits_dict['bad_tags']['countY'],\
    datetime.now()
    # generate FastQC report (use --noextract to not open zipped reports)
    run_FastQC(dmx_out, report_root+qc_dir, '--quiet', ' ')
    #print "see QC report"
    # add line in QC file
    link = qc_dir+"bad_tags_readpairs_fastqc/fastqc_report.html"
    html_comps = ["<tr>",
                  "<th><a href='"+link+"'>"+"bad_tags"+"</a></th>",
                  "<td>", '0', "</td>",
                  "<td>", str(hits_dict['bad_tags']['countY']), "</td>",
                  "<td>", str(hits_dict['bad_tags']['countY']), "</td>",
                  "<td>", '0',"</td></tr>"]
    html_block = "".join(html_comps)
    open(qc_main_file, 'a').write(html_block)
    # close table and add notes
    line_bq = ("rejected after demultiplexing due to low sequence quality "
               "(top stacks in bar chart)")
    line_bt = ("could not be assigned to a sample due to mismatches in tag "
               "and/or primer")
    html_comps = ["</table></p>",
                  "<p><b>", "bad_qual", ": </b>", line_bq,
                  "<br><b>", "bad_tags", ": </b>", line_bt, "</p>",]
    html_block = "".join(html_comps)
    open(qc_main_file, 'a').write(html_block)
    # add the bad_tags category for the counts chart; the counters are
    # deliberately swapped: every bad_tags read pair is a reject, so countY
    # (pairs written) feeds the rejected series while countN (always 0 for
    # bad_tags) feeds the accepted series
    pcntY.append(hits_dict['bad_tags']['countN'])
    pcntN.append(hits_dict['bad_tags']['countY'])
    sample_ids.append('bad_tags')
    # check that the totals add up
    assert pair_count == sum(pcntY)+sum(pcntN)
    # plot the read counts per sample
    series = pcntY, pcntN
    legend = 'Accepted', 'Rejected'
    colors = 'g', 'r'
    titles = 'Number of read pairs', 'Read pairs per sample'
    two_storey_bar_chart(series, sample_ids, legend, colors, cntsplt, titles)
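
# For reference: key_by_value is not defined in this example. A minimal sketch
# of the assumed reverse dictionary lookup, inferred from its use above:
def key_by_value(dictionary, value):
    """Return the list of keys in dictionary that map to value (assumed)."""
    return [key for key, val in dictionary.items() if val == value]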