Example #1
def generate_report(proj_conf):
    d = { 
        'project_id' : proj_conf['id'],
        'infotable' : "",
        'lanetable' : "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        }

    ## General info table
    tab = Texttable()
    tab.add_row(["Project id", proj_conf['id']])
    tab.add_rows([["Run name:", proj_conf['flowcell']],
                  ["Uppnex project", ""]])
    d.update(infotable=tab.draw())
    
    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)", "Conc. (pM)"])
    for l in proj_conf['lanes']:
        samples = []
        for mp in l['multiplex']:
            samples.append(mp['name'])
        tab.add_row([l['lane'], ", ".join(samples), ""])
    d.update(lanetable=tab.draw())
                
    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots= "\n".join(res))

    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots= "\n".join(res))

    ## error rate plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate="\n".join(res))
                
    return d
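
A minimal invocation sketch for Example #1 (not part of the original source): the proj_conf layout and all values below are hypothetical, inferred only from the keys the function reads, and the function itself presupposes that os, Texttable (from the texttable package) and the m2r image helper are already importable.

# Hypothetical input, inferred from the keys Example #1 accesses.
proj_conf = {
    'id': 'J.Doe_11_01',                          # assumed project id format
    'flowcell': '110325_SN0000_0123_AB01CDEFXX',  # assumed run folder name
    'archive_dir': '/srv/archive',                # assumed archive root
    'lanes': [
        {'lane': '1', 'multiplex': [{'name': 'sample_A'}, {'name': 'sample_B'}]},
    ],
}
d = generate_report(proj_conf)
print d['lanetable']
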
Example #2
def generate_report(proj_conf):

    d = {
        'runname': proj_conf['run'],
        'project_id': proj_conf['id'],
        'samplenames': ' '.join(proj_conf['samples']),
        'latex_opt' : "",
        'uppnex': "",
        'mapping':"",
        'dup_rem':"",
        'read_count':"",
        'quantifyer':"",
        'gene_body_cov':"",
        'FPKM_heatmap':"",
        'FPKM_PCAplot':"",
        'Mapping_statistics': "",
        'Read_Distribution':"",
	'rRNA_table':""
        }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n   \setcounter{totalnumber}{8}'
    d['latex_opt'] = floats_per_page


    ## Metadata fetched from the 'Genomics project list' on Google Docs 
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
    except:
        uppnex_proj = "b201YYXX"
        print "No uppnex ID fetched"
    if not uppnex_proj:
        uppnex_proj = "b201YYXX"
        print "No uppnex ID fetched"
    d['uppnex'] = uppnex_proj


    ## RNA-seq tools fetched from config file post_process.yaml
    try:
        tools = proj_conf['config']['custom_algorithms']['RNA-seq analysis']
        d['mapping'] = os.path.join(tools['aligner'], tools['aligner_version'])
        d['dup_rem'] = os.path.join(tools['dup_remover'], tools['dup_remover_version'])
        d['read_count'] = os.path.join(tools['counts'], tools['counts_version'])
        d['quantifyer'] = os.path.join(tools['quantifyer'], tools['quantifyer_version'])
    except:
        print "Could not fetch RNA-seq tools from config file post_process.yaml"
        d['mapping'] = "X"
        d['dup_rem'] = "X"
        d['read_count'] = "X"
        d['quantifyer'] = "X"


    ## Mapping Statistics
    tab = Texttable()
    tab.set_cols_dtype(['t','t','t','t'])
    tab.add_row(['Sample','tot_#_read_pairs','%_uniquely_mapped_reads','%_uniquely_mapped_reads_left_after_dup_rem'])
    try:
        for sample_name in proj_conf['samples']:
            f = open('tophat_out_' + sample_name + '/stat_' + sample_name, 'r')
            data = f.readlines()
            tab.add_row([sample_name, data[1].split()[1], data[2].split()[1], data[3].split()[1]])
            f.close()
        d['Mapping_statistics'] = tab.draw()
    except:
        try:
            f = open('stat', 'r')
            data = f.readlines()
            D = dict(zip(data[0].split(), zip(data[1].split(), data[2].split(), data[3].split())))
            for sample_name in proj_conf['samples']:
                if sample_name in D:
                    tab.add_row([sample_name, D[sample_name][0], D[sample_name][1], D[sample_name][2]])
                else:
                    print 'could not find ' + sample_name + ' in stat'
            d['Mapping_statistics'] = tab.draw()
            f.close()
        except:
            print "Could not make Mapping Statistics table"


    ## Read Distribution 
    try:
        tab = Texttable()
        json = open('Ever_rd.json', 'a')
        print >> json, '{'
        Groups = ["Sample:", "CDS Exons:", "5'UTR Exons:", "3'UTR Exons:", "Intronic region:", "TSS up 1kb:", "TES down 1kb:"]

        tab.set_cols_dtype(['t', 't', 't', 't', 't', 't', 't', 't'])
        tab.add_row(["Sample", "CDS Exon", "5'UTR Exon", "3'UTR Exon", "Intron", "TSS up 1kb", "TES down 1kb", "mRNA frac"])

        for i in range(len(proj_conf['samples'])):
            sample_name = proj_conf['samples'][i]
            print >> json, sample_name + ': {'
            row = [sample_name]
            Reads_counts = []
            try:
                f = open('RSeQC_rd_' + sample_name + '.err', 'r')
            except:
                f = open('Ever_rd_' + sample_name + '.err', 'r')
            for line in f:
                Group = line.split('\t')[0]
                if Group in Groups:
                    if Group == "TES down 1kb:":
                        print >> json, '"' + Group + '"' + ':' + str(line.split('\t')[3].strip())
                    else:
                        print >> json, '"' + Group + '"' + ':' + str(line.split('\t')[3].strip()) + ','
                    row.append(str(line.split('\t')[3].strip()) + ' ')
                    Reads_counts.append(float(line.split('\t')[2].strip()))
            if os.path.exists('RSeQC_rd_' + sample_name + '.err'):
                t = os.popen("grep 'Total Fragments' 'RSeQC_rd_" + sample_name + ".err'|sed 's/Total Fragments               //g'")
            else:
                t = os.popen("grep 'Total Fragments' 'Ever_rd_" + sample_name + ".err'|sed 's/Total Fragments               //g'")
            tot = float(t.readline())
            frac = (Reads_counts[0] + Reads_counts[1] + Reads_counts[2]) / tot
            row.append(str(round(frac, 2)))
            tab.add_row(row)
            f.close()
            if i == (len(proj_conf['samples']) - 1):
                print >> json, '}'
            else:
                print >> json, '},'
        print >> json, '}'
        json.close()
        d['Read_Distribution'] = tab.draw()

    except:
        print "Could not make Read Distribution table"


    ## FPKM_PCAplot, FPKM_heatmap
    if os.path.exists("FPKM_PCAplot.pdf") and os.path.exists("FPKM_heatmap.pdf"):
        d['FPKM_PCAplot'] = m2r.image("FPKM_PCAplot.pdf", width="100%")
        d['FPKM_heatmap'] = m2r.image("FPKM_heatmap.pdf", width="100%")
    else:
	print "could not make FPKM PCAplot and FPKM heatmap"


    ## rRNA_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't'])
        tab.add_row(["Sample", "rRNA"])
        f = open('rRNA.quantification', 'r')
        D = {}
        for line in f:
            D[str(line.split('\t')[0].strip())] = str(line.split('\t')[1].strip())
        for sample_name in proj_conf['samples']:
            if sample_name in D:
                tab.add_row([sample_name, D[sample_name]])
        d['rRNA_table'] = tab.draw()
        f.close()
    except:
        print "Could not generate rRNA table"
 
    return d
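
The fallback branch in Example #2 reads a whitespace-delimited 'stat' file whose first line lists sample names and whose next three lines each hold one metric per sample; the zip-of-zips builds a per-sample column lookup. A self-contained sketch of that transposition, with hypothetical file contents:

# Hypothetical 'stat' layout: header line of sample names, then one metric
# row per line (read pairs, % uniquely mapped, % left after dup removal).
data = [
    "sampleA sampleB\n",
    "1000000 2000000\n",
    "85.2 83.9\n",
    "80.1 79.5\n",
]
D = dict(zip(data[0].split(), zip(data[1].split(), data[2].split(), data[3].split())))
print D['sampleA']  # -> ('1000000', '85.2', '80.1')
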
Example #3
def generate_report(proj_conf):
    
    #######
    ### Metadata fetched from the 'Genomics project list' on Google Docs
    ###
    uppnex_proj = ''
    min_reads_per_sample = ''
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
        project_id = proj_data.project_id
        queue_date = proj_data.queue_date
        no_samples = proj_data.no_samples
        lanes_plates = proj_data.lanes_plates
        min_reads_per_sample = proj_data.min_reads_per_sample
        customer_reference = proj_data.customer_reference
        application = proj_data.application
        no_finished_samples = proj_data.no_finished_samples
    except:
        print("WARNING: Could not fetch meta data from Google Docs")

    d = { 
        'project_id' : proj_conf['id'],
        'latex_opt' : "",
        'summary' : "",
        'infotable' : "",
        'lanetable' : "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        'yieldtable': "",
        }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n   \setcounter{totalnumber}{8}'
    d.update(latex_opt = floats_per_page)

    ## General info table
    tab = Texttable()
    if not uppnex_proj or len(uppnex_proj) < 4 or uppnex_proj[0:4] != 'b201':
        uppnex_proj = "b201YXXX"
        print "WARNING: Could not find UPPNEX project"

    run_name_comp = proj_conf['flowcell'].split('_')
    simple_run_name = run_name_comp[0] + run_name_comp[3][0]
    proj_level_dir = fixProjName(proj_conf['id'])
    instr_id = run_name_comp[1]
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    tab.add_row(["Run name:", proj_conf['flowcell']])
    del_base = "/proj/"
    proj_id = proj_conf['id']
    try: 
        if len(customer_reference) > 1:
            proj_id += ' (' + customer_reference + ')'
    except:
        pass

    tab.add_rows([["Project id:", proj_id], 
                  ["Date:", fc_date],
                  ["Instrument ID:", instr_id],
                  ["Flow cell ID:", fc_name],
                  ["Uppnex project:", uppnex_proj],
                  ["Delivery directory:", del_base + uppnex_proj + "/INBOX/" + proj_level_dir + "/" + proj_conf['flowcell']]])
    d.update(infotable=tab.draw())
    
    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)"])
    for l in proj_conf['lanes']:
        main_proj = l['description'].split(',')[1].strip()
        samples = []
        if l.has_key('multiplex'):
            for mp in l['multiplex']:
                if mp.has_key('sample_prj'):
                    if mp['sample_prj'] == proj_conf['id']:
                        samples.append(mp['name'])
            tab.add_row([l['lane'], ", ".join(samples)])
        else:
            tab.add_row([l['lane'], "Non-multiplexed lane"])
    d.update(lanetable=tab.draw())
    
    tab_r1 = Texttable()
    tab_r2 = Texttable()
    tab_r1.set_cols_width([2,12,12,12,12,12,12,30])
    tab_r2.set_cols_width([2,12,12,12,12,12,12,30])
    tab_r1.add_row(["Lane", "Clu. dens. #/mm2","% PF clusters","Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])
    tab_r2.add_row(["Lane", "Clu. dens. #/mm2","% PF clusters","Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])

    # These should be moved to a cfg file. ( + perhaps provide an alternative for v1.5 FC )
    if (options.v1_5_fc): min_clupf = 300 
    else: min_clupf = 475
    max_phas = 0.4
    max_prephas = 1.0 # 0.5
    max_mean_err = 2

    statspath = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "Summary")
    stats = summ.getQCstats(statspath)

    # Check quality criteria and add comments
    comm_r1 = ''
    comm_r2 = ''
    ok_r1 = True
    ok_r2 = True
    ok_cludens_r1 = True
    ok_cludens_r2 = True
    ok_err_rate = True 
    ok_err_r1 = True
    ok_err_r2 = True

    for l in proj_conf['lanes']:

        # Cluster densities
        clu_dens_r1 =  stats['raw_cluster_dens']['read1'][l['lane']]
        clu_dens_r2 =  stats['raw_cluster_dens']['read2'][l['lane']]
        clu_dens_sd_r1 =  stats['raw_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_sd_r2 =  stats['raw_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_string_r1 = str(clu_dens_r1) + '+/-' + str(clu_dens_sd_r1) 
        clu_dens_string_r2 = str(clu_dens_r2) + '+/-' + str(clu_dens_sd_r2) 

        # Cluster PF densities
        clu_dens_pf_r1 =  stats['pf_cluster_dens']['read1'][l['lane']]
        clu_dens_pf_r2 =  stats['pf_cluster_dens']['read2'][l['lane']]
        clu_dens_pf_sd_r1 =  stats['pf_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_pf_sd_r2 =  stats['pf_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_pf_string_r1 = str(clu_dens_pf_r1) + '+/-' + str(clu_dens_pf_sd_r1)
        clu_dens_pf_string_r2 = str(clu_dens_pf_r2) + '+/-' + str(clu_dens_pf_sd_r2)

        # % PF clusters
        prc_pf_r1 =  stats['prc_pf']['read1'][l['lane']]
        prc_pf_r2 =  stats['prc_pf']['read2'][l['lane']]
        prc_pf_sd_r1 =  stats['prc_pf_sd']['read1'][l['lane']]
        prc_pf_sd_r2 =  stats['prc_pf_sd']['read2'][l['lane']]
        prc_pf_string_r1 = str(prc_pf_r1) + '+/-' + str(prc_pf_sd_r1)
        prc_pf_string_r2 = str(prc_pf_r2) + '+/-' + str(prc_pf_sd_r2)

        # % phasing and prephasing
        phas_r1 = stats['phasing']['read1'][l['lane']]
        phas_r2 = stats['phasing']['read2'][l['lane']]
        prephas_r1 = stats['prephasing']['read1'][l['lane']]
        prephas_r2 = stats['prephasing']['read2'][l['lane']]
        phas_string_r1 = str(phas_r1) + '/' + str(prephas_r1)
        phas_string_r2 = str(phas_r2) + '/' + str(prephas_r2)

        # % aligned
        aln_r1 = stats['prc_aligned']['read1'][l['lane']]
        aln_r2 = stats['prc_aligned']['read2'][l['lane']]
        aln_sd_r1 = stats['prc_aligned_sd']['read1'][l['lane']]
        aln_sd_r2 = stats['prc_aligned_sd']['read2'][l['lane']]
        aln_string_r1 = str(aln_r1) + '+/-' + str(aln_sd_r1)
        aln_string_r2 = str(aln_r2) + '+/-' + str(aln_sd_r2)

        # error rate
        err_r1 = stats['error_rate']['read1'][l['lane']]
        err_r2 = stats['error_rate']['read2'][l['lane']]
        err_sd_r1 = stats['error_rate_sd']['read1'][l['lane']]
        err_sd_r2 = stats['error_rate_sd']['read2'][l['lane']]
        err_str_r1 = str(err_r1) + '+/-' + str(err_sd_r1)
        err_str_r2 = str(err_r2) + '+/-' + str(err_sd_r2)
        
        comm_r1 = ""
        comm_r2 = ""

        # check criteria
        if float(clu_dens_pf_r1[:-1]) < min_clupf: 
            ok_r1 = False
            ok_cludens_r1 = False
            comm_r1 += "Low cluster density. "
        if float(clu_dens_pf_r2[:-1]) < min_clupf: 
            ok_r2 = False
            ok_cludens_r2 = False
            comm_r2 += "Low cluster density. "
        avg_error_rate = (float(err_r1) + float(err_r2))/2
        if avg_error_rate > max_mean_err:
            ok_err_rate = False
        if float(err_r1) > max_mean_err:
            comm_r1 += "High error rate. "
            ok_err_r1 = False
        if float(err_r2) > max_mean_err:
            comm_r2 += "High error rate. "
            ok_err_r2 = False

        if comm_r1 == "": comm_r1 = "OK"        
        if comm_r2 == "": comm_r2 = "OK"

        tab_r1.add_row([l['lane'], clu_dens_string_r1, prc_pf_string_r1, clu_dens_pf_string_r1, phas_string_r1, aln_string_r1, err_str_r1, comm_r1])
        tab_r2.add_row([l['lane'], clu_dens_string_r2, prc_pf_string_r2, clu_dens_pf_string_r2, phas_string_r2, aln_string_r2, err_str_r2, comm_r2])

    # Reinitialize comments for the summary. (Which will be for several lanes, potentially)
    comm_r1 = ""
    comm_r2 = ""
 
    # if not ok_cludens_r1: comm_r1 += "Low cluster density. " 
    # if not ok_cludens_r2: comm_r2 += "Low cluster density. " 
    if not ok_err_rate:
        if not ok_err_r1: 
            ok_r1 = False
            comm_r1 += "High error rate. "
        if not ok_err_r2: 
            ok_r2 = False
            comm_r2 += "High error rate. "

    if (ok_r1 and ok_r2):
        comm_r1 = comm_r2 = "OK"
        d.update(summary="Successful run in terms of error rate. ")
    else:
        if (ok_r1):
            comm_r1 = "OK"
            d.update(summary="Read 2 did not pass quality criteria: " + comm_r2)
        elif (ok_r2):
            comm_r2 = "OK"
            d.update(summary="Read 1 did not pass quality criteria: " + comm_r1)
        else:
            d.update(summary="Did not pass quality criteria. Read 1: " + comm_r1 + " Read 2: " + comm_r2)


    d.update(read1table=tab_r1.draw())
    d.update(read2table=tab_r2.draw())
        
    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots= "\n".join(res))

    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots= "\n".join(res))

    ## error rate plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate="\n".join(res))

    ## Sequence yield table
    target_yield_per_lane = 143000000.0
    if (options.v1_5_fc):  target_yield_per_lane = 60000000.0
    tab = Texttable()
    tab.add_row(['Lane','Sample','Number of sequences','Million sequences ordered','Comment'])
    
    run_info_yaml = os.path.join(proj_conf['archive_dir'],proj_conf['flowcell'],"run_info.yaml")

    if not os.path.exists(run_info_yaml):
        print("WARNING: could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return

    with open(run_info_yaml) as in_handle:
        run_info = yaml.load(in_handle)

    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    low_yield = False
    
    bc_multiplier = 0.75 # Should move to cfg file

    ok_samples = []
    low_samples = []

    for l in proj_conf['lanes']:
        bc_file_name = os.path.join(proj_conf['analysis_dir'], proj_conf['flowcell'], '_'.join([l['lane'], fc_date, fc_name, "nophix_barcode"]), '_'.join([l['lane'], fc_date, fc_name, "nophix_bc.metrics"]))
        try:
            bc_file = open(bc_file_name)
        except:
            sys.exit("Could not find bc metrics file " + bc_file_name)
        bc_count = {}
        for line in bc_file:
            c = line.strip().split()
            bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1]) / 1000000))) + " million)"
        no_samples = len(bc_count)
        if no_samples == 0:
            print("WARNING: did not find a BC metrics file... Skipping lane %s for %s" %(l['lane'], proj_conf['id']))
            continue
        
        target_yield_per_sample = ''
        try:
            min_reads_per_sample = round(float(str(min_reads_per_sample)))
            target_yield_per_sample = min_reads_per_sample * 1000000
        except ValueError:
            min_reads_per_sample = ''
            target_yield_per_sample = bc_multiplier * target_yield_per_lane / no_samples
            
        sample_name = {}
        is_multiplexed = True
        is_rerun = False
        # Check here for each sample if it belongs to the project
        for entry in run_info:
            if entry['lane'] == l['lane']:
                projs = set()
                if entry.has_key('multiplex'):
                    for sample in entry['multiplex']:
                        if sample.has_key('sample_prj'):
                            projs.add(sample['sample_prj'])
                            if sample['sample_prj'].strip() == proj_conf['id']:
                                sample_name[sample['barcode_id']]=sample['name']
                else: is_multiplexed = False
                if len(projs) > 1: is_rerun = True
        samp_count = {}

        for k in bc_count.keys():
            if k.isdigit() and sample_name.has_key(int(k)):
                samp_count[sample_name[int(k)]] = bc_count[k]

        for k in sorted(samp_count.keys()):
            comment = ''
            if int(samp_count[k].split('(')[0]) < target_yield_per_sample: 
                comment = 'Low. '
                low_yield = True
                low_samples.append(k)
            else: ok_samples.append(k)
            if is_rerun: comment += '(rerun lane)'
            tab.add_row([l['lane'], k, samp_count[k], min_reads_per_sample, comment])
        
        if is_multiplexed:
            comment = ''
            try:
                if int(bc_count['unmatched'].split('(')[0]) > target_yield_per_sample: comment = 'High.'
                if is_rerun: comment += '(rerun lane)'
                tab.add_row([l['lane'], 'unmatched', bc_count['unmatched'], min_reads_per_sample, comment])
            except:
                print('WARNING: insufficient or no barcode metrics for lane')
        else:
            comment = ''
            for k in bc_count.keys():
                if int(bc_count[k].split('(')[0]) < bc_multiplier * target_yield_per_lane: comment = 'Low.'
                tab.add_row([l['lane'], "Non-multiplexed lane", bc_count[k], min_reads_per_sample, comment])

    delivery_type = "Final delivery. "
    if low_yield:
        delivery_type = "Partial delivery. "
        fail_comm = "Samples " + ", ".join(low_samples) + " yielded fewer sequences than expected. These will be re-run unless this was already a re-run and the total yield is now sufficient. "
    else: fail_comm = ""

    if low_yield: 
        if len(ok_samples)>0: ok_comm = "Samples " + ", ".join(ok_samples) + " yielded the expected number of sequences or more. "
        else: ok_comm = ""
    else: ok_comm = "All samples yielded the expected number of sequences or more. "

    comm = d['summary'] + fail_comm + ok_comm
    d.update(summary = comm)

    d.update(yieldtable=tab.draw())
    return d
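
In Example #3 the PF cluster densities returned by getQCstats are sliced with [:-1] before the float comparison, which implies the strings carry a one-character unit suffix. A sketch of the criterion under that assumption (the 'K' suffix and the value are hypothetical, not taken from the source):

min_clupf = 475          # threshold from Example #3 (non-v1.5 flow cell)
clu_dens_pf_r1 = '520K'  # hypothetical value; [:-1] drops the assumed unit suffix
if float(clu_dens_pf_r1[:-1]) < min_clupf:
    print "Low cluster density. "
else:
    print "OK"
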
Example #4
def generate_report(proj_conf):

    #######
    ### Metadata fetched from the 'Genomics project list' on Google Docs
    ###
    uppnex_proj = ''
    min_reads_per_sample = ''
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
        project_id = proj_data.project_id
        queue_date = proj_data.queue_date
        no_samples = proj_data.no_samples
        lanes_plates = proj_data.lanes_plates
        min_reads_per_sample = proj_data.min_reads_per_sample
        customer_reference = proj_data.customer_reference
        application = proj_data.application
        no_finished_samples = proj_data.no_finished_samples
    except:
        print("WARNING: Could not fetch meta data from Google Docs")

    d = {
        'project_id': proj_conf['id'],
        'latex_opt': "",
        'summary': "",
        'infotable': "",
        'lanetable': "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        'yieldtable': "",
        'qualscale': proj_conf['qual_scale'],
        }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n   \setcounter{totalnumber}{8}'
    d.update(latex_opt=floats_per_page)

    ## General info table
    tab = Texttable()
    if not uppnex_proj or len(uppnex_proj) < 4 or uppnex_proj[0:4] != 'b201':
        uppnex_proj = "b201YXXX"
        print "WARNING: Could not find UPPNEX project"

    run_name_comp = proj_conf['flowcell'].split('_')
    simple_run_name = run_name_comp[0] + "_" + run_name_comp[3]
    proj_level_dir = fixProjName(proj_conf['id'])
    instr_id = run_name_comp[1]
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    tab.add_row(["Run name:", proj_conf['flowcell']])
    del_base = "/proj/"
    proj_id = proj_conf['id']
    try:
        if len(customer_reference) > 1:
            proj_id += ' (' + customer_reference + ')'
    except:
        pass

    if len(proj_id) > 30: 
        print "Project ID + customer reference too long: ", proj_id
    tab.add_rows([["Project id:", proj_id], 
                  ["Date:", fc_date],
                  ["Instrument ID:", instr_id],
                  ["Flow cell ID:", fc_name],
                  ["Uppnex project:", uppnex_proj],
                  ["Delivery directory:", del_base + uppnex_proj + "/INBOX/" + proj_level_dir + "/" + simple_run_name]])
    d.update(infotable=tab.draw())

    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)"])
    for l in proj_conf['lanes']:
        main_proj = l['description'].split(',')[1].strip()
        samples = []
        if 'multiplex' in l:
            for mp in l['multiplex']:
                if 'sample_prj' in mp:
                    if mp['sample_prj'] == proj_conf['id']:
                        samples.append(mp['name'])
            tab.add_row([l['lane'], ", ".join(samples)])
        else:
            tab.add_row([l['lane'], "Non-multiplexed lane"])
    d.update(lanetable=tab.draw())

    tab_r1 = Texttable()
    tab_r2 = Texttable()
    tab_r1.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r2.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r1.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters", "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])
    tab_r2.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters", "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])

    # These should be moved to a cfg file. ( + perhaps provide an alternative for v1.5 FC )
    if (options.v1_5_fc):
        min_clupf = 300
    else:
        min_clupf = 475
    max_phas = 0.4
    max_prephas = 1.0  # 0.5
    max_mean_err = 2

    statspath = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "Summary")
    stats = summ.getQCstats(statspath)

    # Check quality criteria and add comments
    comm_r1 = ''
    comm_r2 = ''
    ok_r1 = True
    ok_r2 = True
    ok_cludens_r1 = True
    ok_cludens_r2 = True
    ok_err_rate = True
    ok_err_r1 = True
    ok_err_r2 = True

    for l in proj_conf['lanes']:

        # Cluster densities
        clu_dens_r1 = stats['raw_cluster_dens']['read1'][l['lane']]
        clu_dens_r2 = stats['raw_cluster_dens']['read2'][l['lane']]
        clu_dens_sd_r1 = stats['raw_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_sd_r2 = stats['raw_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_string_r1 = str(clu_dens_r1) + '+/-' + str(clu_dens_sd_r1)
        clu_dens_string_r2 = str(clu_dens_r2) + '+/-' + str(clu_dens_sd_r2)

        # Cluster PF densities
        clu_dens_pf_r1 = stats['pf_cluster_dens']['read1'][l['lane']]
        clu_dens_pf_r2 = stats['pf_cluster_dens']['read2'][l['lane']]
        clu_dens_pf_sd_r1 = stats['pf_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_pf_sd_r2 = stats['pf_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_pf_string_r1 = str(clu_dens_pf_r1) + '+/-' + str(clu_dens_pf_sd_r1)
        clu_dens_pf_string_r2 = str(clu_dens_pf_r2) + '+/-' + str(clu_dens_pf_sd_r2)

        # % PF clusters
        prc_pf_r1 = stats['prc_pf']['read1'][l['lane']]
        prc_pf_r2 = stats['prc_pf']['read2'][l['lane']]
        prc_pf_sd_r1 = stats['prc_pf_sd']['read1'][l['lane']]
        prc_pf_sd_r2 = stats['prc_pf_sd']['read2'][l['lane']]
        prc_pf_string_r1 = str(prc_pf_r1) + '+/-' + str(prc_pf_sd_r1)
        prc_pf_string_r2 = str(prc_pf_r2) + '+/-' + str(prc_pf_sd_r2)

        # % phasing and prephasing
        phas_r1 = stats['phasing']['read1'][l['lane']]
        phas_r2 = stats['phasing']['read2'][l['lane']]
        prephas_r1 = stats['prephasing']['read1'][l['lane']]
        prephas_r2 = stats['prephasing']['read2'][l['lane']]
        phas_string_r1 = str(phas_r1) + '/' + str(prephas_r1)
        phas_string_r2 = str(phas_r2) + '/' + str(prephas_r2)

        # % aligned
        aln_r1 = stats['prc_aligned']['read1'][l['lane']]
        aln_r2 = stats['prc_aligned']['read2'][l['lane']]
        aln_sd_r1 = stats['prc_aligned_sd']['read1'][l['lane']]
        aln_sd_r2 = stats['prc_aligned_sd']['read2'][l['lane']]
        aln_string_r1 = str(aln_r1) + '+/-' + str(aln_sd_r1)
        aln_string_r2 = str(aln_r2) + '+/-' + str(aln_sd_r2)

        # error rate
        err_r1 = stats['error_rate']['read1'][l['lane']]
        err_r2 = stats['error_rate']['read2'][l['lane']]
        err_sd_r1 = stats['error_rate_sd']['read1'][l['lane']]
        err_sd_r2 = stats['error_rate_sd']['read2'][l['lane']]
        err_str_r1 = str(err_r1) + '+/-' + str(err_sd_r1)
        err_str_r2 = str(err_r2) + '+/-' + str(err_sd_r2)

        comm_r1 = ""
        comm_r2 = ""

        # check criteria
        if float(clu_dens_pf_r1[:-1]) < min_clupf:
            ok_r1 = False
            ok_cludens_r1 = False
            comm_r1 += "Low cluster density. "
        if float(clu_dens_pf_r2[:-1]) < min_clupf:
            ok_r2 = False
            ok_cludens_r2 = False
            comm_r2 += "Low cluster density. "
        avg_error_rate = (float(err_r1) + float(err_r2)) / 2
        if avg_error_rate > max_mean_err:
            ok_err_rate = False
        if float(err_r1) > max_mean_err:
            comm_r1 += "High error rate. "
            ok_err_r1 = False
        if float(err_r2) > max_mean_err:
            comm_r2 += "High error rate. "
            ok_err_r2 = False

        if comm_r1 == "":
            comm_r1 = "OK"
        if comm_r2 == "":
            comm_r2 = "OK"

        tab_r1.add_row([l['lane'], clu_dens_string_r1, prc_pf_string_r1, clu_dens_pf_string_r1, phas_string_r1, aln_string_r1, err_str_r1, comm_r1])
        tab_r2.add_row([l['lane'], clu_dens_string_r2, prc_pf_string_r2, clu_dens_pf_string_r2, phas_string_r2, aln_string_r2, err_str_r2, comm_r2])

    # Reinitialize comments for the summary. (Which will be for several lanes, potentially)
    comm_r1 = ""
    comm_r2 = ""

    if not ok_cludens_r1:
        comm_r1 += "Low cluster density. "
    if not ok_cludens_r2:
        comm_r2 += "Low cluster density. "
    if not ok_err_rate:
        if not ok_err_r1:
            ok_r1 = False
            comm_r1 += "High error rate. "
        if not ok_err_r2:
            ok_r2 = False
            comm_r2 += "High error rate. "

    if (ok_r1 and ok_r2):
        comm_r1 = comm_r2 = "OK"
        d.update(summary = "Successful run in terms of error rate. ")
    else:  
        if (ok_r1): 
            comm_r1 = "OK"
            d.update(summary="Read 2 did not pass quality criteria: " + comm_r2)
        elif (ok_r2):
            comm_r2 = "OK"
            d.update(summary="Read 1 did not pass quality criteria: " + comm_r1)
        else:
            d.update(summary="Did not pass quality criteria. Read 1: " + comm_r1 + " Read 2: " + comm_r2)

    d.update(read1table=tab_r1.draw())
    d.update(read2table=tab_r2.draw())

    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots="\n".join(res))

    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots="\n".join(res))

    ## error rate plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate="\n".join(res))

    ## Sequence yield table
    target_yield_per_lane = 143000000.0
    if (options.v1_5_fc):
        target_yield_per_lane = 60000000.0
    tab = Texttable()
    tab.add_row(['Lane', 'Sample', 'Number of sequences', 'Million sequences ordered', 'Comment'])

    run_info_yaml = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "run_info.yaml")

    if not os.path.exists(run_info_yaml):
        print("WARNING: could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return

    with open(run_info_yaml) as in_handle:
        run_info = yaml.load(in_handle)

    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    low_yield = False

    bc_multiplier = 0.75  # Should move to cfg file

    ok_samples = []
    low_samples = []

    for l in proj_conf['lanes']:
        bc_file_name_prefix = os.path.join(proj_conf['analysis_dir'], proj_conf['flowcell'], '_'.join([l['lane'], fc_date, fc_name, "nophix_barcode"]), '_'.join([l['lane'], fc_date, fc_name, "nophix"]))
        bc_file_name = bc_file_name_prefix + ".bc_metrics"
        if not os.path.exists(bc_file_name):
            bc_file_name = bc_file_name_prefix + "_bc.metrics"
        try:
            bc_file = open(bc_file_name)
        except:
            sys.exit("Could not find bc metrics file " + bc_file_name)
        bc_count = {}
        for line in bc_file:
            c = line.strip().split()
            bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1]) / 1000000))) + " million)"
        no_samples = len(bc_count) - 1
        if no_samples == 0:
            print("WARNING: did not find a BC metrics file... Skipping lane %s for %s" % (l['lane'], proj_conf['id']))
            continue

        target_yield_per_sample = ''
        try:
            min_reads_per_sample = round(float(str(min_reads_per_sample)))
            target_yield_per_sample = min_reads_per_sample * 1000000
        except ValueError:
            min_reads_per_sample = ''
            target_yield_per_sample = bc_multiplier * target_yield_per_lane / no_samples

        sample_name = {}
        is_multiplexed = True
        is_rerun = False
        # Check here for each sample if it belongs to the project
        for entry in run_info:
            if entry['lane'] == l['lane']:
                projs = set()
                if 'multiplex' in entry:
                    for sample in entry['multiplex']:
                        if 'sample_prj' in sample:
                            projs.add(sample['sample_prj'])
                            if sample['sample_prj'].strip() == proj_conf['id']:
                                sample_name[sample['barcode_id']] = sample['name']
                else:
                    is_multiplexed = False
                if len(projs) > 1:
                    is_rerun = True
        samp_count = {}

        for k in bc_count.keys():
            if k.isdigit() and int(k) in sample_name:
                samp_count[sample_name[int(k)]] = bc_count[k]

        print "DEBUG: Target yield per sample = ", target_yield_per_sample
        print "DEBUG: Min reads per sample = ", min_reads_per_sample
        print "DEBUG: No samples: ", no_samples

        for k in sorted(samp_count.keys()):
            comment = ''
            if int(samp_count[k].split('(')[0]) < target_yield_per_sample:
                comment = 'Low. '
                low_yield = True
                low_samples.append(k)
            else:
                ok_samples.append(k)
            if is_rerun:
                comment += '(rerun lane)'
            tab.add_row([l['lane'], k, samp_count[k], min_reads_per_sample, comment])

        if is_multiplexed:
            comment = ''
            try:
                if int(bc_count['unmatched'].split('(')[0]) > target_yield_per_sample:
                    comment = 'High.'
                if is_rerun:
                    comment += '(rerun lane)'
                tab.add_row([l['lane'], 'unmatched', bc_count['unmatched'], min_reads_per_sample, comment])
            except:
                print('WARNING: insufficient or no barcode metrics for lane')
        else:
            comment = ''
            for k in bc_count.keys():
                if int(bc_count[k].split('(')[0]) < bc_multiplier * target_yield_per_lane:
                    comment = 'Low.'
                tab.add_row([l['lane'], "Non-multiplexed lane", bc_count[k], min_reads_per_sample, comment])

    delivery_type = "Final delivery. "
    if low_yield:
        delivery_type = "Partial delivery. "
        fail_comm = "Samples " + ", ".join(low_samples) + " yielded fewer sequences than expected. These will be re-run unless this was already a re-run and the total yield is now sufficient. "
    else:
        fail_comm = ""

    if low_yield:
        if len(ok_samples) > 0:
            ok_comm = "Samples " + ", ".join(ok_samples) + " yielded the expected number of sequences or more. "
        else:
            ok_comm = ""
    else:
        ok_comm = "All samples yielded the expected number of sequences or more. "

    comm = d['summary'] + fail_comm + ok_comm
    d.update(summary=comm)

    d.update(yieldtable=tab.draw())
    return d
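
Examples #3 and #4 both parse the barcode-metrics file as "<barcode_id> <count>" pairs, one per line, plus an 'unmatched' row; Example #4 additionally excludes that row when counting samples. A sketch of the parsing with hypothetical lines (the real files come from the pipeline):

# Hypothetical bc metrics content.
lines = ["1 14300000\n", "2 9800000\n", "unmatched 1200000\n"]
bc_count = {}
for line in lines:
    c = line.strip().split()
    bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1]) / 1000000))) + " million)"
print bc_count['1']      # -> '14300000 (~14 million)'
print len(bc_count) - 1  # Example #4's sample count, excluding 'unmatched'
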
Example #5
def generate_report(proj_conf):

    d = {
        'runname': proj_conf['run'],
        'project_id': proj_conf['id'],
        'samplenames': ' '.join(proj_conf['samples']),
        'latex_opt': "",
        'uppnex': "",
        'mapping': "",
        'dup_rem': "",
        'read_count': "",
        'quantifyer': "",
        'gene_body_cov': "",
        'FPKM_heatmap': "",
        'FPKM_PCAplot': "",
        'Mapping_statistics': "",
        'Read_Distribution': "",
        'rRNA_table': ""
    }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n   \setcounter{totalnumber}{8}'
    d['latex_opt'] = floats_per_page

    ## Metadata fetched from the 'Genomics project list' on Google Docs
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
    except:
        uppnex_proj = "b201YYXX"
        print "No uppnex ID fetched"
    if not uppnex_proj:
        uppnex_proj = "b201YYXX"
        print "No uppnex ID fetched"
    d['uppnex'] = uppnex_proj

    ## RNA-seq tools fetched from config file post_process.yaml
    try:
        tools = proj_conf['config']['custom_algorithms']['RNA-seq analysis']
        d['mapping'] = os.path.join(tools['aligner'], tools['aligner_version'])
        d['dup_rem'] = os.path.join(tools['dup_remover'],
                                    tools['dup_remover_version'])
        d['read_count'] = os.path.join(tools['counts'],
                                       tools['counts_version'])
        d['quantifyer'] = os.path.join(tools['quantifyer'],
                                       tools['quantifyer_version'])
    except:
        print "Could not fetched RNA-seq tools from config file post_process.yaml"
        d['mapping'] = "X"
        d['dup_rem'] = "X"
        d['read_count'] = "X"
        d['quantifyer'] = "X"
        pass

    ## Mapping Statistics
    tab = Texttable()
    tab.set_cols_dtype(['t', 't', 't', 't'])
    tab.add_row([
        'Sample', 'tot_#_read_pairs', '%_uniquely_mapped_reads',
        '%_uniquely_mapped_reads_left_after_dup_rem'
    ])
    try:
        for sample_name in proj_conf['samples']:
            f = open('tophat_out_' + sample_name + '/stat_' + sample_name, 'r')
            data = f.readlines()
            tab.add_row([
                sample_name, data[1].split()[1], data[2].split()[1],
                data[3].split()[1]
            ])
            f.close()
        d['Mapping_statistics'] = tab.draw()
    except:
        try:
            f = open('stat', 'r')
            data = f.readlines()
            D = dict(
                zip(data[0].split(),
                    zip(data[1].split(), data[2].split(), data[3].split())))
            for sample_name in proj_conf['samples']:
                if D.has_key(sample_name):
                    tab.add_row([
                        sample_name, D[sample_name][0], D[sample_name][1],
                        D[sample_name][2]
                    ])
                else:
                    print 'could not find ' + sample_name + ' in stat'
            d['Mapping_statistics'] = tab.draw()
            f.close()
        except:
            print "Could not make Mapping Statistics table"

    ## Read Distribution
    try:
        tab = Texttable()
        json = open('Ever_rd.json', 'a')
        print >> json, '{'
        Groups = [
            "Sample:", "CDS Exons:", "5'UTR Exons:", "3'UTR Exons:",
            "Intronic region:", "TSS up 1kb:", "TES down 1kb:"
        ]

        tab.set_cols_dtype(['t', 't', 't', 't', 't', 't', 't', 't'])
        tab.add_row([
            "Sample", "CDS Exon", "5'UTR Exon", "3'UTR Exon", "Intron",
            "TSS up 1kb", "TES down 1kb", "mRNA frac"
        ])

        for i in range(len(proj_conf['samples'])):
            sample_name = proj_conf['samples'][i]
            print >> json, sample_name + ': {'
            row = [sample_name]
            Reads_counts = []
            try:
                f = open('RSeQC_rd_' + sample_name + '.err', 'r')
            except:
                f = open('Ever_rd_' + sample_name + '.err', 'r')
            for line in f:
                Group = line.split('\t')[0]
                if Group in Groups:
                    if Group == "TES down 1kb:":
                        print >> json, '"' + Group + '"' + ':' + str(
                            line.split('\t')[3].strip())
                    else:
                        print >> json, '"' + Group + '"' + ':' + str(
                            line.split('\t')[3].strip()) + ','
                    row.append(str(line.split('\t')[3].strip()) + ' ')
                    Reads_counts.append(float(line.split('\t')[2].strip()))
            if os.path.exists('RSeQC_rd_' + sample_name + '.err'):
                t = os.popen("grep 'Total Fragments' 'RSeQC_rd_" +
                             sample_name +
                             ".err'|sed 's/Total Fragments               //g'")
            else:
                try:
                    t = os.popen(
                        "grep 'Total Fragments' 'Ever_rd_" + sample_name +
                        ".err'|sed 's/Total Fragments               //g'")
                except:
                    pass
            tot = float(t.readline())
            frac = (Reads_counts[0] + Reads_counts[1] + Reads_counts[2]) / tot
            row.append(str(round(frac, 2)))
            tab.add_row(row)
            f.close()
            if i == (len(proj_conf['samples']) - 1):
                print >> json, '}'
            else:
                print >> json, '},'
        print >> json, '}'
        json.close()
        d['Read_Distribution'] = tab.draw()

    except:
        print "Could not make Read Distribution table"

    ## FPKM_PCAplot, FPKM_heatmap
    if os.path.exists("FPKM_PCAplot.pdf") and os.path.exists(
            "FPKM_heatmap.pdf"):
        d['FPKM_PCAplot'] = m2r.image("FPKM_PCAplot.pdf", width="100%")
        d['FPKM_heatmap'] = m2r.image("FPKM_heatmap.pdf", width="100%")
    else:
        print "could not make FPKM PCAplot and FPKM heatmap"

    ## rRNA_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't'])
        tab.add_row(["Sample", "rRNA"])
        f = open('rRNA.quantification', 'r')
        D = {}
        for line in f:
            D[str(line.split('\t')[0].strip())] = str(
                line.split('\t')[1].strip())
        for sample_name in proj_conf['samples']:
            if D.has_key(sample_name):
                tab.add_row([sample_name, D[sample_name]])
        d['rRNA_table'] = tab.draw()
        f.close()
    except:
        print "Could not generate rRNA table"

    return d
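
A side note on Examples #2 and #5: the 'Ever_rd.json' file is assembled with raw print statements, so the sample-name keys are unquoted and opening in append mode can stack multiple documents in one file, neither of which yields valid JSON. A sketch of writing the same structure with the standard json module instead (the nested values are hypothetical, and this swaps in a different technique rather than reproducing the original output byte for byte):

import json as jsonlib  # aliased because the examples bind the name 'json' to a file handle

read_dist = {
    'sampleA': {"CDS Exons:": "55.3", "TES down 1kb:": "1.2"},  # hypothetical values
}
with open('Ever_rd.json', 'w') as fh:
    jsonlib.dump(read_dist, fh, indent=2)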