import os
import sys
import yaml
from texttable import Texttable

# NOTE: m2r (reST image helper), summ (QC summary parser), ProjectMetaData,
# get_flowcell_info, fixProjName and the `options` object are used below but
# are assumed to be provided by the surrounding package.


def generate_report(proj_conf):
    d = {
        'project_id': proj_conf['id'],
        'infotable': "",
        'lanetable': "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        }

    ## General info table
    tab = Texttable()
    tab.add_row(["Project id", proj_conf['id']])
    tab.add_rows([["Run name:", proj_conf['flowcell']],
                  ["Uppnex project", ""]])
    d.update(infotable=tab.draw())

    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)", "Conc. (pM)"])
    for l in proj_conf['lanes']:
        samples = []
        for mp in l['multiplex']:
            samples.append(mp['name'])
        tab.add_row([l['lane'], ", ".join(samples), ""])
    d.update(lanetable=tab.draw())

    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'],
                              "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots="\n".join(res))

    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots="\n".join(res))

    ## error rate plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate="\n".join(res))

    return d
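# For reference, a minimal sketch of the `proj_conf` dictionary the basic
# report generator above reads. The key names mirror the accesses in the
# function; the values here are hypothetical examples, not real project data.
example_proj_conf = {
    'id': 'J.Doe_11_01',                        # project identifier
    'flowcell': '110627_SN0123_0123_AB0ACCXX',  # run/flowcell directory name
    'archive_dir': '/srv/archive',              # root of the run archive
    'lanes': [
        {'lane': '1',
         'multiplex': [{'name': 'sample_1'}, {'name': 'sample_2'}]},
        ],
    }
# d = generate_report(example_proj_conf) would then return a dict of
# reStructuredText fragments (tables and image directives) keyed by placeholder.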
def generate_report(proj_conf):
    d = {
        'runname': proj_conf['run'],
        'project_id': proj_conf['id'],
        'samplenames': ' '.join(proj_conf['samples']),
        'latex_opt': "",
        'uppnex': "",
        'mapping': "",
        'dup_rem': "",
        'read_count': "",
        'quantifyer': "",
        'gene_body_cov': "",
        'FPKM_heatmap': "",
        'FPKM_PCAplot': "",
        'Mapping_statistics': "",
        'Read_Distribution': "",
        'rRNA_table': "",
        }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n \\setcounter{totalnumber}{8}'
    d['latex_opt'] = floats_per_page

    ## Metadata fetched from the 'Genomics project list' on Google Docs
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
    except:
        uppnex_proj = "b201YYXX"
        print "No uppnex ID fetched"
    if not uppnex_proj:
        uppnex_proj = "b201YYXX"
        print "No uppnex ID fetched"
    d['uppnex'] = uppnex_proj

    ## RNA-seq tools fetched from config file post_process.yaml
    try:
        tools = proj_conf['config']['custom_algorithms']['RNA-seq analysis']
        d['mapping'] = os.path.join(tools['aligner'], tools['aligner_version'])
        d['dup_rem'] = os.path.join(tools['dup_remover'], tools['dup_remover_version'])
        d['read_count'] = os.path.join(tools['counts'], tools['counts_version'])
        d['quantifyer'] = os.path.join(tools['quantifyer'], tools['quantifyer_version'])
    except:
        print "Could not fetch RNA-seq tools from config file post_process.yaml"
        d['mapping'] = "X"
        d['dup_rem'] = "X"
        d['read_count'] = "X"
        d['quantifyer'] = "X"

    ## Mapping Statistics
    tab = Texttable()
    tab.set_cols_dtype(['t', 't', 't', 't'])
    tab.add_row(['Sample', 'tot_#_read_pairs', '%_uniquely_mapped_reads',
                 '%_uniquely_mapped_reads_left_after_dup_rem'])
    try:
        # Preferred source: one stat file per sample from the tophat output.
        for sample_name in proj_conf['samples']:
            f = open('tophat_out_' + sample_name + '/stat_' + sample_name, 'r')
            data = f.readlines()
            tab.add_row([sample_name, data[1].split()[1],
                         data[2].split()[1], data[3].split()[1]])
            f.close()
        d['Mapping_statistics'] = tab.draw()
    except:
        # Fall back to a single combined 'stat' file with one column per sample.
        try:
            f = open('stat', 'r')
            data = f.readlines()
            D = dict(zip(data[0].split(),
                         zip(data[1].split(), data[2].split(), data[3].split())))
            for sample_name in proj_conf['samples']:
                if D.has_key(sample_name):
                    tab.add_row([sample_name, D[sample_name][0],
                                 D[sample_name][1], D[sample_name][2]])
                else:
                    print 'Could not find ' + sample_name + ' in stat'
            d['Mapping_statistics'] = tab.draw()
            f.close()
        except:
            print "Could not make Mapping Statistics table"

    ## Read Distribution
    try:
        tab = Texttable()
        # Hand-rolled JSON mirroring the table rows, appended to Ever_rd.json.
        json = open('Ever_rd.json', 'a')
        print >> json, '{'
        Groups = ["Sample:", "CDS Exons:", "5'UTR Exons:", "3'UTR Exons:",
                  "Intronic region:", "TSS up 1kb:", "TES down 1kb:"]
        tab.set_cols_dtype(['t', 't', 't', 't', 't', 't', 't', 't'])
        tab.add_row(["Sample", "CDS Exon", "5'UTR Exon", "3'UTR Exon", "Intron",
                     "TSS up 1kb", "TES down 1kb", "mRNA frac"])
        for i in range(len(proj_conf['samples'])):
            sample_name = proj_conf['samples'][i]
            print >> json, sample_name + ': {'
            row = [sample_name]
            Reads_counts = []
            try:
                f = open('RSeQC_rd_' + sample_name + '.err', 'r')
            except:
                f = open('Ever_rd_' + sample_name + '.err', 'r')
            for line in f:
                Group = line.split('\t')[0]
                if Group in Groups:
                    if Group == "TES down 1kb:":
                        print >> json, '"' + Group + '"' + ':' + str(line.split('\t')[3].strip())
                    else:
                        print >> json, '"' + Group + '"' + ':' + str(line.split('\t')[3].strip()) + ','
                    row.append(str(line.split('\t')[3].strip()) + ' ')
                    Reads_counts.append(float(line.split('\t')[2].strip()))
            # Total fragment count, taken from whichever .err file exists.
            if os.path.exists('RSeQC_rd_' + sample_name + '.err'):
                t = os.popen("grep 'Total Fragments' 'RSeQC_rd_" + sample_name
                             + ".err'|sed 's/Total Fragments //g'")
            else:
                t = os.popen("grep 'Total Fragments' 'Ever_rd_" + sample_name
                             + ".err'|sed 's/Total Fragments //g'")
            tot = float(t.readline())
            # mRNA fraction = (CDS + 5'UTR + 3'UTR exon reads) / total
            frac = (Reads_counts[0] + Reads_counts[1] + Reads_counts[2]) / tot
            row.append(str(round(frac, 2)))
            tab.add_row(row)
            f.close()
            if i == (len(proj_conf['samples']) - 1):
                print >> json, '}'
            else:
                print >> json, '},'
        print >> json, '}'
        json.close()
        d['Read_Distribution'] = tab.draw()
    except:
        print "Could not make Read Distribution table"

    ## FPKM_PCAplot, FPKM_heatmap
    if os.path.exists("FPKM_PCAplot.pdf") and os.path.exists("FPKM_heatmap.pdf"):
        d['FPKM_PCAplot'] = m2r.image("FPKM_PCAplot.pdf", width="100%")
        d['FPKM_heatmap'] = m2r.image("FPKM_heatmap.pdf", width="100%")
    else:
        print "Could not make FPKM PCAplot and FPKM heatmap"

    ## rRNA_table
    try:
        tab = Texttable()
        tab.set_cols_dtype(['t', 't'])
        tab.add_row(["Sample", "rRNA"])
        f = open('rRNA.quantification', 'r')
        D = {}
        for line in f:
            D[str(line.split('\t')[0].strip())] = str(line.split('\t')[1].strip())
        for sample_name in proj_conf['samples']:
            if D.has_key(sample_name):
                tab.add_row([sample_name, D[sample_name]])
        d['rRNA_table'] = tab.draw()
        f.close()
    except:
        print "Could not generate rRNA table"

    return d
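# A sketch of the inputs the RNA-seq report variant above expects; the key
# names mirror the lookups in the function, while the tool names and values
# here are hypothetical examples (post_process.yaml supplies the real ones).
# The function also reads per-sample files from the current working directory:
# tophat_out_<sample>/stat_<sample>, RSeQC_rd_<sample>.err (or
# Ever_rd_<sample>.err) and rRNA.quantification.
example_rnaseq_conf = {
    'run': '110627_SN0123_0123_AB0ACCXX',
    'id': 'J.Doe_11_01',
    'samples': ['sample_1', 'sample_2'],
    'config': {
        'custom_algorithms': {
            'RNA-seq analysis': {
                'aligner': 'tophat', 'aligner_version': '1.4.1',
                'dup_remover': 'picard', 'dup_remover_version': '1.29',
                'counts': 'htseq', 'counts_version': '0.5.1',
                'quantifyer': 'cufflinks', 'quantifyer_version': '1.2.1',
                },
            },
        },
    }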
def generate_report(proj_conf):

    #######
    ### Metadata fetched from the 'Genomics project list' on Google Docs ###
    #######
    uppnex_proj = ''
    min_reads_per_sample = ''
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
        project_id = proj_data.project_id
        queue_date = proj_data.queue_date
        no_samples = proj_data.no_samples
        lanes_plates = proj_data.lanes_plates
        min_reads_per_sample = proj_data.min_reads_per_sample
        customer_reference = proj_data.customer_reference
        application = proj_data.application
        no_finished_samples = proj_data.no_finished_samples
    except:
        print("WARNING: Could not fetch meta data from Google Docs")

    d = {
        'project_id': proj_conf['id'],
        'latex_opt': "",
        'summary': "",
        'infotable': "",
        'lanetable': "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        'yieldtable': "",
        }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n \\setcounter{totalnumber}{8}'
    d.update(latex_opt=floats_per_page)

    ## General info table
    tab = Texttable()
    if not uppnex_proj or len(uppnex_proj) < 4 or uppnex_proj[0:4] != 'b201':
        uppnex_proj = "b201YXXX"
        print "WARNING: Could not find UPPNEX project"

    run_name_comp = proj_conf['flowcell'].split('_')
    simple_run_name = run_name_comp[0] + run_name_comp[3][0]
    proj_level_dir = fixProjName(proj_conf['id'])
    instr_id = run_name_comp[1]
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    tab.add_row(["Run name:", proj_conf['flowcell']])
    del_base = "/proj/"
    proj_id = proj_conf['id']
    try:
        if len(customer_reference) > 1:
            proj_id += ' (' + customer_reference + ')'
    except:
        pass
    tab.add_rows([["Project id:", proj_id],
                  ["Date:", fc_date],
                  ["Instrument ID:", instr_id],
                  ["Flow cell ID:", fc_name],
                  ["Uppnex project:", uppnex_proj],
                  ["Delivery directory:", del_base + uppnex_proj + "/INBOX/"
                   + proj_level_dir + "/" + proj_conf['flowcell']]])
    d.update(infotable=tab.draw())

    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)"])
    for l in proj_conf['lanes']:
        main_proj = l['description'].split(',')[1].strip()
        samples = []
        if l.has_key('multiplex'):
            for mp in l['multiplex']:
                if mp.has_key('sample_prj'):
                    if mp['sample_prj'] == proj_conf['id']:
                        samples.append(mp['name'])
            tab.add_row([l['lane'], ", ".join(samples)])
        else:
            tab.add_row([l['lane'], "Non-multiplexed lane"])
    d.update(lanetable=tab.draw())

    tab_r1 = Texttable()
    tab_r2 = Texttable()
    tab_r1.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r2.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r1.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters",
                    "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX",
                    "% error rate", "Comment"])
    tab_r2.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters",
                    "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX",
                    "% error rate", "Comment"])

    # These should be moved to a cfg file (+ perhaps provide an
    # alternative for v1.5 FC).
    if (options.v1_5_fc):
        min_clupf = 300
    else:
        min_clupf = 475
    max_phas = 0.4
    max_prephas = 1.0  # 0.5
    max_mean_err = 2

    statspath = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'],
                             "Data", "reports", "Summary")
    stats = summ.getQCstats(statspath)

    # Check quality criteria and add comments
    comm_r1 = ''
    comm_r2 = ''
    ok_r1 = True
    ok_r2 = True
    ok_cludens_r1 = True
    ok_cludens_r2 = True
    ok_err_rate = True
    ok_err_r1 = True
    ok_err_r2 = True

    for l in proj_conf['lanes']:
        # Cluster densities
        clu_dens_r1 = stats['raw_cluster_dens']['read1'][l['lane']]
        clu_dens_r2 = stats['raw_cluster_dens']['read2'][l['lane']]
        clu_dens_sd_r1 = stats['raw_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_sd_r2 = stats['raw_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_string_r1 = str(clu_dens_r1) + '+/-' + str(clu_dens_sd_r1)
        clu_dens_string_r2 = str(clu_dens_r2) + '+/-' + str(clu_dens_sd_r2)

        # Cluster PF densities
        clu_dens_pf_r1 = stats['pf_cluster_dens']['read1'][l['lane']]
        clu_dens_pf_r2 = stats['pf_cluster_dens']['read2'][l['lane']]
        clu_dens_pf_sd_r1 = stats['pf_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_pf_sd_r2 = stats['pf_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_pf_string_r1 = str(clu_dens_pf_r1) + '+/-' + str(clu_dens_pf_sd_r1)
        clu_dens_pf_string_r2 = str(clu_dens_pf_r2) + '+/-' + str(clu_dens_pf_sd_r2)

        # % PF clusters
        prc_pf_r1 = stats['prc_pf']['read1'][l['lane']]
        prc_pf_r2 = stats['prc_pf']['read2'][l['lane']]
        prc_pf_sd_r1 = stats['prc_pf_sd']['read1'][l['lane']]
        prc_pf_sd_r2 = stats['prc_pf_sd']['read2'][l['lane']]
        prc_pf_string_r1 = str(prc_pf_r1) + '+/-' + str(prc_pf_sd_r1)
        prc_pf_string_r2 = str(prc_pf_r2) + '+/-' + str(prc_pf_sd_r2)

        # % phasing and prephasing
        phas_r1 = stats['phasing']['read1'][l['lane']]
        phas_r2 = stats['phasing']['read2'][l['lane']]
        prephas_r1 = stats['prephasing']['read1'][l['lane']]
        prephas_r2 = stats['prephasing']['read2'][l['lane']]
        phas_string_r1 = str(phas_r1) + '/' + str(prephas_r1)
        phas_string_r2 = str(phas_r2) + '/' + str(prephas_r2)

        # % aligned
        aln_r1 = stats['prc_aligned']['read1'][l['lane']]
        aln_r2 = stats['prc_aligned']['read2'][l['lane']]
        aln_sd_r1 = stats['prc_aligned_sd']['read1'][l['lane']]
        aln_sd_r2 = stats['prc_aligned_sd']['read2'][l['lane']]
        aln_string_r1 = str(aln_r1) + '+/-' + str(aln_sd_r1)
        aln_string_r2 = str(aln_r2) + '+/-' + str(aln_sd_r2)

        # error rate
        err_r1 = stats['error_rate']['read1'][l['lane']]
        err_r2 = stats['error_rate']['read2'][l['lane']]
        err_sd_r1 = stats['error_rate_sd']['read1'][l['lane']]
        err_sd_r2 = stats['error_rate_sd']['read2'][l['lane']]
        err_str_r1 = str(err_r1) + '+/-' + str(err_sd_r1)
        err_str_r2 = str(err_r2) + '+/-' + str(err_sd_r2)

        comm_r1 = ""
        comm_r2 = ""

        # check criteria
        if float(clu_dens_pf_r1[:-1]) < min_clupf:
            ok_r1 = False
            ok_cludens_r1 = False
            comm_r1 += "Low cluster density. "
        if float(clu_dens_pf_r2[:-1]) < min_clupf:
            ok_r2 = False
            ok_cludens_r2 = False
            comm_r2 += "Low cluster density. "
        avg_error_rate = (float(err_r1) + float(err_r2)) / 2
        if avg_error_rate > max_mean_err:
            ok_err_rate = False
        if float(err_r1) > max_mean_err:
            comm_r1 += "High error rate. "
            ok_err_r1 = False
        if float(err_r2) > max_mean_err:
            comm_r2 += "High error rate. "
            ok_err_r2 = False
        if comm_r1 == "":
            comm_r1 = "OK"
        if comm_r2 == "":
            comm_r2 = "OK"

        tab_r1.add_row([l['lane'], clu_dens_string_r1, prc_pf_string_r1,
                        clu_dens_pf_string_r1, phas_string_r1, aln_string_r1,
                        err_str_r1, comm_r1])
        tab_r2.add_row([l['lane'], clu_dens_string_r2, prc_pf_string_r2,
                        clu_dens_pf_string_r2, phas_string_r2, aln_string_r2,
                        err_str_r2, comm_r2])

    # Reinitialize comments for the summary (which will potentially cover
    # several lanes).
    comm_r1 = ""
    comm_r2 = ""

    # if not ok_cludens_r1: comm_r1 += "Low cluster density. "
    # if not ok_cludens_r2: comm_r2 += "Low cluster density. "

    if not ok_err_rate:
        if not ok_err_r1:
            ok_r1 = False
            comm_r1 += "High error rate. "
        if not ok_err_r2:
            ok_r2 = False
            comm_r2 += "High error rate. "

    if (ok_r1 and ok_r2):
        comm_r1 = comm_r2 = "OK"
        d.update(summary="Successful run in terms of error rate. ")
    else:
        if (ok_r1):
            comm_r1 = "OK"
            d.update(summary="Read 2 did not pass quality criteria: " + comm_r2)
        elif (ok_r2):
            comm_r2 = "OK"
            d.update(summary="Read 1 did not pass quality criteria: " + comm_r1)
        else:
            d.update(summary="Did not pass quality criteria. Read 1: "
                     + comm_r1 + " Read 2: " + comm_r2)

    d.update(read1table=tab_r1.draw())
    d.update(read2table=tab_r2.draw())

    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'],
                              "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots="\n".join(res))

    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots="\n".join(res))

    ## error rate plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate="\n".join(res))

    ## Sequence yield table
    target_yield_per_lane = 143000000.0
    if (options.v1_5_fc):
        target_yield_per_lane = 60000000.0
    tab = Texttable()
    tab.add_row(['Lane', 'Sample', 'Number of sequences',
                 'Million sequences ordered', 'Comment'])

    run_info_yaml = os.path.join(proj_conf['archive_dir'],
                                 proj_conf['flowcell'], "run_info.yaml")
    if not os.path.exists(run_info_yaml):
        print("WARNING: could not find required run_info.yaml configuration "
              "file at '%s'" % run_info_yaml)
        return
    with open(run_info_yaml) as in_handle:
        run_info = yaml.load(in_handle)

    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    low_yield = False
    bc_multiplier = 0.75  # Should move to cfg file

    ok_samples = []
    low_samples = []

    for l in proj_conf['lanes']:
        bc_file_name = os.path.join(
            proj_conf['analysis_dir'], proj_conf['flowcell'],
            '_'.join([l['lane'], fc_date, fc_name, "nophix_barcode"]),
            '_'.join([l['lane'], fc_date, fc_name, "nophix_bc.metrics"]))
        try:
            bc_file = open(bc_file_name)
        except:
            sys.exit("Could not find bc metrics file " + bc_file_name)
        bc_count = {}
        for line in bc_file:
            c = line.strip().split()
            bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1]) / 1000000))) + " million)"
        no_samples = len(bc_count)
        if no_samples == 0:
            print("WARNING: did not find a BC metrics file... "
                  "Skipping lane %s for %s" % (l['lane'], proj_conf['id']))
            continue

        target_yield_per_sample = ''
        try:
            min_reads_per_sample = round(float(str(min_reads_per_sample)))
            target_yield_per_sample = min_reads_per_sample * 1000000
        except ValueError:
            min_reads_per_sample = ''
            target_yield_per_sample = bc_multiplier * target_yield_per_lane / no_samples

        sample_name = {}
        is_multiplexed = True
        is_rerun = False
        # Check here for each sample if it belongs to the project
        for entry in run_info:
            if entry['lane'] == l['lane']:
                projs = set()
                if entry.has_key('multiplex'):
                    for sample in entry['multiplex']:
                        if sample.has_key('sample_prj'):
                            projs.add(sample['sample_prj'])
                            if sample['sample_prj'].strip() == proj_conf['id']:
                                sample_name[sample['barcode_id']] = sample['name']
                else:
                    is_multiplexed = False
                if len(projs) > 1:
                    is_rerun = True

        samp_count = {}
        for k in bc_count.keys():
            if not k.isdigit():
                pass
            else:
                if sample_name.has_key(int(k)):
                    samp_count[sample_name[int(k)]] = bc_count[k]

        for k in sorted(samp_count.keys()):
            comment = ''
            if int(samp_count[k].split('(')[0]) < target_yield_per_sample:
                comment = 'Low. '
                low_yield = True
                low_samples.append(k)
            else:
                ok_samples.append(k)
            if is_rerun:
                comment += '(rerun lane)'
            tab.add_row([l['lane'], k, samp_count[k], min_reads_per_sample, comment])

        if is_multiplexed:
            comment = ''
            try:
                if int(bc_count['unmatched'].split('(')[0]) > target_yield_per_sample:
                    comment = 'High.'
                if is_rerun:
                    comment += '(rerun lane)'
                tab.add_row([l['lane'], 'unmatched', bc_count['unmatched'],
                             min_reads_per_sample, comment])
            except:
                print('WARNING: insufficient or no barcode metrics for lane')
        else:
            comment = ''
            for k in bc_count.keys():
                if int(bc_count[k].split('(')[0]) < bc_multiplier * target_yield_per_lane:
                    comment = 'Low.'
                tab.add_row([l['lane'], "Non-multiplexed lane", bc_count[k],
                             min_reads_per_sample, comment])

    delivery_type = "Final delivery. "
    if low_yield:
        delivery_type = "Partial delivery. "
        fail_comm = ("Samples " + ", ".join(low_samples)
                     + " yielded fewer sequences than expected. These will be "
                     "re-run unless this was already a re-run and the total "
                     "yield is now sufficient. ")
    else:
        fail_comm = ""

    if low_yield:
        if len(ok_samples) > 0:
            ok_comm = ("Samples " + ", ".join(ok_samples)
                       + " yielded the expected number of sequences or more. ")
        else:
            ok_comm = ""
    else:
        ok_comm = "All samples yielded the expected number of sequences or more. "

    comm = d['summary'] + fail_comm + ok_comm
    d.update(summary=comm)
    d.update(yieldtable=tab.draw())
    return d
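# The dict returned by generate_report() maps template placeholders to
# reStructuredText fragments (Texttable output and image directives). A
# minimal sketch of turning it into a report file, assuming a plain
# %(name)s-style template; the pipeline's real template mechanism may differ.
def write_report_sketch(d, template_path, out_path):
    # The template is assumed to contain placeholders such as %(infotable)s,
    # %(read1table)s and %(summary)s matching the keys of the returned dict.
    with open(template_path) as f:
        template = f.read()
    with open(out_path, 'w') as out:
        out.write(template % d)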
def generate_report(proj_conf):

    #######
    ### Metadata fetched from the 'Genomics project list' on Google Docs ###
    #######
    uppnex_proj = ''
    min_reads_per_sample = ''
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
        project_id = proj_data.project_id
        queue_date = proj_data.queue_date
        no_samples = proj_data.no_samples
        lanes_plates = proj_data.lanes_plates
        min_reads_per_sample = proj_data.min_reads_per_sample
        customer_reference = proj_data.customer_reference
        application = proj_data.application
        no_finished_samples = proj_data.no_finished_samples
    except:
        print("WARNING: Could not fetch meta data from Google Docs")

    d = {
        'project_id': proj_conf['id'],
        'latex_opt': "",
        'summary': "",
        'infotable': "",
        'lanetable': "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        'yieldtable': "",
        'qualscale': proj_conf['qual_scale'],
        }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n \\setcounter{totalnumber}{8}'
    d.update(latex_opt=floats_per_page)

    ## General info table
    tab = Texttable()
    if not uppnex_proj or len(uppnex_proj) < 4 or uppnex_proj[0:4] != 'b201':
        uppnex_proj = "b201YXXX"
        print "WARNING: Could not find UPPNEX project"

    run_name_comp = proj_conf['flowcell'].split('_')
    simple_run_name = run_name_comp[0] + "_" + run_name_comp[3]
    proj_level_dir = fixProjName(proj_conf['id'])
    instr_id = run_name_comp[1]
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    tab.add_row(["Run name:", proj_conf['flowcell']])
    del_base = "/proj/"
    proj_id = proj_conf['id']
    try:
        if len(customer_reference) > 1:
            proj_id += ' (' + customer_reference + ')'
    except:
        pass
    if len(proj_id) > 30:
        print "Project ID + customer reference too long: ", proj_id
    tab.add_rows([["Project id:", proj_id],
                  ["Date:", fc_date],
                  ["Instrument ID:", instr_id],
                  ["Flow cell ID:", fc_name],
                  ["Uppnex project:", uppnex_proj],
                  ["Delivery directory:", del_base + uppnex_proj + "/INBOX/"
                   + proj_level_dir + "/" + simple_run_name]])
    d.update(infotable=tab.draw())

    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)"])
    for l in proj_conf['lanes']:
        main_proj = l['description'].split(',')[1].strip()
        samples = []
        if 'multiplex' in l:
            for mp in l['multiplex']:
                if 'sample_prj' in mp:
                    if mp['sample_prj'] == proj_conf['id']:
                        samples.append(mp['name'])
            tab.add_row([l['lane'], ", ".join(samples)])
        else:
            tab.add_row([l['lane'], "Non-multiplexed lane"])
    d.update(lanetable=tab.draw())

    tab_r1 = Texttable()
    tab_r2 = Texttable()
    tab_r1.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r2.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r1.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters",
                    "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX",
                    "% error rate", "Comment"])
    tab_r2.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters",
                    "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX",
                    "% error rate", "Comment"])

    # These should be moved to a cfg file (+ perhaps provide an
    # alternative for v1.5 FC).
    if (options.v1_5_fc):
        min_clupf = 300
    else:
        min_clupf = 475
    max_phas = 0.4
    max_prephas = 1.0  # 0.5
    max_mean_err = 2

    statspath = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'],
                             "Data", "reports", "Summary")
    stats = summ.getQCstats(statspath)

    # Check quality criteria and add comments
    comm_r1 = ''
    comm_r2 = ''
    ok_r1 = True
    ok_r2 = True
    ok_cludens_r1 = True
    ok_cludens_r2 = True
    ok_err_rate = True
    ok_err_r1 = True
    ok_err_r2 = True

    for l in proj_conf['lanes']:
        # Cluster densities
        clu_dens_r1 = stats['raw_cluster_dens']['read1'][l['lane']]
        clu_dens_r2 = stats['raw_cluster_dens']['read2'][l['lane']]
        clu_dens_sd_r1 = stats['raw_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_sd_r2 = stats['raw_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_string_r1 = str(clu_dens_r1) + '+/-' + str(clu_dens_sd_r1)
        clu_dens_string_r2 = str(clu_dens_r2) + '+/-' + str(clu_dens_sd_r2)

        # Cluster PF densities
        clu_dens_pf_r1 = stats['pf_cluster_dens']['read1'][l['lane']]
        clu_dens_pf_r2 = stats['pf_cluster_dens']['read2'][l['lane']]
        clu_dens_pf_sd_r1 = stats['pf_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_pf_sd_r2 = stats['pf_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_pf_string_r1 = str(clu_dens_pf_r1) + '+/-' + str(clu_dens_pf_sd_r1)
        clu_dens_pf_string_r2 = str(clu_dens_pf_r2) + '+/-' + str(clu_dens_pf_sd_r2)

        # % PF clusters
        prc_pf_r1 = stats['prc_pf']['read1'][l['lane']]
        prc_pf_r2 = stats['prc_pf']['read2'][l['lane']]
        prc_pf_sd_r1 = stats['prc_pf_sd']['read1'][l['lane']]
        prc_pf_sd_r2 = stats['prc_pf_sd']['read2'][l['lane']]
        prc_pf_string_r1 = str(prc_pf_r1) + '+/-' + str(prc_pf_sd_r1)
        prc_pf_string_r2 = str(prc_pf_r2) + '+/-' + str(prc_pf_sd_r2)

        # % phasing and prephasing
        phas_r1 = stats['phasing']['read1'][l['lane']]
        phas_r2 = stats['phasing']['read2'][l['lane']]
        prephas_r1 = stats['prephasing']['read1'][l['lane']]
        prephas_r2 = stats['prephasing']['read2'][l['lane']]
        phas_string_r1 = str(phas_r1) + '/' + str(prephas_r1)
        phas_string_r2 = str(phas_r2) + '/' + str(prephas_r2)

        # % aligned
        aln_r1 = stats['prc_aligned']['read1'][l['lane']]
        aln_r2 = stats['prc_aligned']['read2'][l['lane']]
        aln_sd_r1 = stats['prc_aligned_sd']['read1'][l['lane']]
        aln_sd_r2 = stats['prc_aligned_sd']['read2'][l['lane']]
        aln_string_r1 = str(aln_r1) + '+/-' + str(aln_sd_r1)
        aln_string_r2 = str(aln_r2) + '+/-' + str(aln_sd_r2)

        # error rate
        err_r1 = stats['error_rate']['read1'][l['lane']]
        err_r2 = stats['error_rate']['read2'][l['lane']]
        err_sd_r1 = stats['error_rate_sd']['read1'][l['lane']]
        err_sd_r2 = stats['error_rate_sd']['read2'][l['lane']]
        err_str_r1 = str(err_r1) + '+/-' + str(err_sd_r1)
        err_str_r2 = str(err_r2) + '+/-' + str(err_sd_r2)

        comm_r1 = ""
        comm_r2 = ""

        # check criteria
        if float(clu_dens_pf_r1[:-1]) < min_clupf:
            ok_r1 = False
            ok_cludens_r1 = False
            comm_r1 += "Low cluster density. "
        if float(clu_dens_pf_r2[:-1]) < min_clupf:
            ok_r2 = False
            ok_cludens_r2 = False
            comm_r2 += "Low cluster density. "
        avg_error_rate = (float(err_r1) + float(err_r2)) / 2
        if avg_error_rate > max_mean_err:
            ok_err_rate = False
        if float(err_r1) > max_mean_err:
            comm_r1 += "High error rate. "
            ok_err_r1 = False
        if float(err_r2) > max_mean_err:
            comm_r2 += "High error rate. "
            ok_err_r2 = False
        if comm_r1 == "":
            comm_r1 = "OK"
        if comm_r2 == "":
            comm_r2 = "OK"

        tab_r1.add_row([l['lane'], clu_dens_string_r1, prc_pf_string_r1,
                        clu_dens_pf_string_r1, phas_string_r1, aln_string_r1,
                        err_str_r1, comm_r1])
        tab_r2.add_row([l['lane'], clu_dens_string_r2, prc_pf_string_r2,
                        clu_dens_pf_string_r2, phas_string_r2, aln_string_r2,
                        err_str_r2, comm_r2])

    # Reinitialize comments for the summary (which will potentially cover
    # several lanes).
    comm_r1 = ""
    comm_r2 = ""

    if not ok_cludens_r1:
        comm_r1 += "Low cluster density. "
    if not ok_cludens_r2:
        comm_r2 += "Low cluster density. "
    if not ok_err_rate:
        if not ok_err_r1:
            ok_r1 = False
            comm_r1 += "High error rate. "
        if not ok_err_r2:
            ok_r2 = False
            comm_r2 += "High error rate. "

    if (ok_r1 and ok_r2):
        comm_r1 = comm_r2 = "OK"
        d.update(summary="Successful run in terms of error rate. ")
    else:
        if (ok_r1):
            comm_r1 = "OK"
            d.update(summary="Read 2 did not pass quality criteria: " + comm_r2)
        elif (ok_r2):
            comm_r2 = "OK"
            d.update(summary="Read 1 did not pass quality criteria: " + comm_r1)
        else:
            d.update(summary="Did not pass quality criteria. Read 1: "
                     + comm_r1 + " Read 2: " + comm_r2)

    d.update(read1table=tab_r1.draw())
    d.update(read2table=tab_r2.draw())

    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'],
                              "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots="\n".join(res))

    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots="\n".join(res))

    ## error rate plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(
            byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate="\n".join(res))

    ## Sequence yield table
    target_yield_per_lane = 143000000.0
    if (options.v1_5_fc):
        target_yield_per_lane = 60000000.0
    tab = Texttable()
    tab.add_row(['Lane', 'Sample', 'Number of sequences',
                 'Million sequences ordered', 'Comment'])

    run_info_yaml = os.path.join(proj_conf['archive_dir'],
                                 proj_conf['flowcell'], "run_info.yaml")
    if not os.path.exists(run_info_yaml):
        print("WARNING: could not find required run_info.yaml configuration "
              "file at '%s'" % run_info_yaml)
        return
    with open(run_info_yaml) as in_handle:
        run_info = yaml.load(in_handle)

    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    low_yield = False
    bc_multiplier = 0.75  # Should move to cfg file

    ok_samples = []
    low_samples = []

    for l in proj_conf['lanes']:
        # The bc metrics file name convention has varied; try both suffixes.
        bc_file_name_prefix = os.path.join(
            proj_conf['analysis_dir'], proj_conf['flowcell'],
            '_'.join([l['lane'], fc_date, fc_name, "nophix_barcode"]),
            '_'.join([l['lane'], fc_date, fc_name, "nophix"]))
        bc_file = bc_file_name_prefix + ".bc_metrics"
        if not os.path.exists(bc_file):
            bc_file = bc_file_name_prefix + "_bc.metrics"
        try:
            bc_file = open(bc_file)
        except:
            sys.exit("Could not find bc metrics file " + bc_file)
        bc_count = {}
        for line in bc_file:
            c = line.strip().split()
            bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1]) / 1000000))) + " million)"
        # Don't count the 'unmatched' entry as a sample.
        no_samples = len(bc_count) - 1
        if no_samples == 0:
            print("WARNING: did not find a BC metrics file... "
                  "Skipping lane %s for %s" % (l['lane'], proj_conf['id']))
            continue

        target_yield_per_sample = ''
        try:
            min_reads_per_sample = round(float(str(min_reads_per_sample)))
            target_yield_per_sample = min_reads_per_sample * 1000000
        except ValueError:
            min_reads_per_sample = ''
            target_yield_per_sample = bc_multiplier * target_yield_per_lane / no_samples

        sample_name = {}
        is_multiplexed = True
        is_rerun = False
        # Check here for each sample if it belongs to the project
        for entry in run_info:
            if entry['lane'] == l['lane']:
                projs = set()
                if 'multiplex' in entry:
                    for sample in entry['multiplex']:
                        if 'sample_prj' in sample:
                            projs.add(sample['sample_prj'])
                            if sample['sample_prj'].strip() == proj_conf['id']:
                                sample_name[sample['barcode_id']] = sample['name']
                else:
                    is_multiplexed = False
                if len(projs) > 1:
                    is_rerun = True

        samp_count = {}
        for k in bc_count.keys():
            if not k.isdigit():
                pass
            else:
                if int(k) in sample_name:
                    samp_count[sample_name[int(k)]] = bc_count[k]

        print "DEBUG: Target yield per sample = ", target_yield_per_sample
        print "DEBUG: Min reads per sample = ", min_reads_per_sample
        print "DEBUG: No samples: ", no_samples

        for k in sorted(samp_count.keys()):
            comment = ''
            if int(samp_count[k].split('(')[0]) < target_yield_per_sample:
                comment = 'Low. '
                low_yield = True
                low_samples.append(k)
            else:
                ok_samples.append(k)
            if is_rerun:
                comment += '(rerun lane)'
            tab.add_row([l['lane'], k, samp_count[k], min_reads_per_sample, comment])

        if is_multiplexed:
            comment = ''
            try:
                if int(bc_count['unmatched'].split('(')[0]) > target_yield_per_sample:
                    comment = 'High.'
                if is_rerun:
                    comment += '(rerun lane)'
                tab.add_row([l['lane'], 'unmatched', bc_count['unmatched'],
                             min_reads_per_sample, comment])
            except:
                print('WARNING: insufficient or no barcode metrics for lane')
        else:
            comment = ''
            for k in bc_count.keys():
                if int(bc_count[k].split('(')[0]) < bc_multiplier * target_yield_per_lane:
                    comment = 'Low.'
                tab.add_row([l['lane'], "Non-multiplexed lane", bc_count[k],
                             min_reads_per_sample, comment])

    delivery_type = "Final delivery. "
    if low_yield:
        delivery_type = "Partial delivery. "
        fail_comm = ("Samples " + ", ".join(low_samples)
                     + " yielded fewer sequences than expected. These will be "
                     "re-run unless this was already a re-run and the total "
                     "yield is now sufficient. ")
    else:
        fail_comm = ""

    if low_yield:
        if len(ok_samples) > 0:
            ok_comm = ("Samples " + ", ".join(ok_samples)
                       + " yielded the expected number of sequences or more. ")
        else:
            ok_comm = ""
    else:
        ok_comm = "All samples yielded the expected number of sequences or more. "

    comm = d['summary'] + fail_comm + ok_comm
    d.update(summary=comm)
    d.update(yieldtable=tab.draw())
    return d
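# The bc metrics files parsed above are assumed to be plain whitespace-
# separated "<barcode_id> <read_count>" lines (plus an 'unmatched' row), e.g.
#
#   1   14210608
#   2   13607532
#   unmatched   901244
#
# A small self-contained illustration of the same count-formatting logic:
def format_bc_counts(lines):
    bc_count = {}
    for line in lines:
        c = line.strip().split()
        # Keep the raw count plus a human-readable rounded-million suffix.
        bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1]) / 1000000))) + " million)"
    return bc_count

# format_bc_counts(["1 14210608", "unmatched 901244"]) returns
# {'1': '14210608 (~14 million)', 'unmatched': '901244 (~1 million)'}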