def _filter_out_genomes(data):
    """Strip reads aligning to unwanted genomes listed in run_info.yaml.

    For each genome in the comma-separated ``filter_out_genomes``
    directive, run bowtie against that reference and keep only the
    unaligned read pairs (``--un``) in a ``-stripped-<genome>`` fastq.

    data -- sample dictionary; reads "sam_ref", "name", "fastq1",
    "fastq2" and (optionally) "filter_out_genomes".
    """
    sam_ref = data["sam_ref"]
    log.info("Removing genome from sample %s" % str(data["name"]))
    try:
        # NOTE(review): original comment says this directive should come
        # from run_info.yaml; confirm it is propagated into `data`.
        genomes = data["filter_out_genomes"]
    except KeyError:
        # Keep the try narrow so only the missing directive is handled,
        # not unrelated KeyErrors from the bowtie loop below.
        log.error("Not removing genomes, directive filter_out_genomes undefined in run_info.yaml")
        return
    if genomes:
        for genome in genomes.split(","):
            # Fix: original referenced an undefined local `fastq1` here.
            (out_file, ext) = os.path.splitext(os.path.basename(data["fastq1"]))
            out_file = out_file + "-stripped-" + genome + ext
            cl = ["bowtie", "--solexa1.3-quals",
                  "--un", out_file,
                  sam_ref,
                  "-1", data["fastq1"],
                  "-2", data["fastq2"],
                  "/dev/null"]
            log.info("Running %s" % cl)
            subprocess.check_call(cl)
def process_lane(info, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.

    Returns a list of per-(sub)sample tuples ready for alignment.
    """
    config = _update_config_w_custom(config, info)
    sample_name = info.get("description", "")
    short_name = info.get("name", "")
    if config["algorithm"].get("include_short_name", True) and short_name:
        sample_name = "%s---%s" % (short_name, sample_name)
    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", None)
    log.info("Processing sample: %s; lane %s; reference genome %s; "
             "researcher %s; analysis method %s"
             % (sample_name, info["lane"], genome_build,
                info.get("researcher", ""), info.get("analysis", "")))
    if multiplex:
        log.debug("Sample %s multiplexed as: %s" % (sample_name, multiplex))
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], info, fc_name)
    lane_name = "%s_%s_%s" % (info['lane'], fc_date, fc_name)
    lane_items = []
    for bc_name, bc_sample, fq1, fq2 in split_by_barcode(
            full_fastq1, full_fastq2, multiplex, lane_name, dirs, config):
        # Barcoded sub-samples get their own lane name suffix.
        if bc_name:
            cur_lane = "%s_%s" % (lane_name, bc_name)
        else:
            cur_lane = lane_name
        if bc_sample is None:
            bc_sample = "%s---%s" % (sample_name, bc_name)
        lane_items.append((fq1, fq2, genome_build, cur_lane, bc_sample,
                           dirs, config))
    return lane_items
def process_sample(sample_name, fastq_files, info, bam_files, dirs, config, config_file):
    """Finalize processing for a sample, potentially multiplexed.

    Merges per-lane BAMs, then optionally recalibrates, genotypes,
    predicts variant effects, assembles transcripts, and writes
    alignment summary / wig files.

    Returns [sample_name, fastq_files, info, sort_bam, gatk_bam,
    vrn_file, effects_file].
    """
    config = _update_config_w_custom(config, info)
    genome_build = info.get("genome_build", None)
    (_, sam_ref) = get_genome_ref(genome_build, config["algorithm"]["aligner"], dirs["galaxy"])
    fastq1, fastq2 = combine_fastq_files(fastq_files, dirs["work"])
    log.info("Combining and preparing wig file %s" % str(sample_name))
    sort_bam = merge_bam_files(bam_files, dirs["work"], config)
    # Downstream result files default to empty strings when the
    # corresponding step is disabled in the configuration.
    (gatk_bam, vrn_file, effects_file) = ("", "", "")
    if config["algorithm"]["recalibrate"]:
        log.info("Recalibrating %s with GATK" % str(sample_name))
        gatk_bam = recalibrate_quality(sort_bam, fastq1, fastq2, sam_ref, dirs, config)
        # NOTE(review): SNP calling only runs when recalibration is
        # enabled (it uses the recalibrated BAM) -- confirm intended.
        if config["algorithm"]["snpcall"]:
            log.info("SNP genotyping %s with GATK" % str(sample_name))
            vrn_file = run_genotyper(gatk_bam, sam_ref, config)
            log.info("Calculating variation effects for %s" % str(sample_name))
            effects_file = variation_effects(vrn_file, genome_build, config)
    if config["algorithm"].get("transcript_assemble", False):
        tx_file = assemble_transcripts(sort_bam, sam_ref, config)
    if sam_ref is not None:
        log.info("Generating summary files: %s" % str(sample_name))
        generate_align_summary(sort_bam, fastq2 is not None, sam_ref, sample_name, config, dirs)
    bam_to_wig(sort_bam, config, config_file)
    return [sample_name, fastq_files, info, sort_bam, gatk_bam, vrn_file, effects_file]
def main(run_name, gdocs_spreadsheet, encoded_credentials_file, run_info_yaml,
         analysis_dir, archive_dir, gdocs_worksheet, gdocs_projects_folder,
         append, split_on_project):
    """Collect barcode metrics for a run and push reports to Google Docs."""
    log.info("Processing run: %s" % run_name)
    # Default the configuration file to run_info.yaml in the archive dir.
    if not run_info_yaml:
        run_info_yaml = os.path.join(archive_dir, "run_info.yaml")
        log.info("No configuration file supplied, assuming it is '%s'" % run_info_yaml)
    if not os.path.exists(run_info_yaml):
        log.warn("Could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return
    with open(run_info_yaml) as in_handle:
        run_info = {'details': yaml.load(in_handle)}
    # Read the encoded Google Docs credentials.
    if not os.path.exists(encoded_credentials_file):
        log.warn("The Google Docs credentials file could not be found. No demultiplex data was written")
        return
    with open(encoded_credentials_file) as cred_handle:
        gdocs_credentials = cred_handle.read().strip()
    fc_name, fc_date = get_flowcell_info(run_name)
    # Barcode statistics for the flowcell.
    bc_metrics = get_bc_stats(fc_date, fc_name, analysis_dir, run_info)
    # Run-level report, then the optional per-project summary.
    write_run_report_to_gdocs(fc_date, fc_name, bc_metrics, gdocs_spreadsheet,
                              gdocs_credentials, gdocs_worksheet, append,
                              split_on_project)
    if gdocs_projects_folder:
        write_project_report_to_gdocs(fc_date, fc_name, bc_metrics,
                                      gdocs_credentials, gdocs_projects_folder)
def process_alignment(fastq1, fastq2, genome_build, lane_name, sample, dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    # Nothing to do without an aligner or an input fastq file.
    if not aligner:
        return
    if not os.path.exists(fastq1):
        return
    log.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
    align_to_sort_bam(fastq1, fastq2, genome_build, aligner,
                      lane_name, sample, dirs, config)
def long_term_storage(remote_info, config_file):
    """Copy a finished run to long term storage, logging the transfer."""
    config = load_config(config_file)
    handler = create_log_handler(config, log.name)
    with handler.applicationbound():
        log.info("Copying run data over to remote storage: %s" % config["store_host"])
        log.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
        _copy_for_storage(remote_info, config)
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc, dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    # Only align when an aligner is configured and the input exists.
    if aligner and os.path.exists(fastq1):
        log.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, info["genome_build"],
                                    aligner, lane_name, lane_desc, dirs, config)
    return [{"fastq": [fastq1, fastq2],
             "out_bam": out_bam,
             "info": info,
             "config": config}]
def main(flowcell_id, archive_dir, analysis_dir):
    """Generate per-project Sphinx delivery notes for a flowcell.

    Groups the flowcell's lanes by the project id found in each lane
    description, renders one mako/rst note per project, and registers
    the notes in the Sphinx conf.py latex_documents list.
    """
    print(" ".join([flowcell_id, archive_dir, analysis_dir]))
    fp = os.path.join(archive_dir, flowcell_id, "run_info.yaml")
    with open(fp) as in_handle:
        run_info = yaml.load(in_handle)
    # Group lanes by project id (second comma-separated field of the
    # lane description). 'in' replaces the deprecated dict.has_key.
    project_ids = dict()
    for lane in run_info:
        (l, proj_id) = [x.strip() for x in lane['description'].split(",")]
        if proj_id in project_ids:
            project_ids[proj_id].append(lane)
        else:
            project_ids[proj_id] = [lane]
    sphinx_defs = []
    for k in project_ids.keys():
        lanes = [x['lane'] for x in project_ids[k]]
        log.info("saw project %s in lanes %s" % (k, ", ".join(lanes)))
        sphinx_defs.append("('%s', '%s_delivery.tex', 'Delivery note', u'Scilife', 'manual'),\n" % (k, k))
        projectfile = "%s.mako" % (k)
        # with-statements guarantee the handles are closed (the original
        # left them open on render errors).
        with open(projectfile, "w") as out_handle:
            out_handle.write(TEMPLATE)
        mylookup = TemplateLookup(directories=['./'])
        tmpl = Template(filename=projectfile, lookup=mylookup)
        proj_conf = {
            'id': k,
            'lanes': project_ids[k],
            'archive_dir': archive_dir,
            'analysis_dir': analysis_dir,
            'flowcell': flowcell_id,
        }
        d = generate_report(proj_conf)
        rstfile = "%s.rst" % (k)
        with open(rstfile, "w") as out_handle:
            out_handle.write(tmpl.render(**d))
    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        log.warn("no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
        return
    with open(sphinxconf) as in_handle:
        lines = in_handle.readlines()
    new_defs = [sd for sd in sphinx_defs if sd not in lines]
    if new_defs:
        # Insert the new document definitions three lines into the
        # latex_documents block.
        i = lines.index("latex_documents = [\n")
        newconf = lines[:i + 3] + new_defs + lines[i + 3:]
        with open("conf.py", "w") as out_handle:
            out_handle.write("".join(newconf))
def write_run_report_to_gdocs(fc_date, fc_name, bc_metrics, ssheet_title,
                              encoded_credentials, wsheet_title=None,
                              append=False, split_project=False):
    """Upload the barcode read distribution for a run to google docs"""
    # Connect to google and get the spreadsheet
    client, ssheet = get_spreadsheet(ssheet_title, encoded_credentials)
    if not (client and ssheet):
        return False
    # Flatten the metrics structure and list the projects it covers
    rows = _structure_to_list(bc_metrics)
    projects = _get_unique_project_names(rows)
    log.info("The run contains data from: '%s'" % "', '".join(projects))
    # Locate the columns holding the raw count and the millions count
    count_idx = -1
    millions_idx = -1
    for pos, column in enumerate(BARCODE_STATS_HEADER):
        if column[1] == "barcode_read_count":
            count_idx = pos
        elif column[1] == "barcode_read_count_millions":
            millions_idx = pos
    # Fill in read counts in millions for convenience
    if count_idx >= 0 and millions_idx >= 0:
        for row in rows:
            try:
                row[millions_idx] = unicode(round(int(row[count_idx]) / 1000000.0, 2))
            except ValueError:
                pass
    success = True
    if split_project:
        # One worksheet per project, holding only that project's rows
        for title in projects:
            success &= _write_to_worksheet(client, ssheet, title,
                                           _apply_filter(rows, [title]),
                                           BARCODE_STATS_HEADER, append)
    else:
        # Single worksheet named <date>_<flowcell> unless a title was given
        if wsheet_title is None:
            wsheet_title = "%s_%s" % (fc_date, fc_name)
        success &= _write_to_worksheet(client, ssheet, wsheet_title, rows,
                                       BARCODE_STATS_HEADER, append)
    return success
def _handle_data(src, tgt, f=shutil.copyfile): if src is None: return if os.path.exists(tgt): log.warn("%s already exists: not doing anything!" %(tgt)) return if options.dry_run: print "DRY_RUN: %s file %s to %s" % (f.__name__, src, tgt) else: log.info("%s file %s to %s" % (f.__name__, src, tgt)) f(src, tgt)
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.
    """
    log.info("Realigning %s with GATK" % str(data["name"]))
    if data["config"]["algorithm"]["snpcall"]:
        config = data["config"]
        ref_file = data["sam_ref"]
        # Replace the working BAM with the indel-realigned version.
        data["work_bam"] = gatk_realigner(
            data["work_bam"], ref_file, config,
            configured_ref_file("dbsnp", config, ref_file),
            region, out_file)
    return [data]
def _get_run_info(fc_name, fc_date, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.
    """
    # Prefer a local samplesheet when one is available.
    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            details = yaml.load(in_handle)
        return dict(details=details, run_id="")
    log.info("Fetching run details from Galaxy instance")
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    return galaxy_api.run_details(fc_name, fc_date)
def get_run_info(fc_dir, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.
    """
    use_yaml = run_info_yaml and os.path.exists(run_info_yaml)
    if use_yaml:
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        fc_name, fc_date, run_info = _run_info_from_yaml(fc_dir, run_info_yaml)
    else:
        log.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(fc_dir)
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    return fc_name, fc_date, _organize_runs_by_lane(run_info)
def prune_run_info_by_description(run_info, desc, lanes):
    """Prune a run_info lane list by description substring or lane numbers.

    run_info -- iterable of lane dictionaries with 'description' and 'lane'.
    desc -- keep lanes whose description contains this substring, or every
            lane when desc == "ALL"; takes precedence over lanes.
    lanes -- comma-separated lane identifiers to keep, used when desc is None.

    Returns the filtered list of lane dictionaries.
    """
    run_info_ret = list()
    for info in run_info:
        # 'x is not None' replaces the non-idiomatic 'not x is None'.
        if desc is not None:
            if desc == "ALL" or info['description'].find(desc) > -1:
                log.info("Found %s in run_info for lane %s, description: %s"
                         % (desc, info['lane'], info['description']))
                run_info_ret.append(info)
        elif lanes is not None:
            if info['lane'] in lanes.split(","):
                log.info("Appending lane %s to output" % (info['lane']))
                run_info_ret.append(info)
    return run_info_ret
def _write_to_worksheet(client, ssheet, wsheet_title, rows, header, append):
    """Generic method to write a set of rows to a worksheet on google docs"""
    # Worksheet titles must be unicode
    wsheet_title = _to_unicode(wsheet_title)
    # Create (or, with append, reuse/replace) the target worksheet
    wsheet = bcbio.google.spreadsheet.add_worksheet(
        client, ssheet, wsheet_title, len(rows) + 1, len(header), append)
    if wsheet is None:
        log.info("Could not add a worksheet '%s' to spreadsheet '%s'" % (wsheet_title, ssheet.title.text))
        return False
    log.info("Adding data to the '%s' worksheet" % (wsheet_title))
    column_names = [col_header[0] for col_header in header]
    return bcbio.google.spreadsheet.write_rows(client, ssheet, wsheet,
                                               column_names, rows)
def create_bc_report_on_gdocs(fc_date, fc_name, work_dir, run_info, config): """Get the barcode read distribution for a run and upload to google docs""" # Get the required parameters from the post_process.yaml configuration file gdocs = config.get("gdocs_upload", None) if not gdocs: log.info("No GDocs upload section specified in config file, will not upload demultiplex data") return # Get the GDocs demultiplex result file title gdocs_spreadsheet = gdocs.get("gdocs_dmplx_file", None) if not gdocs_spreadsheet: log.warn( "Could not find Google Docs demultiplex results file title in config. No demultiplex counts were written to Google Docs" ) return # Get the account credentials encoded_credentials = "" encoded_credentials_file = gdocs.get("gdocs_credentials", None) if not encoded_credentials_file: log.warn("Could not find Google Docs account credentials. No demultiplex report was written") return # Check if the credentials file exists if not os.path.exists(encoded_credentials_file): log.warn("The Google Docs credentials file could not be found. No demultiplex data was written") return with open(encoded_credentials_file) as fh: encoded_credentials = fh.read().strip() # Get the barcode statistics. Get a deep copy of the run_info since we will modify it bc_metrics = get_bc_stats(fc_date, fc_name, work_dir, copy.deepcopy(run_info)) # Upload the data write_run_report_to_gdocs(fc_date, fc_name, bc_metrics, gdocs_spreadsheet, encoded_credentials) # Get the projects parent folder projects_folder = gdocs.get("gdocs_projects_folder", None) # Write the bc project summary report if projects_folder: write_project_report_to_gdocs(fc_date, fc_name, bc_metrics, encoded_credentials, projects_folder)
def process_lane(info, config, dirs):
    """Models bcbio process lane"""
    sample_name = info.get("description", "")
    genome_build = info.get("genome_build", None)
    multiplex = info.get('multiplex', None)
    log.info("Processing sample: %s; lane %s; reference genome %s"
             % (sample_name, info["lane"], genome_build))
    if multiplex:
        log.debug("Sample %s is multiplexed as: %s" % (sample_name, multiplex))
    fq = get_barcoded_fastq_files(multiplex, info, dirs['fc_dir'],
                                  config['fc_name'], config['fc_date'])
    ## Move data along with fastq files
    fc_bc_dir = os.path.join(config['data_delivery_dir'],
                             "%s_%s_%s_barcode" % (info['lane'],
                                                   config['fc_date'],
                                                   config['fc_name']))
    _make_dir(fc_bc_dir, "fastq.txt barcode directory")
    if not options.only_fastq:
        data, fastqc = _get_analysis_results(config, dirs, info['lane'])
        _deliver_data(data, fastqc, config['data_delivery_dir'])
    # Deliver every fastq file of every pair (plain loop instead of a
    # side-effect list comprehension).
    for fqpair in fq:
        for fq_src in fqpair:
            _deliver_fastq_file(fq_src, os.path.basename(fq_src), fc_bc_dir)
def write_project_report_to_gdocs(fc_date, fc_name, project_bc_metrics, encoded_credentials, gdocs_folder=""): """Upload the sample read distribution for a project to google docs""" # Create a client class which will make HTTP requests with Google Docs server. client = bcbio.google.spreadsheet.get_client(encoded_credentials) doc_client = bcbio.google.document.get_client(encoded_credentials) # Get a reference to the parent folder parent_folder = bcbio.google.document.get_folder(doc_client, gdocs_folder) # Group the barcode data by project grouped = group_bc_stats(project_bc_metrics) # Loop over the projects and write the project summary for each for pdata in grouped: project_name = pdata.get("project_name", "") ssheet_title = project_name + "_sequencing_results" ssheet = bcbio.google.spreadsheet.get_spreadsheet(client, ssheet_title) if not ssheet: bcbio.google.document.add_spreadsheet(doc_client, ssheet_title) ssheet = bcbio.google.spreadsheet.get_spreadsheet(client, ssheet_title) _write_project_report_to_gdocs(client, ssheet, fc_date, fc_name, pdata) _write_project_report_summary_to_gdocs(client, ssheet) # Just to make it look a bit nicer, remove the default 'Sheet1' worksheet wsheet = bcbio.google.spreadsheet.get_worksheet(client, ssheet, "Sheet 1") if wsheet: client.DeleteWorksheet(wsheet) folder_name = project_name folder = bcbio.google.document.get_folder(doc_client, folder_name) if not folder: log.info("creating folder '%s'" % _from_unicode(folder_name)) folder = bcbio.google.document.add_folder(doc_client, folder_name, parent_folder) ssheet = bcbio.google.document.move_to_folder(doc_client, ssheet, folder) log.info( "'%s' spreadsheet written to folder '%s'" % (_from_unicode(ssheet.title.text), _from_unicode(folder_name)) )
def make_lane_items(info, fc_date, fc_name, dirs, config):
    """Build per-sample alignment work tuples for a (multiplexed) lane."""
    sample_name = info.get("description", "")
    short_name = info.get("name", "")
    if config["algorithm"].get("include_short_name", True) and short_name:
        sample_name = "%s---%s" % (short_name, sample_name)
    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", "")
    log.info("Processing sample: %s; lane %s; reference genome %s; "
             "researcher %s; analysis method %s"
             % (sample_name, info["lane"], genome_build,
                info.get("researcher", ""), info.get("analysis", "")))
    lane_items = []
    if multiplex:
        log.debug("Sample %s is multiplexed as: %s" % (sample_name, multiplex))
        mitems = get_multiplex_items(multiplex, info['lane'], dirs['fc_dir'],
                                     fc_name, fc_date)
        for fastq1, fastq2, mlane_name, msample in mitems:
            lane_items.append((fastq1, fastq2, genome_build, mlane_name,
                               msample, dirs, config))
    # TODO: Not multiplex: what to do?
    return lane_items
def get_spreadsheet(ssheet_title, encoded_credentials):
    """Connect to Google docs and get a spreadsheet"""
    # Spreadsheet titles must be unicode
    ssheet_title = _to_unicode(ssheet_title)
    # Create a client class which will make HTTP requests with Google Docs server.
    client = bcbio.google.spreadsheet.get_client()
    bcbio.google.connection.authenticate(client, encoded_credentials)
    # Locate the spreadsheet and verify we got a result back
    ssheet = bcbio.google.spreadsheet.get_spreadsheet(client, ssheet_title)
    if ssheet:
        log.info("Found spreadsheet matching the supplied title: '%s'" % (ssheet.title.text))
        return (client, ssheet)
    log.warn("No document with specified title '%s' found in GoogleDocs repository" % ssheet_title)
    return (None, None)
def _remote_copy(remote_info, config):
    """Securely copy files from remote directory to the processing server.

    This requires ssh public keys to be setup so that no password entry
    is necessary.
    """
    fc_dir = os.path.join(config["analysis"]["store_dir"],
                          os.path.basename(remote_info['directory']))
    log.info("Copying analysis files to %s" % fc_dir)
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)
    for item in remote_info['to_copy']:
        dest = os.path.join(fc_dir, item)
        # Skip anything already copied over.
        if fabric_files.exists(dest):
            continue
        dest_dir = os.path.dirname(dest)
        if not fabric_files.exists(dest_dir):
            fabric.run("mkdir -p %s" % dest_dir)
        source = "%s@%s:%s/%s" % (remote_info["user"], remote_info["hostname"],
                                  remote_info["directory"], item)
        fabric.run(" ".join(["scp", "-r", source, dest]))
    log.info("Analysis files copied")
    return fc_dir
def _copy_for_storage(remote_info, config):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be setup so that no password entry
    is necessary, Fabric is used to manage setting up copies on the
    remote storage server.
    """
    log.info("Copying run data over to remote storage: %s" % config["store_host"])
    log.debug("The contents from AMQP for this dataset are:\n %s" % remote_info)
    fabric.env.host_string = "%s@%s" % (config["store_user"], config["store_host"])
    fc_dir = os.path.join(config["store_dir"],
                          os.path.basename(remote_info['directory']))
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)
    for item in remote_info['to_copy']:
        dest = os.path.join(fc_dir, item)
        # Skip anything already present on the storage host.
        if fabric_files.exists(dest):
            continue
        dest_dir = os.path.dirname(dest)
        if not fabric_files.exists(dest_dir):
            fabric.run("mkdir -p %s" % dest_dir)
        source = "%s@%s:%s/%s" % (remote_info["user"], remote_info["hostname"],
                                  remote_info["directory"], item)
        fabric.run(" ".join(["scp", "-r", source, dest]))
def _make_dir(dir, label):
    """Create directory `dir` if needed, logging the action with `label`.

    Uses EAFP (try/except around makedirs) instead of an exists check to
    avoid the race where another process creates the directory between
    the check and the creation.
    """
    try:
        os.makedirs(dir)
        log.info("Creating %s directory %s" % (label, dir))
    except OSError:
        # Re-raise real failures (permissions etc.); only an
        # already-existing directory is expected here.
        if not os.path.isdir(dir):
            raise
        log.warn("%s already exists: not creating new directory" % (dir))
def main(flowcell_id, archive_dir, analysis_dir, config_file):
    """Generate per-project Sphinx delivery notes for a flowcell.

    Groups lanes (and multiplexed sub-project samples) by project id,
    renders one mako/rst delivery note per project, and registers the
    notes in the Sphinx conf.py latex_documents list.
    """
    print(" ".join([flowcell_id, archive_dir, analysis_dir]))
    fp = os.path.join(archive_dir, flowcell_id, "run_info.yaml")
    with open(fp) as in_handle:
        run_info = yaml.load(in_handle)
    if config_file:
        with open(config_file) as in_handle:
            config = yaml.load(in_handle)
    else:
        config = {}
    # Group lanes by project id; 'in' replaces the deprecated has_key,
    # and proj_id avoids shadowing the builtin `id`.
    project_ids = dict()
    for lane in run_info:
        (l, proj_id) = [x.strip() for x in lane['description'].split(",")]
        if proj_id in project_ids:
            if lane not in project_ids[proj_id]:
                project_ids[proj_id].append(lane)
        else:
            project_ids[proj_id] = [lane]
        # Check here if project is a "sub project" of the lane
        if 'multiplex' not in lane:
            continue
        for s in lane['multiplex']:
            if 'description' in s:
                if s['description'] in project_ids:
                    if lane not in project_ids[s['description']]:
                        project_ids[s['description']].append(lane)
                else:
                    project_ids[s['description']] = [lane]
    # Hoisted out of the loop: the flowcell name/date do not change.
    fc_name, fc_date = get_flowcell_info(flowcell_id)
    sphinx_defs = []
    for k in project_ids.keys():
        lanes = [x['lane'] for x in project_ids[k]]
        proj_file_tag = k + "_" + fc_date + fc_name[0]
        log.info("saw project %s in lanes %s" % (k, ", ".join(lanes)))
        sphinx_defs.append("('%s', '%s_delivery.tex', 'Raw data delivery note', u'SciLifeLab Stockholm', 'howto'),\n" % (proj_file_tag, proj_file_tag))
        projectfile = "%s.mako" % (proj_file_tag)
        # with-statements guarantee the handles are closed even when
        # report generation fails.
        with open(projectfile, "w") as out_handle:
            out_handle.write(TEMPLATE)
        mylookup = TemplateLookup(directories=['./'])
        tmpl = Template(filename=projectfile, lookup=mylookup)
        proj_conf = {
            'id': k,
            'lanes': project_ids[k],
            'archive_dir': archive_dir,
            'analysis_dir': analysis_dir,
            'flowcell': flowcell_id,
            'config': config,
        }
        d = generate_report(proj_conf)
        rstfile = "%s.rst" % (proj_file_tag)
        with open(rstfile, "w") as out_handle:
            out_handle.write(tmpl.render(**d))
    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        log.warn("no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
        return
    with open(sphinxconf) as in_handle:
        lines = in_handle.readlines()
    new_defs = [sd for sd in sphinx_defs if sd not in lines]
    if new_defs:
        # Insert the new document definitions three lines into the
        # latex_documents block.
        i = lines.index("latex_documents = [\n")
        newconf = lines[:i + 3] + new_defs + lines[i + 3:]
        with open("conf.py", "w") as out_handle:
            out_handle.write("".join(newconf))
def generate_report(proj_conf):
    """Assemble the data dictionary used to render a delivery note.

    proj_conf -- dict with 'id', 'lanes', 'archive_dir', 'analysis_dir',
    'flowcell' and 'config' keys describing one project on a flowcell.

    Returns a dict of report fields (summary, tables, plot includes)
    for the mako/rst template, or None when run_info.yaml is missing.
    """
    #######
    ### Metadata fetched from the 'Genomics project list' on Google Docs
    ###
    proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
    uppnex_proj = proj_data.uppnex_id
    project_id = proj_data.project_id
    queue_date = proj_data.queue_date
    no_samples = proj_data.no_samples
    lanes_plates = proj_data.lanes_plates
    min_reads_per_sample = proj_data.min_reads_per_sample
    customer_reference = proj_data.customer_reference
    application = proj_data.application
    no_finished_samples = proj_data.no_finished_samples
    # Template fields, filled in section by section below.
    d = {
        'project_id': proj_conf['id'],
        'latex_opt': "",
        'summary': "",
        'infotable': "",
        'lanetable': "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        'yieldtable': "",
    }
    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n \setcounter{totalnumber}{8}'
    d.update(latex_opt = floats_per_page)
    ## General info table
    tab = Texttable()
    # Fall back to a placeholder when the Uppnex id is missing/malformed.
    if not uppnex_proj or len(uppnex_proj) < 4 or uppnex_proj[0:4] != 'b201':
        uppnex_proj = "b201YXXX"
    run_name_comp = proj_conf['flowcell'].split('_')
    simple_run_name = run_name_comp[0] + run_name_comp[3][0]
    instr_id = run_name_comp[1]
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    tab.add_row(["Run name:", proj_conf['flowcell']])
    tab.add_rows([["Project id:", proj_conf['id']],
                  ["Date:", fc_date],
                  ["Instrument ID:", instr_id],
                  ["Flow cell ID:", fc_name],
                  ["Uppnex project:", uppnex_proj],
                  ["Delivery directory:", "/bubo/proj/" + uppnex_proj + "/INBOX/20" + simple_run_name + "_hiseq2000"]])
    d.update(infotable=tab.draw())
    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)"])
    for l in proj_conf['lanes']:
        # The second description field names the lane's main project.
        main_proj = l['description'].split(',')[1].strip()
        if main_proj == proj_conf['id']:
            is_main_proj = True
        else:
            is_main_proj = False
        samples = []
        if l.has_key('multiplex'):
            for mp in l['multiplex']:
                if mp.has_key('description'):
                    # Explicitly tagged samples belong to the tagged project.
                    if mp['description'] == proj_conf['id']:
                        samples.append(mp['name'])
                elif is_main_proj:
                    samples.append(mp['name'])
            tab.add_row([l['lane'], ", ".join(samples)])
        else:
            tab.add_row([l['lane'], "Non-multiplexed lane"])
    d.update(lanetable=tab.draw())
    ## Read-level QC tables (one per read of the pair)
    tab_r1 = Texttable()
    tab_r2 = Texttable()
    tab_r1.set_cols_width([2,12,12,12,12,12,12,30])
    tab_r2.set_cols_width([2,12,12,12,12,12,12,30])
    tab_r1.add_row(["Lane", "Clu. dens. #/mm2","% PF clusters","Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])
    tab_r2.add_row(["Lane", "Clu. dens. #/mm2","% PF clusters","Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])
    # These should be moved to a cfg file. ( + perhaps provide an alternative for v1.5 FC )
    if (options.v1_5_fc):
        min_clupf = 300
    else:
        min_clupf = 475
    max_phas = 0.4
    max_prephas = 1.0  # 0.5
    max_mean_err = 2
    statspath = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "Summary")
    stats = summ.getQCstats(statspath)
    # Check quality criteria and add comments
    comm_r1 = ''
    comm_r2 = ''
    ok_r1 = True
    ok_r2 = True
    ok_cludens_r1 = True
    ok_cludens_r2 = True
    ok_phasing_r1 = True
    ok_phasing_r2 = True
    ok_prephasing_r1 = True
    ok_prephasing_r2 = True
    ok_err_rate = True
    ok_err_r1 = True
    ok_err_r2 = True
    for l in proj_conf['lanes']:
        # Cluster densities
        clu_dens_r1 = stats['raw_cluster_dens']['read1'][l['lane']]
        clu_dens_r2 = stats['raw_cluster_dens']['read2'][l['lane']]
        clu_dens_sd_r1 = stats['raw_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_sd_r2 = stats['raw_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_string_r1 = str(clu_dens_r1) + '+/-' + str(clu_dens_sd_r1)
        clu_dens_string_r2 = str(clu_dens_r2) + '+/-' + str(clu_dens_sd_r2)
        # Cluster PF densities
        clu_dens_pf_r1 = stats['pf_cluster_dens']['read1'][l['lane']]
        clu_dens_pf_r2 = stats['pf_cluster_dens']['read2'][l['lane']]
        clu_dens_pf_sd_r1 = stats['pf_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_pf_sd_r2 = stats['pf_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_pf_string_r1 = str(clu_dens_pf_r1) + '+/-' + str(clu_dens_pf_sd_r1)
        clu_dens_pf_string_r2 = str(clu_dens_pf_r2) + '+/-' + str(clu_dens_pf_sd_r2)
        # % PF clusters
        prc_pf_r1 = stats['prc_pf']['read1'][l['lane']]
        prc_pf_r2 = stats['prc_pf']['read2'][l['lane']]
        prc_pf_sd_r1 = stats['prc_pf_sd']['read1'][l['lane']]
        prc_pf_sd_r2 = stats['prc_pf_sd']['read2'][l['lane']]
        prc_pf_string_r1 = str(prc_pf_r1) + '+/-' + str(prc_pf_sd_r1)
        prc_pf_string_r2 = str(prc_pf_r2) + '+/-' + str(prc_pf_sd_r2)
        # % phasing and prephasing
        phas_r1 = stats['phasing']['read1'][l['lane']]
        phas_r2 = stats['phasing']['read2'][l['lane']]
        prephas_r1 = stats['prephasing']['read1'][l['lane']]
        prephas_r2 = stats['prephasing']['read2'][l['lane']]
        phas_string_r1 = str(phas_r1) + '/' + str(prephas_r1)
        phas_string_r2 = str(phas_r2) + '/' + str(prephas_r2)
        # % aligned
        aln_r1 = stats['prc_aligned']['read1'][l['lane']]
        aln_r2 = stats['prc_aligned']['read2'][l['lane']]
        aln_sd_r1 = stats['prc_aligned_sd']['read1'][l['lane']]
        aln_sd_r2 = stats['prc_aligned_sd']['read2'][l['lane']]
        aln_string_r1 = str(aln_r1) + '+/-' + str(aln_sd_r1)
        aln_string_r2 = str(aln_r2) + '+/-' + str(aln_sd_r2)
        # error rate
        err_r1 = stats['error_rate']['read1'][l['lane']]
        err_r2 = stats['error_rate']['read2'][l['lane']]
        err_sd_r1 = stats['error_rate_sd']['read1'][l['lane']]
        err_sd_r2 = stats['error_rate_sd']['read2'][l['lane']]
        err_str_r1 = str(err_r1) + '+/-' + str(err_sd_r1)
        err_str_r2 = str(err_r2) + '+/-' + str(err_sd_r2)
        comm_r1 = ""
        comm_r2 = ""
        # check criteria
        # NOTE(review): [:-1] assumes the density values carry a trailing
        # unit character -- confirm against summ.getQCstats output.
        if float(clu_dens_pf_r1[:-1]) < min_clupf:
            ok_r1 = False
            ok_cludens_r1 = False
            comm_r1 += "Low cluster density. "
        if float(clu_dens_pf_r2[:-1]) < min_clupf:
            ok_r2 = False
            ok_cludens_r2 = False
            comm_r2 += "Low cluster density. "
        if float(phas_r1) > max_phas:
            ok_r1 = False
            ok_phasing_r1 = False
            comm_r1 += "High phasing. "
        if float(phas_r2) > max_phas:
            ok_r2 = False
            ok_phasing_r2 = False
            comm_r2 += "High phasing. "
        if float(prephas_r1) > max_prephas:
            ok_r1 = False
            ok_prephasing_r1 = False
            comm_r1 += "High prephasing. "
        if float(prephas_r2) > max_prephas:
            ok_r2 = False
            ok_prephasing_r2 = False
            comm_r2 += "High prephasing. "
        avg_error_rate = (float(err_r1) + float(err_r2))/2
        if avg_error_rate > max_mean_err:
            ok_err_rate = False
        # High per-read error rate is noted but does not by itself fail
        # the read (the ok_rN update is deliberately commented out).
        if float(err_r1) > max_mean_err:
            #ok_r1 = False
            comm_r1 += "High error rate. "
            ok_err_r1 = False
        if float(err_r2) > max_mean_err:
            #ok_r2 = False
            comm_r2 += "High error rate. "
            ok_err_r2 = False
        if comm_r1 == "":
            comm_r1 = "OK"
        if comm_r2 == "":
            comm_r2 = "OK"
        tab_r1.add_row([l['lane'], clu_dens_string_r1, prc_pf_string_r1, clu_dens_pf_string_r1, phas_string_r1, aln_string_r1, err_str_r1, comm_r1])
        tab_r2.add_row([l['lane'], clu_dens_string_r2, prc_pf_string_r2, clu_dens_pf_string_r2, phas_string_r2, aln_string_r2, err_str_r2, comm_r2])
    # Reinitialize comments for the summary. (Which will be for several lanes, potentially)
    comm_r1 = ""
    comm_r2 = ""
    if not ok_cludens_r1:
        comm_r1 += "Low cluster density. "
    if not ok_cludens_r2:
        comm_r2 += "Low cluster density. "
    if not ok_phasing_r1:
        comm_r1 += "High phasing. "
    if not ok_phasing_r2:
        comm_r2 += "High phasing. "
    if not ok_prephasing_r1:
        comm_r1 += "High prephasing. "
    if not ok_prephasing_r2:
        comm_r2 += "High prephasing. "
    # A high mean error rate fails whichever reads exceeded the cutoff.
    if not ok_err_rate:
        if not ok_err_r1:
            ok_r1 = False
            comm_r1 += "High error rate. "
        if not ok_err_r2:
            ok_r2 = False
            comm_r2 += "High error rate. "
    if (ok_r1 and ok_r2):
        comm_r1 = comm_r2 = "OK"
        d.update(summary = "Successful run according to QC criteria. ")
    else:
        if (ok_r1):
            comm_r1 = "OK"
            d.update(summary = "Read 2 did not pass quality criteria: " + comm_r2)
        elif (ok_r2):
            comm_r2 = "OK"
            d.update(summary = "Read 1 did not pass quality criteria: " + comm_r1)
        else:
            d.update(summary = "Did not pass quality criteria. Read 1: " + comm_r1 + " Read 2: " + comm_r2)
    d.update(read1table=tab_r1.draw())
    d.update(read2table=tab_r2.draw())
    ## qcplots
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots= "\n".join(res))
    ## qc30plots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots= "\n".join(res))
    ## qcplots
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate= "\n".join(res))
    ## Sequence yield table
    target_yield_per_lane = 143000000.0
    if (options.v1_5_fc):
        target_yield_per_lane = 60000000.0
    tab = Texttable()
    tab.add_row(['Lane','Sample','Number of sequences','Comment'])
    run_info_yaml = os.path.join(proj_conf['archive_dir'],proj_conf['flowcell'],"run_info.yaml")
    if not os.path.exists(run_info_yaml):
        log.warn("Could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return
    #with open(run_info_yaml) as in_handle:
    #    run_info = {'details': yaml.load(in_handle)}
    with open(run_info_yaml) as in_handle:
        run_info = yaml.load(in_handle)
    # fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    # bc_yield = bc_metrics.get_bc_stats(fc_date,fc_name,proj_conf['analysis_dir'], run_info)
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    low_yield = False
    bc_multiplier = 0.75  # Should move to cfg file
    ok_samples = []
    low_samples = []
    for l in proj_conf['lanes']:
        # Barcode metrics file produced by the demultiplexing step.
        bc_file_name = os.path.join(proj_conf['analysis_dir'], proj_conf['flowcell'], '_'.join([l['lane'], fc_date, fc_name, "barcode"]), '_'.join([l['lane'], fc_date, fc_name, "bc.metrics"]))
        try:
            bc_file = open(bc_file_name)
        except:
            sys.exit("Could not find bc metrics file " + bc_file_name)
        bc_count = {}
        for line in bc_file:
            c = line.strip().split()
            # Value is "<count> (~<N> million)" for readable reporting.
            bc_count[c[0]] = c[1] + ' (~' + str(int(round(float(c[1])/1000000))) + " million)"
        no_samples = len(bc_count)
        if no_samples == 0:
            log.warn("Did not find a BC metrics file... Skipping lane %s for %s" %(l['lane'], proj_conf['id']))
            continue
        target_yield_per_sample = bc_multiplier * target_yield_per_lane / no_samples
        sample_name = {}
        is_multiplexed = True
        is_rerun = False
        # Check here for each sample if it belongs to the project
        for entry in run_info:
            if entry['lane'] == l['lane']:
                is_main_proj = False
                if entry['description'].split(',')[1].strip() == proj_conf['id']:
                    is_main_proj = True
                if entry.has_key('multiplex'):
                    for sample in entry['multiplex']:
                        if sample.has_key('description'):
                            if is_main_proj:
                                log.info('Rerun lane: skipping sample ' + sample['name'] + ' in lane ' + l['lane'] + ' which does not belong to the current project')
                                is_rerun = True
                            else:
                                if sample['description'].strip() == proj_conf['id']:
                                    sample_name[sample['barcode_id']] = sample['name']
                                    is_rerun = True
                        elif is_main_proj:
                            sample_name[sample['barcode_id']] = sample['name']
                else:
                    is_multiplexed = False
        # Map barcode ids in the metrics file to sample names.
        samp_count = {}
        for k in bc_count.keys():
            if not k.isdigit():
                pass
            else:
                if sample_name.has_key(int(k)):
                    samp_count[sample_name[int(k)]] = bc_count[k]
        for k in sorted(samp_count.keys()):
            comment = ''
            if int(samp_count[k].split('(')[0]) < target_yield_per_sample:
                comment = 'Low. '
                low_yield = True
                low_samples.append(k)
            else:
                ok_samples.append(k)
            if is_rerun:
                comment += '(rerun lane)'
            tab.add_row([l['lane'], k, samp_count[k], comment])
        if is_multiplexed:
            comment = ''
            try:
                if int(bc_count['unmatched'].split('(')[0]) > target_yield_per_sample:
                    comment = 'High.'
                if is_rerun:
                    comment += '(rerun lane)'
                tab.add_row([l['lane'], 'unmatched', bc_count['unmatched'], comment])
            except:
                log.warning('Unsufficient or no barcode metrics for lane')
        else:
            comment = ''
            for k in bc_count.keys():
                if int(bc_count[k].split('(')[0]) < bc_multiplier * target_yield_per_lane:
                    comment = 'Low.'
                tab.add_row([l['lane'], "Non-multiplexed lane", bc_count[k], comment])
    # if low_yield:
    #     comm = d['summary'] + " Some samples had low yields."
    #     d.update(summary = comm)
    delivery_type = "Final delivery. "
    if low_yield:
        delivery_type = "Partial delivery. "
        fail_comm = "Samples " + ", ".join(low_samples) + " yielded fewer sequences than expected. These will be re-run unless this was already a re-run and the total yield is now sufficient. "
    else:
        fail_comm = ""
    if low_yield:
        if len(ok_samples) > 0:
            ok_comm = "Samples " + ", ".join(ok_samples) + " yielded the expected number of sequences or more. "
        else:
            ok_comm = ""
    else:
        ok_comm = "All samples yielded the expected number of sequences or more. "
    #comm = delivery_type + d['summary'] + fail_comm + ok_comm
    comm = d['summary'] + fail_comm + ok_comm
    d.update(summary = comm)
    d.update(yieldtable=tab.draw())
    return d