def process_lane(info, fc_name, fc_date, dirs, config):
    """Build the work items for a single lane, split by barcode if needed.

    The sample name starts as ``info["description"]``. When the
    ``include_short_name`` algorithm option is enabled (the default) and
    ``info["name"]`` is non-empty, the name is prepended using ``---``.

    Returns a list of tuples
    ``(fastq1, fastq2, genome_build, lane_name, sample_name, dirs, config)``,
    one for every entry produced by ``split_by_barcode``.
    """
    config = _update_config_w_custom(config, info)

    short_name = info.get("name", "")
    sample_name = info.get("description", "")
    use_short_name = config["algorithm"].get("include_short_name", True)
    if use_short_name and short_name:
        sample_name = "%s---%s" % (short_name, sample_name)

    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", None)
    log.info("Processing sample: %s; lane %s; reference genome %s; "
             "researcher %s; analysis method %s" %
             (sample_name, info["lane"], genome_build,
              info.get("researcher", ""), info.get("analysis", "")))
    if multiplex:
        log.debug("Sample %s multiplexed as: %s" % (sample_name, multiplex))

    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], info, fc_name)
    lane_name = "%s_%s_%s" % (info['lane'], fc_date, fc_name)

    barcode_splits = split_by_barcode(full_fastq1, full_fastq2, multiplex,
                                      lane_name, dirs, config)
    lane_items = []
    for bc_name, bc_sample, bc_fastq1, bc_fastq2 in barcode_splits:
        # Entries without a barcode name keep the plain lane name.
        if bc_name:
            bc_lane_name = "%s_%s" % (lane_name, bc_name)
        else:
            bc_lane_name = lane_name
        # Fall back to a name derived from the lane-level sample name.
        if bc_sample is None:
            bc_sample = "%s---%s" % (sample_name, bc_name)
        lane_items.append((bc_fastq1, bc_fastq2, genome_build, bc_lane_name,
                           bc_sample, dirs, config))
    return lane_items
def long_term_storage(remote_info, config_file):
    """Copy a finished run to the configured storage host.

    Loads the configuration from ``config_file``, binds a log handler for
    the duration of the transfer, logs what is about to be copied and then
    hands over to ``_copy_for_storage``.
    """
    config = load_config(config_file)
    handler = create_log_handler(config, log.name)
    with handler.applicationbound():
        store_host = config["store_host"]
        log.info("Copying run data over to remote storage: %s" % store_host)
        log.debug("The contents from AMQP for this dataset are:\n %s" %
                  remote_info)
        _copy_for_storage(remote_info, config)
def _remove_transferred_files(remote_info, config):
    """Delete the files a previous test transferred to the storage host.

    Runs ``rm -r`` on the storage host (``store_user@store_host``) for the
    directory named after the last path component of
    ``remote_info["directory"]`` under ``../transfer_data/copy_to``.
    """
    copy_to = os.path.realpath("../transfer_data/copy_to")
    store_login = "%s@%s" % (config["store_user"], config["store_host"])
    with fabric.settings(host_string=store_login):
        run_name = os.path.basename(remote_info["directory"])
        rm_str = "rm -r %s/%s" % (copy_to, run_name)
        log.debug(rm_str)
        fabric.run(rm_str)
def _copy_from_sequencer(remote_info, config):
    """Return the flowcell directory, copying it from the sequencer if needed.

    If ``remote_info`` already has an ``fc_dir`` entry, that path is checked
    for existence and returned unchanged. Otherwise the data is fetched with
    ``_remote_copy`` while fabric is pointed at the host string produced by
    ``_config_hosts(config)``, and the directory it reports is returned.

    Raises:
        AssertionError: if a supplied ``fc_dir`` does not exist.
    """
    # ``dict.has_key`` no longer exists on Python 3; the ``in`` operator is
    # equivalent and works on both Python 2 and 3.
    if "fc_dir" in remote_info:
        fc_dir = remote_info["fc_dir"]
        assert os.path.exists(fc_dir)
    else:
        log.debug("Remote host information: %s" % remote_info)
        c_host_str = _config_hosts(config)
        with fabric.settings(host_string=c_host_str):
            fc_dir = _remote_copy(remote_info, config)
    return fc_dir
def _copy_from_sequencer(remote_info, config):
    """Return the flowcell directory, copying it from the sequencer if needed.

    If ``remote_info`` already has an ``fc_dir`` entry, that path is checked
    for existence and returned unchanged. Otherwise ``remote_copy`` is run
    with fabric pointed at the host string from ``_config_hosts(config)``,
    using ``config["store_dir"]`` as the base directory and the optional
    ``config["transfer_protocol"]`` (``None`` when the key is absent).

    Raises:
        AssertionError: if a supplied ``fc_dir`` does not exist.
        KeyError: if a copy is needed and ``config`` has no ``store_dir``.
    """
    if "fc_dir" in remote_info:
        fc_dir = remote_info["fc_dir"]
        assert os.path.exists(fc_dir)
    else:
        log.debug("Remote host information: %s" % remote_info)
        c_host_str = _config_hosts(config)
        with fabric.settings(host_string=c_host_str):
            base_dir = config["store_dir"]
            # The protocol is optional; a missing key is passed on as None.
            protocol = config.get("transfer_protocol")
            fc_dir = remote_copy(remote_info, base_dir, protocol)
    return fc_dir
def process_lane(info, config, dirs):
    """Deliver the fastq files (and optionally analysis results) of one lane.

    Looks up the barcoded fastq files for the lane, creates the
    ``<lane>_<fc_date>_<fc_name>_barcode`` directory below
    ``config["data_delivery_dir"]`` and delivers every fastq file into it.
    Unless ``options.only_fastq`` is set, the lane's analysis results are
    delivered to ``config["data_delivery_dir"]`` as well.

    NOTE(review): ``options`` is not a parameter; it is presumably a
    module-level object holding parsed command line options - confirm.
    """
    sample_name = info.get("description", "")
    genome_build = info.get("genome_build", None)
    multiplex = info.get('multiplex', None)
    log.info("Processing sample: %s; lane %s; reference genome %s" %
             (sample_name, info["lane"], genome_build))
    if multiplex:
        log.debug("Sample %s is multiplexed as: %s" % (sample_name, multiplex))
    fq = get_barcoded_fastq_files(multiplex, info, dirs['fc_dir'],
                                  config['fc_name'], config['fc_date'])

    ## Move data along with fastq files
    fc_bc_dir = os.path.join(config['data_delivery_dir'],
                             "%s_%s_%s_barcode" % (info['lane'],
                                                   config['fc_date'],
                                                   config['fc_name']))
    _make_dir(fc_bc_dir, "fastq.txt barcode directory")
    if not options.only_fastq:
        data, fastqc = _get_analysis_results(config, dirs, info['lane'])
        _deliver_data(data, fastqc, config['data_delivery_dir'])

    # A plain loop: delivery is a side effect, so there is no reason to
    # collect the return values into a throwaway list.
    for fqpair in fq:
        for fq_src in fqpair:
            _deliver_fastq_file(fq_src, os.path.basename(fq_src), fc_bc_dir)
def make_lane_items(info, fc_date, fc_name, dirs, config):
    """Assemble the per-barcode work items for a multiplexed lane.

    The sample name is ``info["description"]``, prefixed with
    ``info["name"]`` and ``---`` when the ``include_short_name`` algorithm
    option is on (the default) and a name is present.

    Returns a list of tuples
    ``(fastq1, fastq2, genome_build, lane_name, sample_name, dirs, config)``
    built from ``get_multiplex_items``; the list is empty when the lane has
    no multiplex information.
    """
    short_name = info.get("name", "")
    sample_name = info.get("description", "")
    use_short_name = config["algorithm"].get("include_short_name", True)
    if use_short_name and short_name:
        sample_name = "%s---%s" % (short_name, sample_name)

    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", "")
    log.info("Processing sample: %s; lane %s; reference genome %s; "
             "researcher %s; analysis method %s" %
             (sample_name, info["lane"], genome_build,
              info.get("researcher", ""), info.get("analysis", "")))

    if not multiplex:
        # TODO: Not multiplex: what to do?
        return []

    log.debug("Sample %s is multiplexed as: %s" % (sample_name, multiplex))
    multiplex_items = get_multiplex_items(multiplex, info['lane'],
                                          dirs['fc_dir'], fc_name, fc_date)
    return [(fq1, fq2, genome_build, bc_lane_name, bc_sample, dirs, config)
            for fq1, fq2, bc_lane_name, bc_sample in multiplex_items]
def remote_copy(remote_info, base_dir, protocol):
    """Securely copy files between servers.

    All commands are issued through ``fabric.run``, so they execute on the
    host fabric is currently pointed at and pull from
    ``remote_info["user"]@remote_info["hostname"]``.

    Args:
        remote_info: mapping with ``user``, ``hostname``, ``directory`` and
            ``to_copy`` (the entries below ``directory`` to transfer).
        base_dir: destination directory; created with ``mkdir`` if missing.
        protocol: ``"scp"`` or ``None`` copies each missing entry with
            ``scp -r`` into ``base_dir/<basename of directory>/``;
            ``"rsync"`` issues one rsync call with include patterns for
            every entry; ``"rdiff-backup"`` issues one rdiff-backup call.
            Any other value performs no copy.

    Returns:
        ``base_dir``, the directory the data was copied into.
    """
    fc_dir = base_dir
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)

    # Identity comparison is the correct test for the None singleton.
    if protocol == "scp" or protocol is None:
        for fcopy in remote_info["to_copy"]:
            target_loc = os.path.join(fc_dir,
                                      os.path.basename(remote_info['directory']),
                                      fcopy)
            # Entries already present at the destination are skipped.
            if not fabric_files.exists(target_loc):
                target_dir = os.path.dirname(target_loc)
                if not fabric_files.exists(target_dir):
                    fabric.run("mkdir -p %s" % target_dir)
                cl = ["scp", "-r", "%s@%s:%s/%s" % (remote_info["user"],
                                                   remote_info["hostname"],
                                                   remote_info["directory"],
                                                   fcopy),
                      target_loc]
                log.debug(cl)
                fabric.run(" ".join(cl))
    elif protocol == "rsync":
        include = []
        for fcopy in remote_info['to_copy']:
            # By including both these patterns we get the entire directory
            # if a directory is given, or a single file if a single file is
            # given.
            include.append("--include='%s**/*'" % (fcopy,))
            include.append("--include='%s'" % (fcopy,))
        cl = ["rsync", "--checksum", "--archive",
              "--compress", "--partial", "--progress",
              "--prune-empty-dirs", "--verbose", "--include='*/'",
              " ".join(include), "--exclude='*'",
              "%s@%s:%s" % (remote_info["user"], remote_info["hostname"],
                            remote_info["directory"]),
              fc_dir]
        log.debug(cl)
        fabric.run(" ".join(cl))
    # Note: rdiff-backup doesn't have the ability to resume a partial transfer,
    # and will instead transfer the backup from the beginning if it detects a
    # partial transfer.
    elif protocol == "rdiff-backup":
        include = []
        for fcopy in remote_info['to_copy']:
            include.append("--include %s/%s" %
                           (remote_info["directory"], fcopy))
        cl = ["rdiff-backup", " ".join(include), "--exclude '**'",
              "%s@%s::%s" % (remote_info["user"], remote_info["hostname"],
                             remote_info["directory"]),
              fc_dir]
        log.debug(cl)
        fabric.run(" ".join(cl))
    return fc_dir
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.

    ``lane_items`` must be non-empty: its first entry supplies the lane
    number and is used to locate the lane's fastq files. The files are split
    with ``split_by_barcode``; its result is used as a mapping from barcode
    id to a ``(fastq1, fastq2)`` pair.

    Returns a list of tuples
    ``(fastq1, fastq2, item, lane_name, lane_description, dirs, config)``,
    one per item whose ``barcode_id`` is present in the split result.
    """
    lane_name = "%s_%s_%s" % (lane_items[0]["lane"], fc_date, fc_name)
    log.debug("Demulitplexing %s" % lane_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], lane_items[0],
                                               fc_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)
    out = []
    for item in lane_items:
        # NOTE(review): ``config`` is rebound on every iteration, so each
        # item's update starts from the previous item's result - confirm
        # that this carry-over is intended.
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        # ``dict.has_key`` no longer exists on Python 3; ``in`` is equivalent.
        if item["barcode_id"] in bc_files:
            fastq1, fastq2 = bc_files[item["barcode_id"]]
            cur_lane_name = lane_name
            cur_lane_desc = item["description"]
            if item.get("name", ""):
                cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)
            if item["barcode_id"] is not None:
                cur_lane_name += "_%s" % (item["barcode_id"])
            out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc,
                        dirs, config))
    return out
def _copy_for_storage(remote_info, config):
    """Securely copy files from remote directory to the storage server.

    This requires ssh public keys to be set up so that no password entry
    is necessary. Fabric is used to run the copy commands on the storage
    server (``store_user@store_host``): each entry of
    ``remote_info["to_copy"]`` that is not yet present below
    ``store_dir/<basename of directory>`` is fetched with ``scp -r``.
    """
    log.info("Copying run data over to remote storage: %s" %
             config["store_host"])
    log.debug("The contents from AMQP for this dataset are:\n %s" %
              remote_info)

    store_root = config["store_dir"]
    # Set on fabric's global environment, so it stays in effect afterwards.
    fabric.env.host_string = "%s@%s" % (config["store_user"],
                                        config["store_host"])

    run_name = os.path.basename(remote_info['directory'])
    fc_dir = os.path.join(store_root, run_name)
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)

    for entry in remote_info['to_copy']:
        destination = os.path.join(fc_dir, entry)
        if fabric_files.exists(destination):
            # Already transferred; nothing to do for this entry.
            continue
        parent_dir = os.path.dirname(destination)
        if not fabric_files.exists(parent_dir):
            fabric.run("mkdir -p %s" % parent_dir)
        source = "%s@%s:%s/%s" % (remote_info["user"],
                                  remote_info["hostname"],
                                  remote_info["directory"], entry)
        fabric.run("scp -r %s %s" % (source, destination))
def remote_copy(remote_info, base_dir, protocol):
    """Securely copy files between servers.

    All commands are issued through ``fabric.run``, so they execute on the
    host fabric is currently pointed at and pull from
    ``remote_info["user"]@remote_info["hostname"]``.

    Args:
        remote_info: mapping with ``user``, ``hostname``, ``directory`` and
            ``to_copy`` (the entries below ``directory`` to transfer).
        base_dir: directory below which ``<basename of directory>`` is
            created (with ``mkdir``) to receive the data.
        protocol: ``"scp"`` or ``None`` copies each missing entry with
            ``scp -r``; ``"rsync"`` runs one rsync call per entry;
            ``"rdiff-backup"`` issues a single rdiff-backup call.
            Any other value performs no copy.

    Returns:
        The destination directory ``base_dir/<basename of directory>``.
    """
    fc_dir = os.path.join(base_dir, os.path.basename(remote_info['directory']))
    if not fabric_files.exists(fc_dir):
        fabric.run("mkdir %s" % fc_dir)

    # Identity comparison is the correct test for the None singleton.
    if protocol == "scp" or protocol is None:
        for fcopy in remote_info['to_copy']:
            target_loc = os.path.join(fc_dir, fcopy)
            # Entries already present at the destination are skipped.
            if not fabric_files.exists(target_loc):
                target_dir = os.path.dirname(target_loc)
                if not fabric_files.exists(target_dir):
                    fabric.run("mkdir -p %s" % target_dir)
                cl = ["scp", "-r", "%s@%s:%s/%s" % (remote_info["user"],
                                                   remote_info["hostname"],
                                                   remote_info["directory"],
                                                   fcopy),
                      target_loc]
                log.debug(cl)
                fabric.run(" ".join(cl))
    elif protocol == "rsync":
        for fcopy in remote_info['to_copy']:
            target_loc = os.path.join(fc_dir, fcopy)
            target_dir = os.path.dirname(target_loc)
            if not fabric_files.exists(target_dir):
                fabric.run("mkdir -p %s" % target_dir)
            # NOTE(review): os.path.isdir inspects the filesystem of the
            # machine running this code, while the path names a location on
            # remote_info["hostname"] - confirm both see the same path.
            # ``endswith`` is used so an empty entry cannot raise IndexError.
            if os.path.isdir("%s/%s" % (remote_info["directory"], fcopy)) \
               and not fcopy.endswith("/"):
                fcopy += "/"
            # Option -P --append should enable resuming progress on
            # partial transfers.
            cl = ["rsync", "-craz", "-P", "--append",
                  "%s@%s:%s/%s" % (remote_info["user"],
                                   remote_info["hostname"],
                                   remote_info["directory"], fcopy),
                  fc_dir]
            log.debug(cl)
            fabric.run(" ".join(cl))
    # Note: rdiff-backup doesn't have the ability to resume a partial transfer,
    # and will instead transfer the backup from the beginning if it detects a
    # partial transfer.
    elif protocol == "rdiff-backup":
        include = []
        for fcopy in remote_info['to_copy']:
            include.append("--include %s/%s" %
                           (remote_info["directory"], fcopy))
        cl = ["rdiff-backup", " ".join(include), "--exclude '**'",
              "%s@%s::%s" % (remote_info["user"], remote_info["hostname"],
                             remote_info["directory"]),
              fc_dir]
        log.debug(cl)
        fabric.run(" ".join(cl))
    return fc_dir