def rsem(inputs, outputs):
    """Format the config['CMD_RSEM'] command for one sample and execute it.

    :params inputs: a tuple of 1 or 2 fastq.gz files, e.g.
        ('/path/to/rsem_output/homo_sapiens/GSE50599/GSM1224499/SRR968078_1.fastq.gz',
         '/path/to/rsem_output/homo_sapiens/GSE50599/GSM1224499/SRR968078_2.fastq.gz')
        Entries ending with '.sra2fastq.COMPLETE' are dropped before use.
    :params outputs: a sequence whose last element is the flag file passed
        on to misc.execute_log_stdout_stderr.
    :raises ValueError: if nothing is left in inputs after dropping the
        '.sra2fastq.COMPLETE' entries, or if the directory of the first
        remaining input does not match PATH_RE.
    """
    fastq_gzs = [_ for _ in inputs if not _.endswith('.sra2fastq.COMPLETE')]
    if not fastq_gzs:
        raise ValueError(
            'no fastq.gz inputs left after dropping flag files: '
            '{0!r}'.format(inputs))

    # this is equivalent to the sample.outdir or GSM dir
    outdir = os.path.dirname(fastq_gzs[0])

    # the names of parameters are the same as that in gen_qsub_script, but
    # their values are more or less different, so better keep them separate
    fastq_gz_input = gen_fastq_gz_input(fastq_gzs)

    res = re.search(PATH_RE, outdir)
    if res is None:
        # fail with the offending path instead of an AttributeError on None
        raise ValueError(
            'output directory does not match PATH_RE: {0!r}'.format(outdir))
    species = res.group('species')
    gsm = res.group('GSM')

    reference_name = config['LOCAL_REFERENCE_NAMES'][species]
    sample_name = '{0}/{1}'.format(outdir, gsm)
    n_jobs = options.j_rsem

    # the flag file is always the last declared output
    flag_file = outputs[-1]
    cmd = config['CMD_RSEM'].format(
        n_jobs=n_jobs,
        fastq_gz_input=fastq_gz_input,
        reference_name=reference_name,
        sample_name=sample_name,
        output_dir=outdir)
    misc.execute_log_stdout_stderr(cmd, flag_file=flag_file,
                                   debug=options.debug)
def rsem(inputs, outputs):
    """Format the config['CMD_RSEM'] command for one sample and execute it.

    :params inputs: a tuple of 1 or 2 fastq.gz files, e.g.
        ('/path/to/rsem_output/homo_sapiens/GSE50599/GSM1224499/SRR968078_1.fastq.gz',
         '/path/to/rsem_output/homo_sapiens/GSE50599/GSM1224499/SRR968078_2.fastq.gz')
        Entries ending with '.sra2fastq.COMPLETE' are dropped before use.
    :params outputs: a sequence whose last element is the flag file passed
        on to misc.execute_log_stdout_stderr.
    :raises ValueError: if nothing is left in inputs after dropping the
        '.sra2fastq.COMPLETE' entries, or if the directory of the first
        remaining input does not match PATH_RE.
    """
    fastq_gzs = [_ for _ in inputs if not _.endswith('.sra2fastq.COMPLETE')]
    if not fastq_gzs:
        raise ValueError(
            'no fastq.gz inputs left after dropping flag files: '
            '{0!r}'.format(inputs))

    # this is equivalent to the sample.outdir or GSM dir
    outdir = os.path.dirname(fastq_gzs[0])

    # the names of parameters are the same as that in gen_qsub_script, but
    # their values are more or less different, so better keep them separate
    fastq_gz_input = gen_fastq_gz_input(fastq_gzs)

    res = re.search(PATH_RE, outdir)
    if res is None:
        # fail with the offending path instead of an AttributeError on None
        raise ValueError(
            'output directory does not match PATH_RE: {0!r}'.format(outdir))
    species = res.group('species')
    gsm = res.group('GSM')

    reference_name = config['LOCAL_REFERENCE_NAMES'][species]
    sample_name = '{0}/{1}'.format(outdir, gsm)
    n_jobs = options.j_rsem

    # the flag file is always the last declared output
    flag_file = outputs[-1]
    cmd = config['CMD_RSEM'].format(
        n_jobs=n_jobs,
        fastq_gz_input=fastq_gz_input,
        reference_name=reference_name,
        sample_name=sample_name,
        output_dir=outdir)
    misc.execute_log_stdout_stderr(cmd, flag_file=flag_file,
                                   debug=options.debug)
def sra2fastq(inputs, outputs):
    """Format the config['CMD_FASTQ_DUMP'] command for one sra file and run it.

    For the meaning of [SED]RR, see
    http://www.ncbi.nlm.nih.gov/books/NBK56913/#search.the_entrez_sra_search_response_pa

    S = NCBI-SRA, E = EMBL-SRA, D = DDBJ-SRA
    SRR: SRA run accession
    ERR: ERA run accession
    DRR: DRA run accession

    :params inputs: a pair of (sra file, flag file from the previous task);
        the flag file is ignored.
    :params outputs: a sequence whose last element is the flag file passed
        on to misc.execute_log_stdout_stderr.
    """
    # the second element is the flag file from the previous task; not needed
    sra_path, _prev_flag = inputs
    done_flag = outputs[-1]

    # the output directory sits three directory levels above the sra file
    output_dir = sra_path
    for _ in range(3):
        output_dir = os.path.dirname(output_dir)

    fastq_dump_cmd = config['CMD_FASTQ_DUMP'].format(
        output_dir=output_dir, accession=sra_path)
    misc.execute_log_stdout_stderr(
        fastq_dump_cmd, flag_file=done_flag, debug=options.debug)
def main():
    """Select GSMs that fit the remote disk-usage rule and transfer them.

    Steps, in order:

    1. parse the command-line options and load the config file;
    2. generate the samples and initialize their local output directories;
    3. calculate how much remote free space may be used;
    4. select the GSMs to transfer, given those already listed in the
       transfer record (transferred_GSMs.txt under the local top outdir);
    5. write the transfer script, make it readable/writable/executable by
       the user, and execute it;
    6. if the script returns 0, append the transferred GSMs to the record.

    Returns None. Returns early, after logging, when no GSM is selected.
    """
    options = parse_args_for_rp_transfer()
    config = misc.get_config(options.config_file)

    # r_: means relevant to remote host, l_: to local host
    l_top_outdir = config['LOCAL_TOP_OUTDIR']
    r_top_outdir = config['REMOTE_TOP_OUTDIR']

    samples = PPR.gen_all_samples_from_soft_and_isamp(
        options.soft_files, options.isamp, config)
    PPR.init_sample_outdirs(samples, l_top_outdir)

    r_host, r_username = config['REMOTE_HOST'], config['USERNAME']
    fastq2rsem_ratio = config['FASTQ2RSEM_RATIO']
    r_cmd_df = config['REMOTE_CMD_DF']
    r_min_free = misc.ugly_usage(config['REMOTE_MIN_FREE'])
    r_max_usage = misc.ugly_usage(config['REMOTE_MAX_USAGE'])
    r_free_to_use = calc_remote_free_space_to_use(
        r_host, r_username, r_top_outdir, l_top_outdir,
        r_cmd_df, r_max_usage, r_min_free, fastq2rsem_ratio)

    # tf: transfer/transferred
    tf_record = os.path.join(l_top_outdir, 'transferred_GSMs.txt')
    tf_gsms = get_gsms_transferred(tf_record)
    # build a real list: on Python 3, map() returns a one-shot iterator that
    # would be exhausted after the first membership test or iteration
    tf_gsms_bn = [os.path.basename(_) for _ in tf_gsms]

    logger.info('Selecting samples to transfer based their estimated remote usage')
    gsms_to_tf = select_gsms_to_transfer(
        samples, tf_gsms_bn, l_top_outdir, r_free_to_use, fastq2rsem_ratio)

    if not gsms_to_tf:
        logger.info('Cannot find a GSM that fits the current disk usage rule')
        return

    logger.info('GSMs to transfer:')
    for k, gsm in enumerate(gsms_to_tf, 1):
        logger.info('\t{0:3d} {1:30s} {2}'.format(k, gsm, gsm.outdir))

    # GSMs are identified by their outdir relative to the local top outdir
    gsms_to_tf_ids = [os.path.relpath(_.outdir, l_top_outdir)
                      for _ in gsms_to_tf]

    tf_script = write_transfer_sh(
        gsms_to_tf_ids, options.rsync_template, l_top_outdir,
        r_username, r_host, r_top_outdir)

    # the script must be executable by the user before it is run
    os.chmod(tf_script, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    rcode = misc.execute_log_stdout_stderr(tf_script)

    if rcode == 0:
        # different from processing in rsempipeline.py, where the completion
        # is marked by .COMPLETE flags; here it is recorded by writing the
        # completed GSMs to the transfer record
        append_transfer_record(gsms_to_tf_ids, tf_record)