Exemple #1
0
def rsem(inputs, outputs):
    """Build the RSEM command for one sample and hand it to the executor.

    :params inputs: a tuple of 1 or 2 fastq.gz files, e.g.
    ('/path/to/rsem_output/homo_sapiens/GSE50599/GSM1224499/SRR968078_1.fastq.gz',
     '/path/to/rsem_output/homo_sapiens/GSE50599/GSM1224499/SRR968078_2.fastq.gz')
    Entries ending with '.sra2fastq.COMPLETE' are dropped before use.
    :params outputs: a sequence whose last element is passed to the executor
    as ``flag_file``.
    :raises ValueError: if no input remains after dropping the flag files, or
    if the directory of the first input does not match ``PATH_RE``.
    """
    fastq_files = [_ for _ in inputs
                   if not _.endswith('.sra2fastq.COMPLETE')]
    if not fastq_files:
        raise ValueError(
            'no fastq.gz input left after filtering: {0!r}'.format(inputs))

    # this is equivalent to the sample.outdir or GSM dir
    outdir = os.path.dirname(fastq_files[0])

    # the names of parameters are the same as that in gen_qsub_script, but
    # their values are more or less different, so better keep them separate
    fastq_gz_input = gen_fastq_gz_input(fastq_files)

    res = re.search(PATH_RE, outdir)
    if res is None:
        # fail with a readable message instead of an AttributeError on None
        raise ValueError(
            'cannot parse species/GSM from output dir: {0!r}'.format(outdir))
    species = res.group('species')
    gsm = res.group('GSM')

    reference_name = config['LOCAL_REFERENCE_NAMES'][species]
    sample_name = '{0}/{1}'.format(outdir, gsm)
    n_jobs = options.j_rsem

    flag_file = outputs[-1]
    cmd = config['CMD_RSEM'].format(n_jobs=n_jobs,
                                    fastq_gz_input=fastq_gz_input,
                                    reference_name=reference_name,
                                    sample_name=sample_name,
                                    output_dir=outdir)
    misc.execute_log_stdout_stderr(cmd,
                                   flag_file=flag_file,
                                   debug=options.debug)
Exemple #2
0
def rsem(inputs, outputs):
    """Format the configured RSEM command for a sample and execute it.

    :params inputs: a tuple of 1 or 2 fastq.gz files, e.g.
    ('/path/to/rsem_output/homo_sapiens/GSE50599/GSM1224499/SRR968078_1.fastq.gz',
     '/path/to/rsem_output/homo_sapiens/GSE50599/GSM1224499/SRR968078_2.fastq.gz')
    Any entry ending with '.sra2fastq.COMPLETE' is ignored.
    :params outputs: a sequence; its last element is used as the flag file
    given to ``misc.execute_log_stdout_stderr``.
    :raises ValueError: when nothing but flag files was passed in, or when
    the directory of the first input is not matched by ``PATH_RE``.
    """
    fastq_files = [_ for _ in inputs if not _.endswith('.sra2fastq.COMPLETE')]
    if not fastq_files:
        raise ValueError(
            'no fastq.gz input left after filtering: {0!r}'.format(inputs))

    # this is equivalent to the sample.outdir or GSM dir
    outdir = os.path.dirname(fastq_files[0])

    # the names of parameters are the same as that in gen_qsub_script, but
    # their values are more or less different, so better keep them separate
    fastq_gz_input = gen_fastq_gz_input(fastq_files)

    res = re.search(PATH_RE, outdir)
    if res is None:
        # re.search returns None on no match; report it explicitly
        raise ValueError(
            'cannot parse species/GSM from output dir: {0!r}'.format(outdir))
    species = res.group('species')
    gsm = res.group('GSM')

    reference_name = config['LOCAL_REFERENCE_NAMES'][species]
    sample_name = '{0}/{1}'.format(outdir, gsm)
    n_jobs = options.j_rsem

    flag_file = outputs[-1]
    cmd = config['CMD_RSEM'].format(
        n_jobs=n_jobs,
        fastq_gz_input=fastq_gz_input,
        reference_name=reference_name,
        sample_name=sample_name,
        output_dir=outdir)
    misc.execute_log_stdout_stderr(cmd, flag_file=flag_file, debug=options.debug)
Exemple #3
0
def sra2fastq(inputs, outputs):
    """Run the command templated in config['CMD_FASTQ_DUMP'] on one sra file.

    ``inputs`` must hold exactly two items: the sra file path and the flag
    file from the previous task (unused). The last item of ``outputs`` is
    passed to the executor as the flag file.

    For the meaning of [SED]RR, see
    http://www.ncbi.nlm.nih.gov/books/NBK56913/#search.the_entrez_sra_search_response_pa

    S = NCBI-SRA, E = EMBL-SRA, D = DDBJ-SRA
    SRR: SRA run accession
    ERR: ERA run accession
    DRR: DRA run accession
    """
    # the second item is the previous task's flag file and is not needed
    sra_path, _prev_flag = inputs
    completion_flag = outputs[-1]

    # the output dir sits three directory levels above the sra file
    target_dir = sra_path
    for _ in range(3):
        target_dir = os.path.dirname(target_dir)

    dump_cmd = config['CMD_FASTQ_DUMP'].format(
        output_dir=target_dir,
        accession=sra_path)
    misc.execute_log_stdout_stderr(
        dump_cmd,
        flag_file=completion_flag,
        debug=options.debug)
Exemple #4
0
def sra2fastq(inputs, outputs):
    """Format config['CMD_FASTQ_DUMP'] for the given sra file and execute it.

    ``inputs`` is unpacked as (sra file, flag file of the previous task);
    only the sra file is used. ``outputs[-1]`` becomes the flag file given
    to ``misc.execute_log_stdout_stderr``.

    Accession prefixes ([SED]RR) are described at
    http://www.ncbi.nlm.nih.gov/books/NBK56913/#search.the_entrez_sra_search_response_pa

    S = NCBI-SRA, E = EMBL-SRA, D = DDBJ-SRA
    SRR: SRA run accession
    ERR: ERA run accession
    DRR: DRA run accession
    """
    sra_file, _ignored_flag = inputs  # flag file from previous task is unused
    done_flag = outputs[-1]

    # walk up: parent, grandparent, then great-grandparent of the sra file
    parent_dir = os.path.dirname(sra_file)
    grandparent_dir = os.path.dirname(parent_dir)
    dump_outdir = os.path.dirname(grandparent_dir)

    template = config['CMD_FASTQ_DUMP']
    command = template.format(output_dir=dump_outdir, accession=sra_file)
    misc.execute_log_stdout_stderr(command, flag_file=done_flag, debug=options.debug)
Exemple #5
0
def main():
    """Pick samples that fit the remote disk-usage rule and transfer them.

    Steps, in order:

    1. Parse the command-line options and load the config file.
    2. Generate all samples from the soft files / isamp and initialise their
       local output directories.
    3. Compute how much remote space may be used.
    4. Read the record of already transferred GSMs and select the GSMs to
       transfer this round.
    5. Write the transfer script, make it executable for the owner, run it.
    6. If the script exits with 0, append the transferred GSMs to the record.

    Returns None; returns early when no GSM fits the disk usage rule.
    """
    options = parse_args_for_rp_transfer()
    config = misc.get_config(options.config_file)

    # r_: means relevant to remote host, l_: to local host
    l_top_outdir = config['LOCAL_TOP_OUTDIR']
    r_top_outdir = config['REMOTE_TOP_OUTDIR']

    G = PPR.gen_all_samples_from_soft_and_isamp
    samples = G(options.soft_files, options.isamp, config)
    PPR.init_sample_outdirs(samples, l_top_outdir)

    r_host, r_username = config['REMOTE_HOST'], config['USERNAME']
    fastq2rsem_ratio = config['FASTQ2RSEM_RATIO']
    r_cmd_df = config['REMOTE_CMD_DF']
    r_min_free = misc.ugly_usage(config['REMOTE_MIN_FREE'])
    r_max_usage = misc.ugly_usage(config['REMOTE_MAX_USAGE'])
    r_free_to_use = calc_remote_free_space_to_use(
        r_host, r_username, r_top_outdir, l_top_outdir,
        r_cmd_df, r_max_usage, r_min_free, fastq2rsem_ratio)

    # tf: transfer/transferred
    tf_record = os.path.join(l_top_outdir, 'transferred_GSMs.txt')
    tf_gsms = get_gsms_transferred(tf_record)
    # materialise as a list: on Python 3 map() returns a one-shot iterator,
    # which would be exhausted after the first pass over it
    tf_gsms_bn = [os.path.basename(_) for _ in tf_gsms]

    logger.info('Selecting samples to transfer based their estimated remote usage')
    gsms_to_tf = select_gsms_to_transfer(
        samples, tf_gsms_bn, l_top_outdir, r_free_to_use, fastq2rsem_ratio)

    if not gsms_to_tf:
        logger.info('Cannot find a GSM that fits the current disk usage rule')
        return

    logger.info('GSMs to transfer:')
    for k, gsm in enumerate(gsms_to_tf, 1):
        # !s converts the sample to str first; a non-empty format spec on a
        # plain object raises TypeError on Python 3.4+
        logger.info('\t{0:3d} {1!s:30s} {2}'.format(k, gsm, gsm.outdir))

    gsms_to_tf_ids = [os.path.relpath(_.outdir, l_top_outdir)
                      for _ in gsms_to_tf]
    tf_script = write_transfer_sh(
        gsms_to_tf_ids, options.rsync_template, l_top_outdir,
        r_username, r_host, r_top_outdir)

    # owner-only read/write/execute so the generated script can be run
    os.chmod(tf_script, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    rcode = misc.execute_log_stdout_stderr(tf_script)

    if rcode == 0:
        # different from processing in rsempipeline.py, where the completion
        # is marked by .COMPLETE flags; here it is marked by writing the
        # completed GSMs to the transfer record
        append_transfer_record(gsms_to_tf_ids, tf_record)