Code Example #1
def main(logger):
    # get the argument parser and parse args
    parser = get_parser()
    args = parser.parse_args()

    # use the logger
    logger.info('Attempting to echo the message...')

    # run a subprocess and log the attempt
    log_command(logger, 'echo {}'.format(args.message), shell=True)
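
Every example on this page routes its shell commands through a log_command helper (imported directly or as ut_log.log_command) from the czbiohub utilities codebase. The helper itself is not shown here; the sketch below is only inferred from the call sites and every detail of it is an assumption. It logs the command, runs it via subprocess with whatever keyword arguments the caller passes through, forwards any captured output to the logger, and returns the process return code, which is truthy exactly when the command failed. Note that a few of the examples (e.g. #5 and #9) instead expect the helper to raise subprocess.CalledProcessError on failure, so the real implementations evidently differ between versions of the utilities.

import subprocess


def log_command(logger, command, **kwargs):
    """Hypothetical stand-in for ut_log.log_command (not the real helper).

    Logs the command, runs it, forwards any captured output to the
    logger, and returns the return code so that callers can write
    ``failed = log_command(logger, command, ...)``.
    """
    # the examples pass commands both as plain strings and as token
    # lists; with shell=True a list has to be joined into one string
    if kwargs.get("shell") and isinstance(command, (list, tuple)):
        command = " ".join(command)

    logger.info(command)

    proc = subprocess.run(command, **kwargs)

    # when the caller requested stdout=subprocess.PIPE, log the output
    if proc.stdout:
        for line in proc.stdout.splitlines():
            logger.debug(line)

    return proc.returncode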
Code Example #2
def run_htseq(dest_dir, sjdb_gtf, id_attr, logger):
    command = [
        HTSEQ,
        "-r",
        "name",
        "-s",
        "no",
        "-f",
        "bam",
        f"--idattr={id_attr}",
        "-m",
        "intersection-nonempty",
        os.path.join(dest_dir, "results", "Pass1",
                     "Aligned.out.sorted-byname.bam"),
        sjdb_gtf,
        ">",
        "htseq-count.txt",
    ]
    failed = ut_log.log_command(
        logger,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results"),
    )

    return failed
Code Example #3
File: velocyto.py Project: danledinh/czb_utils
def run_sample(
    sample_key,
    mask_path,
    gtf_path,
    s3_input_bucket,
    s3_output_bucket,
    s3_output_prefix,
    run_dir,
    logger,
):

    t_config = TransferConfig(num_download_attempts=25)

    sample_name = os.path.basename(sample_key)
    sample_id = sample_name.split(".")[0]  # this is brittle!
    local_sample = os.path.join(run_dir, "input", sample_name)

    s3c.download_file(Bucket=s3_input_bucket,
                      Key=sample_key,
                      Filename=local_sample,
                      Config=t_config)

    velocyto_command = [
        "velocyto",
        "run-smartseq2",
        "-o",
        run_dir,
        "-m",
        mask_path,
        "-e",
        sample_id,
        local_sample,
        gtf_path,
    ]

    if ut_log.log_command(
            logger,
            velocyto_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=True,
    ):
        logger.info(f"velocyto failed on {sample_id}")
        os.remove(local_sample)
        return

    output_file = os.path.join(run_dir, f"{sample_id}.loom")

    logger.info("Uploading {}".format(output_file))
    time.sleep(10)
    s3c.upload_file(
        Filename=output_file,
        Bucket=s3_output_bucket,
        Key=os.path.join(s3_output_prefix, f"{sample_id}.loom"),
        Config=t_config,
    )

    os.remove(local_sample)
    os.remove(output_file)
Code Example #4
def run_htseq(dest_dir, sjdb_gtf, id_attr, logger):
    """ Run alignment job with htseq.

        dest_dir - Path local to the machine on EC2 under which alignment results
                   are stored before being uploaded to S3. Child path of run_dir/sample_name
        sjdb_gtf - Path of reference genome .gtf files used to detect splice junctions
        id_attr - Determines the naming format in the count file for different genomes
        logger - Logger object that exposes the interface the code directly uses

        Return FAILED, a boolean indicating whether the alignment run failed
    """

    htseq_command = [
        HTSEQ,
        "-r",
        "name",
        "-s",
        "no",
        "-f",
        "bam",
        f"--idattr={id_attr}",
        "-m",
        "intersection-nonempty",
        os.path.join(dest_dir, "results", "Pass1",
                     "Aligned.out.sorted-byname.bam"),
        sjdb_gtf,
        ">",
        "htseq-count.txt",
    ]
    failed = ut_log.log_command(
        logger,
        htseq_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results"),
    )

    return failed
Code Example #5
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        args.root_dir = os.path.join(args.root_dir,
                                     os.environ['AWS_BATCH_JOB_ID'])

    # local directories
    sample_id = os.path.basename(args.s3_input_dir)
    result_path = os.path.join(args.root_dir, 'data', 'hca', sample_id)
    fastq_path = os.path.join(result_path, 'fastqs')
    os.makedirs(fastq_path)

    genome_base_dir = os.path.join(args.root_dir, "genome", "cellranger")
    os.makedirs(genome_base_dir)

    if args.taxon == 'homo':
        genome_name = 'HG38-PLUS'
    elif args.taxon == 'mus':
        genome_name = 'MM10-PLUS'
    else:
        raise ValueError("unknown taxon {}".format(args.taxon))

    # files that should be uploaded outside of the massive tgz
    # path should be relative to the run folder
    files_to_upload = [
        'outs/raw_gene_bc_matrices_h5.h5',
        'outs/raw_gene_bc_matrices/{}/genes.tsv'.format(genome_name),
        'outs/raw_gene_bc_matrices/{}/barcodes.tsv'.format(genome_name),
        'outs/raw_gene_bc_matrices/{}/matrix.mtx'.format(genome_name),
        'outs/web_summary.html', 'outs/metrics_summary.csv'
    ]

    genome_tar_source = os.path.join('s3://czi-hca/ref-genome/cellranger/',
                                     genome_name + '.tgz')
    genome_dir = os.path.join(genome_base_dir, genome_name)

    # download the ref genome data
    command = [
        'aws', 's3', 'cp', '--quiet', genome_tar_source, genome_base_dir
    ]
    log_command(logger, command, shell=True)

    genome_tar_file = os.path.basename(genome_tar_source)
    logger.debug('Extracting {}'.format(genome_tar_file))
    with tarfile.open(os.path.join(genome_base_dir, genome_tar_file)) as tf:
        tf.extractall(path=genome_base_dir)

    sys.stdout.flush()

    # download the fastq files
    command = [
        'aws', 's3', 'cp', '--no-progress', '--recursive',
        '--force-glacier-transfer' if args.glacier else '', args.s3_input_dir,
        fastq_path
    ]
    log_command(logger, command, shell=True)

    # Run cellranger
    os.chdir(result_path)
    command = [
        CELLRANGER, 'count', '--localmem=240', '--nosecondary', '--disable-ui',
        '--expect-cells={}'.format(args.cell_count),
        '--id={}'.format(sample_id), '--fastqs={}'.format(fastq_path),
        '--transcriptome={}'.format(genome_dir)
    ]
    log_command(logger,
                command,
                shell=True,
                stderr=subprocess.STDOUT,
                universal_newlines=True)

    # Move results (web summary, cell-gene table, tarball) back to S3
    for file_name in files_to_upload:
        command = [
            'aws', 's3', 'cp', '--quiet',
            os.path.join(result_path, sample_id, file_name),
            '{}/'.format(args.s3_output_dir)
        ]
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying cp {}".format(file_name))
        else:
            raise RuntimeError("couldn't sync {}".format(file_name))

    command = [
        'tar', 'cvzf', '{}.tgz'.format(os.path.join(result_path, sample_id)),
        sample_id
    ]
    log_command(logger, command, shell=True)

    command = [
        'aws', 's3', 'cp', '--quiet',
        '{}.tgz'.format(os.path.join(result_path, sample_id)),
        '{}/'.format(args.s3_output_dir)
    ]
    for i in range(S3_RETRY):
        try:
            log_command(logger, command, shell=True)
            break
        except subprocess.CalledProcessError:
            logger.info("retrying cp {}.tgz".format(sample_id))
    else:
        raise RuntimeError("couldn't sync {}.tgz".format(sample_id))
Code Example #6
def main(logger):
    """ Download reference genome, run alignment jobs, and upload results to S3.

        logger - Logger object that exposes the interface the code directly uses
    """

    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        root_dir = os.path.join("/mnt", os.environ["AWS_BATCH_JOB_ID"])
    else:
        root_dir = "/mnt"

    # local directories
    if args.s3_input_path.endswith("/"):
        args.s3_input_path = args.s3_input_path[:-1]

    run_dir = os.path.join(root_dir, "data")
    os.makedirs(run_dir)

    # check if the input genome and region are valid
    if args.taxon in reference_genomes:
        if args.taxon in deprecated:
            logger.warn(
                f"The name '{args.taxon}' will be removed in the future,"
                f" start using '{deprecated[args.taxon]}'")

        genome_name = reference_genomes[args.taxon]
    else:
        raise ValueError(f"unknown taxon {args.taxon}")

    if args.taxon == "gencode.vM19" or args.taxon == "gencode.vM19.ERCC":
        id_attr = "gene_name"
    else:
        id_attr = "gene_id"

    genome_dir = os.path.join(root_dir, "genome", "STAR", genome_name)
    ref_genome_star_file = f"STAR/{genome_name}.tgz"
    sjdb_gtf = os.path.join(root_dir, f"{genome_name}.gtf")

    if args.region != "west" and genome_name not in ("HG38-PLUS", "MM10-PLUS"):
        raise ValueError(f"you must use --region west for {genome_name}")

    if args.region == "east":
        ref_genome_star_file = os.path.join("ref-genome", ref_genome_star_file)

    s3_input_bucket, s3_input_prefix = s3u.s3_bucket_and_key(
        args.s3_input_path)

    logger.info(
        f"""Run Info: partition {args.partition_id} out of {args.num_partitions}
                   genome_dir:\t{genome_dir}
         ref_genome_star_file:\t{ref_genome_star_file}
                     sjdb_gtf:\t{sjdb_gtf}
                      id_attr:\t{id_attr}
                        taxon:\t{args.taxon}
                s3_input_path:\t{args.s3_input_path}""")

    s3 = boto3.resource("s3")

    # download the reference genome data
    os.mkdir(os.path.join(root_dir, "genome"))
    logger.info("Downloading and extracting gtf data {}".format(sjdb_gtf))

    s3c.download_file(
        Bucket=S3_REFERENCE[
            "west"],  # just always download this from us-west-2...
        Key=f"velocyto/{genome_name}.gtf",
        Filename=sjdb_gtf,
    )

    os.mkdir(os.path.join(root_dir, "genome", "STAR"))
    logger.info(
        "Downloading and extracting STAR data {}".format(ref_genome_star_file))

    s3_object = s3.Object(S3_REFERENCE[args.region], ref_genome_star_file)

    with tarfile.open(fileobj=s3_object.get()["Body"], mode="r|gz") as tf:
        tf.extractall(path=os.path.join(root_dir, "genome", "STAR"))

    # Load Genome Into Memory
    command = [STAR, "--genomeDir", genome_dir, "--genomeLoad", "LoadAndExit"]
    if ut_log.log_command(logger,
                          command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          shell=True):
        raise RuntimeError("Failed to load genome into memory")

    sample_re = re.compile(r"([^/]+)_R\d(?:_\d+)?.fastq.gz$")
    s3_output_bucket, s3_output_prefix = s3u.s3_bucket_and_key(
        args.s3_output_path)

    logger.info("Running partition {} of {}".format(args.partition_id,
                                                    args.num_partitions))

    # Check the input folder for existing runs
    if not args.force_realign:
        output = s3u.prefix_gen(s3_output_bucket, s3_output_prefix, lambda r:
                                (r["LastModified"], r["Key"]))
    else:
        output = []

    output_files = {
        tuple(os.path.basename(fn).rsplit(".", 2)[0].split(".", 1)[:2])
        for dt, fn in output
        if fn.endswith(".htseq-count.txt") and dt > CURR_MIN_VER
    }

    logger.info("Skipping {} existing results".format(len(output_files)))

    sample_files = [(fn, s)
                    for fn, s in s3u.get_size(s3_input_bucket, s3_input_prefix)
                    if fn.endswith("fastq.gz")]

    sample_lists = defaultdict(list)
    sample_sizes = defaultdict(list)

    for fn, s in sample_files:
        matched = sample_re.search(os.path.basename(fn))
        if matched:
            sample_lists[matched.group(1)].append(fn)
            sample_sizes[matched.group(1)].append(s)

    logger.info(f"number of samples: {len(sample_lists)}")

    for sample_name in sorted(
            sample_lists)[args.partition_id::args.num_partitions]:
        if (sample_name, args.taxon) in output_files:
            logger.debug(f"{sample_name} already exists, skipping")
            continue

        if sum(sample_sizes[sample_name]) < args.min_size:
            logger.info(f"{sample_name} is below min_size, skipping")
            continue

        failed, dest_dir = run_sample(
            s3_input_bucket,
            sample_name,
            sorted(sample_lists[sample_name]),
            genome_dir,
            run_dir,
            args.star_proc,
            logger,
        )

        failed = failed or run_htseq(dest_dir, sjdb_gtf, id_attr, logger)

        if not failed:
            upload_results(sample_name, args.taxon, dest_dir,
                           args.s3_output_path, logger)

        command = ["rm", "-rf", dest_dir]
        ut_log.log_command(logger, command, shell=True)

        time.sleep(30)

    logger.info("Job completed")
Code Example #7
def run_sample(s3_input_bucket, sample_name, sample_fns, genome_dir, run_dir,
               star_proc, logger):
    """ Run alignment jobs with STAR.

        s3_input_bucket - Name of the bucket with input fastq files to align
        sample_name - Sequenced sample name (joined by "_")
        sample_fns - Sample file names. Each file name is formed from sample_name,
                     "_R1_" or "_R2_", a number, and ".fastq.gz"
        genome_dir - Path to reference genome
        run_dir - Path local to the machine on EC2 under which alignment results
                  are stored before being uploaded to S3
        star_proc - Number of processes to give to each STAR run
        logger - Logger object that exposes the interface the code directly uses

        Return two values: FAILED, a boolean indicating whether the alignment run
        failed, and DEST_DIR, the path under which STAR alignment results are stored.
        This path is local to the EC2 machine running the alignment; the results are
        staged there before being uploaded to S3 later.
    """

    t_config = TransferConfig(use_threads=False, num_download_attempts=25)

    dest_dir = os.path.join(run_dir, sample_name)

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        os.mkdir(os.path.join(dest_dir, "rawdata"))
        os.mkdir(os.path.join(dest_dir, "results"))
        os.mkdir(os.path.join(dest_dir, "results", "Pass1"))

    for sample_fn in sample_fns:
        s3c.download_file(
            Bucket=s3_input_bucket,
            Key=sample_fn,
            Filename=os.path.join(dest_dir, os.path.basename(sample_fn)),
            Config=t_config,
        )

    # start running STAR
    # getting input files first

    reads = sorted(
        os.path.join(dest_dir, os.path.basename(sample_fn))
        for sample_fn in sample_fns)

    input_command = COMMON_PARS[:]
    input_command.extend((
        "--runThreadN",
        str(star_proc),
        "--genomeDir",
        genome_dir,
        "--readFilesIn",
        " ".join(reads),
    ))
    failed = ut_log.log_command(
        logger,
        input_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results", "Pass1"),
    )

    # running sam tools
    sample_command = [
        SAMTOOLS,
        "sort",
        "-m",
        "6000000000",
        "-o",
        "./Pass1/Aligned.out.sorted.bam",
        "./Pass1/Aligned.out.bam",
    ]
    failed = failed or ut_log.log_command(
        logger,
        sample_command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=os.path.join(dest_dir, "results"),
    )

    # running samtools index -b
    sample_index_command = [SAMTOOLS, "index", "-b", "Aligned.out.sorted.bam"]
    failed = failed or ut_log.log_command(
        logger,
        sample_index_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results", "Pass1"),
    )

    # generating files for htseq-count
    output_command = [
        SAMTOOLS,
        "sort",
        "-m",
        "6000000000",
        "-n",
        "-o",
        "./Pass1/Aligned.out.sorted-byname.bam",
        "./Pass1/Aligned.out.sorted.bam",
    ]
    failed = failed or ut_log.log_command(
        logger,
        output_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results"),
    )

    return failed, dest_dir
Code Example #8
File: 10x_mkfastq.py Project: bingwu2017/utilities
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        args.root_dir = os.path.join(args.root_dir,
                                     os.environ["AWS_BATCH_JOB_ID"])

    if args.sample_sheet_name is None:
        args.sample_sheet_name = "{}.csv".format(args.exp_id)

    # local directories
    result_path = os.path.join(args.root_dir, "data", "hca", args.exp_id)
    bcl_path = os.path.join(result_path, "bcl")
    output_path = os.path.join(result_path, "fastqs")

    os.makedirs(result_path)
    os.mkdir(bcl_path)

    # download sample sheet
    command = [
        "aws",
        "s3",
        "cp",
        "--quiet",
        os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name),
        result_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying s3 copy")
    else:
        raise RuntimeError("couldn't download sample sheet {}".format(
            os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name)))

    # download the bcl files
    command = [
        "aws",
        "s3",
        "sync",
        "--quiet",
        os.path.join(args.s3_input_dir, args.exp_id),
        bcl_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying s3 sync bcl")
    else:
        raise RuntimeError("couldn't sync {}".format(
            os.path.join(args.s3_input_dir, args.exp_id)))

    # Run cellranger mkfastq
    command = [
        CELLRANGER,
        "mkfastq",
        "--localmem=60",
        "--sample-sheet={}".format(
            os.path.join(result_path, args.sample_sheet_name)),
        "--run={}".format(os.path.join(bcl_path)),
        "--output-dir={}".format(output_path),
    ]

    if log_command(logger,
                   command,
                   stdout=subprocess.PIPE,
                   stderr=subprocess.STDOUT,
                   shell=True):
        raise RuntimeError("cellranger mkfastq failed")

    # upload fastq files to destination folder
    command = [
        "aws",
        "s3",
        "sync",
        "--quiet",
        output_path,
        os.path.join(args.s3_output_dir, args.exp_id),
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying sync fastq")
    else:
        raise RuntimeError("couldn't sync fastqs")
Code Example #9
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        args.root_dir = os.path.join(args.root_dir,
                                     os.environ['AWS_BATCH_JOB_ID'])


    if args.sample_sheet_name is None:
        args.sample_sheet_name = '{}.csv'.format(args.exp_id)

    # local directories
    result_path = os.path.join(args.root_dir, 'data', 'hca', args.exp_id)
    bcl_path = os.path.join(result_path, 'bcl')
    output_path = os.path.join(result_path, 'fastqs')

    os.makedirs(result_path)
    os.mkdir(bcl_path)

    # download sample sheet
    command = ['aws', 's3', 'cp', '--quiet',
               os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name),
               result_path]
    for i in range(S3_RETRY):
        try:
            log_command(logger, command, shell=True)
            break
        except subprocess.CalledProcessError:
            logger.info("retrying s3 copy")
    else:
        raise RuntimeError("couldn't download sample sheet {}".format(
                os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name))
        )


    # download the bcl files
    command = ['aws', 's3', 'sync', '--quiet',
               os.path.join(args.s3_input_dir, args.exp_id), bcl_path]
    for i in range(S3_RETRY):
        try:
            log_command(logger, command, shell=True)
            break
        except subprocess.CalledProcessError:
            logger.info("retrying s3 sync bcl")
    else:
        raise RuntimeError("couldn't sync {}".format(
                os.path.join(args.s3_input_dir, args.exp_id))
        )


    # Run cellranger mkfastq
    command = [CELLRANGER, 'mkfastq', '--localmem=60',
               '--sample-sheet={}'.format(os.path.join(result_path, args.sample_sheet_name)),
               '--run={}'.format(os.path.join(bcl_path)),
               '--output-dir={}'.format(output_path)]
    log_command(logger, command, shell=True)


    # upload fastq files to destination folder
    command = ['aws', 's3', 'sync', '--quiet',
               output_path, os.path.join(args.s3_output_dir, args.exp_id)]
    for i in range(S3_RETRY):
        try:
            log_command(logger, command, shell=True)
            break
        except subprocess.CalledProcessError:
            logger.info("retrying sync fastq")
    else:
        raise RuntimeError("couldn't sync fastqs")
Code Example #10
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        root_dir = os.path.join('/mnt', os.environ['AWS_BATCH_JOB_ID'])
    else:
        root_dir = '/mnt'

    run_dir = os.path.join(root_dir, 'data', 'hca')
    os.makedirs(run_dir)

    if args.taxon == 'homo':
        genome_dir = os.path.join(root_dir, "genome/STAR/HG38-PLUS/")
        ref_genome_file = 'hg38-plus.tgz'
        ref_genome_star_file = 'STAR/HG38-PLUS.tgz'
        sjdb_gtf = os.path.join(root_dir, 'genome', 'hg38-plus', 'hg38-plus.gtf')
    elif args.taxon == 'mus':
        genome_dir = os.path.join(root_dir, "genome/STAR/MM10-PLUS/")
        ref_genome_file = 'mm10-plus.tgz'
        ref_genome_star_file = 'STAR/MM10-PLUS.tgz'
        sjdb_gtf = os.path.join(root_dir, 'genome', 'mm10-plus', 'mm10-plus.gtf')

    else:
        raise ValueError('Invalid taxon {}'.format(args.taxon))

    if args.star_proc > mp.cpu_count():
        raise ValueError('Not enough CPUs to give {} processes to STAR'.format(
                args.star_proc))

    s3_input_bucket,s3_input_prefix = s3u.s3_bucket_and_key(args.s3_input_path)

    logger.info(
            '''Run Info: partition {} out of {}
                   star_proc:\t{}
                  htseq_proc:\t{}
                  genome_dir:\t{}
             ref_genome_file:\t{}
        ref_genome_star_file:\t{}
                    sjdb_gtf:\t{}
                       taxon:\t{}
               s3_input_path:\t{}
                  input_dirs:\t{}'''.format(
                    args.partition_id, args.num_partitions,
                    args.star_proc, args.htseq_proc,
                    genome_dir, ref_genome_file,
                    ref_genome_star_file, sjdb_gtf,
                    args.taxon, args.s3_input_path,
                    ', '.join(args.input_dirs)
            )
    )


    s3 = boto3.resource('s3')

    # download the genome data
    os.mkdir(os.path.join(root_dir, 'genome'))
    logger.info('Downloading and extracting genome data {}'.format(ref_genome_file))

    object = s3.Object('czbiohub-reference', ref_genome_file)

    with tarfile.open(fileobj=object.get()['Body'], mode='r:gz') as tf:
        tf.extractall(path=os.path.join(root_dir, 'genome'))


    # download STAR stuff
    os.mkdir(os.path.join(root_dir, 'genome', 'STAR'))
    logger.info('Downloading and extracting STAR data {}'.format(ref_genome_star_file))

    object = s3.Object('czbiohub-reference', ref_genome_star_file)

    with tarfile.open(fileobj=object.get()['Body'], mode='r:gz') as tf:
        tf.extractall(path=os.path.join(root_dir, 'genome', 'STAR'))


    # Load Genome Into Memory
    command = [STAR, '--genomeDir', genome_dir, '--genomeLoad', 'LoadAndExit']
    ut_log.log_command(logger, command, shell=True)

    log_queue, log_thread = ut_log.get_thread_logger(logger)

    star_queue = mp.Queue()
    htseq_queue = mp.Queue()

    n_star_procs = mp.cpu_count() // args.star_proc

    star_args = (star_queue, htseq_queue, log_queue, s3_input_bucket,
                 genome_dir, run_dir, args.star_proc)
    star_procs = [mp.Process(target=run_sample, args=star_args)
                  for i in range(n_star_procs)]

    for p in star_procs:
        p.start()

    htseq_args = (htseq_queue, log_queue,
                  args.s3_input_path, args.s3_output_path,
                  args.taxon, sjdb_gtf)
    htseq_procs = [mp.Process(target=run_htseq, args=htseq_args)
                   for i in range(args.htseq_proc)]

    for p in htseq_procs:
        p.start()


    sample_re = re.compile(r"([^/]+)_R\d_\d+.fastq.gz$")

    for input_dir in args.input_dirs:
        if args.s3_output_path is None:
            s3_output_path = os.path.join(args.s3_input_path, input_dir, 'results')
        else:
            s3_output_path = args.s3_output_path

        s3_output_bucket,s3_output_prefix = s3u.s3_bucket_and_key(s3_output_path)

        # Check the input_dir folder for existing runs
        if not args.force_realign:
            output = s3u.prefix_gen(s3_output_bucket, s3_output_prefix,
                                    lambda r: (r['LastModified'], r['Key']))
        else:
            output = []

        output_files = {tuple(os.path.basename(fn).split('.')[:2]) for dt,fn in output
                        if fn.endswith('htseq-count.txt') and dt > CURR_MIN_VER}

        logger.info("Skipping {} existing results".format(len(output_files)))

        logger.info("Running partition {} of {} for {}".format(
                args.partition_id, args.num_partitions, input_dir)
        )

        output = [
            fn for fn in s3u.get_files(s3_input_bucket,
                                       os.path.join(s3_input_prefix, input_dir))
            if fn.endswith('fastq.gz')
        ]

        logger.info("number of fastq.gz files: {}".format(len(output)))

        sample_lists = defaultdict(list)

        for fn in output:
            matched = sample_re.search(os.path.basename(fn))
            if matched:
                sample_lists[matched.group(1)].append(fn)

        for sample_name in sorted(sample_lists)[args.partition_id::args.num_partitions]:
            if (sample_name, args.taxon) in output_files:
                logger.info("{} already exists, skipping".format(sample_name))
                continue

            logger.info("Adding sample {} to queue".format(sample_name))
            star_queue.put((input_dir, sample_name, sorted(sample_lists[sample_name])))

    for i in range(n_star_procs):
        star_queue.put('STOP')

    for p in star_procs:
        p.join()

    for i in range(args.htseq_proc):
        htseq_queue.put('STOP')

    for p in htseq_procs:
        p.join()

    log_queue.put('STOP')
    log_thread.join()

    # Remove Genome from Memory
    command = [STAR, '--genomeDir', genome_dir, '--genomeLoad', 'Remove']
    ut_log.log_command(logger, command, shell=True)

    logger.info('Job completed')
Code Example #11
File: 10x_count.py Project: danledinh/czb_utils
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        args.root_dir = os.path.join(args.root_dir,
                                     os.environ["AWS_BATCH_JOB_ID"])

    # local directories
    if args.s3_input_dir.endswith("/"):
        args.s3_input_dir = args.s3_input_dir[:-1]

    sample_id = os.path.basename(args.s3_input_dir)
    result_path = os.path.join(args.root_dir, "data", "hca", sample_id)
    fastq_path = os.path.join(result_path, "fastqs")
    os.makedirs(fastq_path)

    genome_base_dir = os.path.join(args.root_dir, "genome", "cellranger")
    os.makedirs(genome_base_dir)

    if args.taxon in reference_genomes:
        if args.taxon in deprecated:
            logger.warn(f"'{args.taxon}' will be removed in the future,"
                        f" use '{reference_genomes[args.taxon]}'")

        genome_name = reference_genomes[args.taxon]
    else:
        raise ValueError(f"unknown taxon {args.taxon}")

    if args.region != "west" and genome_name not in ("HG38-PLUS", "MM10-PLUS"):
        raise ValueError(f"you must use --region west for {genome_name}")

    s3 = boto3.resource("s3")

    # download the ref genome data
    logger.info(f"Downloading and extracting genome data {genome_name}")

    if args.region == "east":
        s3_object = s3.Object(S3_REFERENCE[args.region],
                              f"ref-genome/cellranger/{genome_name}.tgz")
    else:
        s3_object = s3.Object(S3_REFERENCE[args.region],
                              f"cellranger/{genome_name}.tgz")

    with tarfile.open(fileobj=s3_object.get()["Body"], mode="r|gz") as tf:
        tf.extractall(path=genome_base_dir)

    genome_dir = os.path.join(genome_base_dir, genome_name)

    sys.stdout.flush()

    # download the fastq files
    command = [
        "aws",
        "s3",
        "cp",
        "--no-progress",
        "--recursive",
        "--force-glacier-transfer" if args.glacier else "",
        args.s3_input_dir,
        fastq_path,
    ]
    log_command(logger, command, shell=True)

    # Run cellranger
    os.chdir(result_path)
    command = [
        CELLRANGER,
        "count",
        "--localmem=240",
        "--nosecondary",
        "--disable-ui",
        f"--expect-cells={args.cell_count}",
        f"--id={sample_id}",
        f"--fastqs={fastq_path}",
        f"--transcriptome={genome_dir}",
    ]
    failed = log_command(
        logger,
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )

    if failed:
        raise RuntimeError("cellranger count failed")

    # Move outs folder to S3
    command = [
        "aws",
        "s3",
        "sync",
        "--no-progress",
        os.path.join(result_path, sample_id, "outs"),
        args.s3_output_dir,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info(f"retrying sync")
    else:
        raise RuntimeError(f"couldn't sync output")
Code Example #12
def run_sample(s3_input_bucket, sample_name, sample_fns, genome_dir, run_dir,
               star_proc, logger):
    t_config = TransferConfig(use_threads=False, num_download_attempts=25)

    dest_dir = os.path.join(run_dir, sample_name)

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        os.mkdir(os.path.join(dest_dir, "rawdata"))
        os.mkdir(os.path.join(dest_dir, "results"))
        os.mkdir(os.path.join(dest_dir, "results", "Pass1"))

    for sample_fn in sample_fns:
        s3c.download_file(
            Bucket=s3_input_bucket,
            Key=sample_fn,
            Filename=os.path.join(dest_dir, os.path.basename(sample_fn)),
            Config=t_config,
        )

    # start running STAR
    # getting input files first

    reads = sorted(
        os.path.join(dest_dir, os.path.basename(sample_fn))
        for sample_fn in sample_fns)

    command = COMMON_PARS[:]
    command.extend((
        "--runThreadN",
        str(star_proc),
        "--genomeDir",
        genome_dir,
        "--readFilesIn",
        " ".join(reads),
    ))
    failed = ut_log.log_command(
        logger,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results", "Pass1"),
    )

    # running sam tools
    command = [
        SAMTOOLS,
        "sort",
        "-m",
        "6000000000",
        "-o",
        "./Pass1/Aligned.out.sorted.bam",
        "./Pass1/Aligned.out.bam",
    ]
    failed = failed or ut_log.log_command(
        logger,
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=os.path.join(dest_dir, "results"),
    )

    # running samtools index -b
    command = [SAMTOOLS, "index", "-b", "Aligned.out.sorted.bam"]
    failed = failed or ut_log.log_command(
        logger,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results", "Pass1"),
    )

    # generating files for htseq-count
    command = [
        SAMTOOLS,
        "sort",
        "-m",
        "6000000000",
        "-n",
        "-o",
        "./Pass1/Aligned.out.sorted-byname.bam",
        "./Pass1/Aligned.out.sorted.bam",
    ]
    failed = failed or ut_log.log_command(
        logger,
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        cwd=os.path.join(dest_dir, "results"),
    )

    return failed, dest_dir
Code Example #13
File: bcl2fastq.py Project: bingwu2017/utilities
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        root_dir = os.path.join("/mnt", os.environ["AWS_BATCH_JOB_ID"])
    else:
        root_dir = "/mnt"

    if args.sample_sheet_name is None:
        args.sample_sheet_name = "{}.csv".format(args.exp_id)

    # local directories
    result_path = os.path.join(root_dir, "data", "hca", args.exp_id)
    bcl_path = os.path.join(result_path, "bcl")
    output_path = os.path.join(result_path, "fastqs")

    # download sample sheet
    os.makedirs(result_path)
    os.mkdir(bcl_path)

    command = [
        "aws",
        "s3",
        "cp",
        "--quiet",
        os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name),
        result_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying s3 copy")
    else:
        raise RuntimeError("couldn't download sample sheet {}".format(
            os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name)))

    # download the bcl files
    command = [
        "aws",
        "s3",
        "sync",
        "--quiet",
        "--force-glacier-transfer" if args.force_glacier else "",
        os.path.join(args.s3_input_dir, args.exp_id),
        bcl_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying s3 sync bcl")
    else:
        raise RuntimeError("couldn't sync {}".format(
            os.path.join(args.s3_input_dir, args.exp_id)))

    command = (
        "while true;"
        ' do echo "memory usage" `cat /sys/fs/cgroup/memory/memory.usage_in_bytes`;'
        ' echo "disk usage" `df -h | grep "/mnt"`;'
        " sleep 300;"
        " done")
    p = subprocess.Popen([command], shell=True)

    # Run bcl2fastq
    command = [
        BCL2FASTQ,
        " ".join(args.bcl2fastq_options),
        "--sample-sheet",
        os.path.join(result_path, args.sample_sheet_name),
        "-R",
        bcl_path,
        "-o",
        output_path,
    ]
    failed = log_command(logger,
                         command,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         shell=True)
    if failed:
        p.kill()
        raise RuntimeError("bcl2fastq failed, see above for error")

    # fix directory structure of the files *before* sync!
    fastqgz_files = glob.glob(os.path.join(output_path, "*fastq.gz"))
    logger.debug("all fastq.gz files\n{}\n\n".format("\n".join(fastqgz_files)))

    for fastq_file in fastqgz_files:
        if args.skip_undetermined and os.path.basename(fastq_file).startswith(
                "Undetermined"):
            logger.info("removing {}".format(os.path.basename(fastq_file)))
            os.remove(fastq_file)
        elif args.star_structure:
            m = re.match("(.+)(_R[12]_001.fastq.gz)",
                         os.path.basename(fastq_file))
            if m:
                sample = m.group(1)
                if not os.path.exists(os.path.join(output_path, sample)):
                    logger.debug("creating {}".format(
                        os.path.join(output_path, sample)))
                    os.mkdir(os.path.join(output_path, sample))
                logger.debug("moving {}".format(fastq_file))
                os.rename(
                    fastq_file,
                    os.path.join(output_path, sample,
                                 os.path.basename(fastq_file)),
                )
            else:
                logger.warning(
                    "Warning: regex didn't match {}".format(fastq_file))

    sys.stdout.flush()

    # upload fastq files to destination folder
    command = [
        "aws",
        "s3",
        "sync",
        "--quiet",
        output_path,
        os.path.join(args.s3_output_dir, args.exp_id),
        "--exclude",
        '"*"',
        "--include",
        '"*fastq.gz"',
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying sync fastq")
    else:
        raise RuntimeError("couldn't sync fastqs")

    # Move reports data back to S3
    reports_path = subprocess.check_output(
        "ls -d {}".format(
            os.path.join(output_path, "Reports", "html", "*", "all", "all",
                         "all")),
        shell=True,
    ).rstrip()
    command = [
        "aws",
        "s3",
        "cp",
        "--quiet",
        reports_path.decode(),
        os.path.join(args.s3_report_dir, args.exp_id),
        "--recursive",
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info("retrying cp reports")
    else:
        raise RuntimeError("couldn't cp reports")

    p.kill()
Code Example #14
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        root_dir = os.path.join(ROOT_DIR_PATH, os.environ['AWS_BATCH_JOB_ID'])
    else:
        root_dir = ROOT_DIR_PATH

    if args.sample_sheet_name is None:
        args.sample_sheet_name = '{}.csv'.format(args.exp_id)

    # local directories
    result_path = os.path.join(root_dir, 'data', 'hca', args.exp_id)
    bcl_path = os.path.join(result_path, 'bcl')
    output_path = os.path.join(result_path, 'fastqs')

    if not args.no_s3_download:
        # only make dirs if they don't exist yet
        if not os.path.isdir(result_path):
            os.makedirs(result_path)
        if not os.path.isdir(bcl_path):
            os.mkdir(bcl_path)

        # download sample sheet
        command = ['aws', 's3', 'cp', '--quiet',
                   os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name),
                   result_path]
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying s3 copy")
        else:
            raise RuntimeError("couldn't download sample sheet {}".format(
                    os.path.join(args.s3_sample_sheet_dir, args.sample_sheet_name))
            )

        # do a check on the sample inputs to make sure we can get run IDs from all of them
        # change this if the Illumina sample sheet output ever changes; otherwise this line has the headers
        _SAMPLE_SHEET_STARTING_LINE = 21
        df_csv = pd.read_csv(os.path.join(result_path, args.sample_sheet_name), header=_SAMPLE_SHEET_STARTING_LINE)
        samples_not_matching_run_ids = [sample_name for sample_name in df_csv['Sample_ID'] if not _check_for_run_information(sample_name)]
        if len(samples_not_matching_run_ids) > 0:
            raise ValueError('Found sample names that I could not extract run ID values (of the form RunXX_YY) from: '
                             '{}'.format(samples_not_matching_run_ids))

        # download the bcl files
        command = ['aws', 's3', 'sync', '--quiet',
                   '--force-glacier-transfer' if args.force_glacier else '',
                   os.path.join(args.s3_input_dir, args.exp_id), bcl_path]
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying s3 sync bcl")
        else:
            raise RuntimeError("couldn't sync {}".format(
                    os.path.join(args.s3_input_dir, args.exp_id))
            )


    # this is actually awful because the process forks and you have to go kill it yourself
    command = ('while true;'
               ' do memusage=`cat /sys/fs/cgroup/memory/memory.usage_in_bytes`;'
               ' memgb=`echo "${memusage}/(1000000000)" | bc -l | xargs -I {} printf "%.2f\n" {}`;'
               ' echo "memory usage: ${memgb}GB";'
               ' echo "disk usage: " `df -h | grep -e "/$" | awk \'{print $(NF-4)" "$(NF-3)" "$(NF-2)" "$(NF-1)" "$NF}\''
               ' sleep 90;'
               ' done')
    p = subprocess.Popen([command], shell=True)

    # Run bcl2fastq
    command = [BCL2FASTQ, ' '.join(args.bcl2fastq_options),
               '--sample-sheet', os.path.join(result_path,
                                              args.sample_sheet_name),
               '-R', bcl_path, '-o', output_path]
    log_command(logger, command, shell=True)

    # fix directory structure of the files *before* sync!
    fastqgz_files = glob.glob(os.path.join(output_path, '*fastq.gz'))
    logger.debug('all fastq.gz files\n{}\n\n'.format('\n'.join(fastqgz_files)))

    # TODO(dstone): organize the run based on the TraceGenomics/RunXX/RunXX_YY/*.fastq.gz and do our usual rearrangement
    for fastq_file in fastqgz_files:
        if (args.skip_undetermined
            and os.path.basename(fastq_file).startswith('Undetermined')):
            logger.info("removing {}".format(os.path.basename(fastq_file)))
            os.remove(fastq_file)
        elif args.group_by_sample:
            # exclude the sample number (_S[numbers])
            m = re.match("(.+)(_S\d+_R[12]_001.fastq.gz)",
                         os.path.basename(fastq_file))
            if m:
                sample = m.group(1) # should be of the form RunX_Y
                if not re.match(r'^Run\d+_\d+$', sample):
                    # shouldn't actually be able to get here, because there is a check above at the sample sheet level,
                    # but just in case
                    raise ValueError('Was expecting to find a sample name of the form RunXX_YY, could not find in {} sample name!'.format(sample))
                run = sample.split('_')[0]
                grouped_sample_path = os.path.join(output_path, run, sample) # organizes as RunX/RunX_Y/[sample stuff]
                if not os.path.exists(grouped_sample_path):
                    logger.debug("creating {}".format(grouped_sample_path))
                    os.makedirs(grouped_sample_path)
                logger.debug("moving {}".format(fastq_file))
                os.rename(fastq_file, os.path.join(grouped_sample_path, os.path.basename(fastq_file)))
            else:
                logger.warning("Warning: regex didn't match {}".format(fastq_file))

    sys.stdout.flush()

    if not args.no_s3_upload:
        # upload fastq files to destination folder
        command = ['aws', 's3', 'sync', '--quiet', output_path,
                   args.s3_output_dir,
                   # this doesn't fit our output structure
                   #os.path.join(args.s3_output_dir, args.exp_id, 'rawdata'),
                   '--exclude', '"*"', '--include', '"*fastq.gz"']
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying sync fastq")
        else:
            raise RuntimeError("couldn't sync fastqs")


        # check fastq upload
        command = ['aws', 's3', 'ls', '--recursive',
                   args.s3_output_dir]
                   #os.path.join(args.s3_output_dir, args.exp_id, 'rawdata')]
        log_command(logger, command, shell=True)


        # Move reports data back to S3
        reports_path = subprocess.check_output(
                "ls -d {}".format(os.path.join(output_path, 'Reports', 'html', '*',
                                               'all', 'all', 'all')),
                shell=True).rstrip()
        command = ['aws', 's3', 'cp', '--quiet', reports_path,
                   os.path.join(args.s3_report_dir, args.exp_id),
                   '--recursive']
        for i in range(S3_RETRY):
            try:
                log_command(logger, command, shell=True)
                break
            except subprocess.CalledProcessError:
                logger.info("retrying cp reports")
        else:
            raise RuntimeError("couldn't cp reports")

    p.kill()
Code Example #15
def main(logger):
    """ Download reference genome, run alignment jobs, and upload results to S3.

        logger - Logger object that exposes the interface the code directly uses
    """

    parser = get_parser()

    args = parser.parse_args()

    args.root_dir = pathlib.Path(args.root_dir)

    if os.environ.get("AWS_BATCH_JOB_ID"):
        args.root_dir = args.root_dir / os.environ["AWS_BATCH_JOB_ID"]

    # local directories
    if args.s3_input_path.endswith("/"):
        args.s3_input_path = args.s3_input_path[:-1]

    sample_id = os.path.basename(args.s3_input_path)
    result_path = args.root_dir / "data" / sample_id
    if args.dobby:
        fastq_path = result_path
    else:
        fastq_path = result_path / "fastqs"
    fastq_path.mkdir(parents=True)

    genome_base_dir = args.root_dir / "genome" / "cellranger"
    genome_base_dir.mkdir(parents=True)

    # check if the input genome and region are valid
    if args.taxon in reference_genomes:
        if args.taxon in deprecated:
            logger.warn(
                f"The name '{args.taxon}' will be removed in the future,"
                f" start using '{deprecated[args.taxon]}'")

        genome_name = reference_genomes[args.taxon]
    else:
        raise ValueError(f"unknown taxon {args.taxon}")

    genome_dir = genome_base_dir / genome_name
    ref_genome_10x_file = f"cellranger/{genome_name}.tgz"

    if args.region != "west" and genome_name not in ("HG38-PLUS", "MM10-PLUS"):
        raise ValueError(f"you must use --region west for {genome_name}")

    if args.region == "east":
        ref_genome_10x_file = f"ref-genome/{ref_genome_10x_file}"

    logger.info(
        f"""Run Info: partition {args.partition_id} out of {args.num_partitions}
                   genome_dir:\t{genome_dir}
         ref_genome_10x_file:\t{ref_genome_10x_file}
                        taxon:\t{args.taxon}
                s3_input_path:\t{args.s3_input_path}""")

    s3 = boto3.resource("s3")

    # download the reference genome data
    logger.info(f"Downloading and extracting genome data {genome_name}")

    s3_object = s3.Object(S3_REFERENCE[args.region], ref_genome_10x_file)

    with tarfile.open(fileobj=s3_object.get()["Body"], mode="r|gz") as tf:
        tf.extractall(path=genome_base_dir)

    sys.stdout.flush()

    # download the fastq files
    command = [
        "aws",
        "s3",
        "cp",
        "--no-progress",
        "--recursive",
        "--force-glacier-transfer" if args.glacier else "",
        args.s3_input_path,
        f"{fastq_path}",
    ]
    log_command(logger, command, shell=True)

    logger.info(
        f"Running partition {args.partition_id} of {args.num_partitions}")

    # check the input folder for existing runs
    sample_name = {
        os.path.basename(fn).rsplit("_", 4)[0]
        for fn in fastq_path.glob("*fastq.gz")
    }
    assert len(sample_name) == 1, "Should only have one sample name to process"
    sample_name = sample_name.pop()

    # Run cellranger
    os.chdir(result_path)
    command = [
        CELLRANGER,
        "count",
        "--localmem=240",
        "--nosecondary",
        "--disable-ui",
        f"--expect-cells={args.cell_count}",
        f"--id={sample_id}",
        f"--fastqs={fastq_path}",
        f"--transcriptome={genome_dir}",
    ]
    if args.dobby:
        command.append(f"--sample={sample_name}")

    failed = log_command(
        logger,
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )

    if failed:
        raise RuntimeError("cellranger count failed")

    # Move outs folder to S3
    command = [
        "aws",
        "s3",
        "sync",
        "--no-progress",
        os.path.join(result_path, sample_id, "outs"),
        args.s3_output_path,
    ]
    for i in range(S3_RETRY):
        if not log_command(logger, command, shell=True):
            break
        logger.info(f"retrying sync")
    else:
        raise RuntimeError(f"couldn't sync output")