def upload_results(sample_name, taxon, dest_dir, s3_output_path, logger):
    """ Upload alignment results copied from EC2 machine directory onto S3.

        sample_name - Sequenced sample name (joined by "_")
        taxon - Reference genome name
        dest_dir - Path local to the machine on EC2 under which alignment results
                   are stored before uploaded to S3. Child path of run_dir/sample_name
        s3_output_path - S3 path of where the alignment results are stored
        logger - Logger object that exposes the interface the code directly uses
    """

    t_config = TransferConfig(use_threads=False)

    s3_output_bucket, s3_output_prefix = s3u.s3_bucket_and_key(s3_output_path)

    src_files = [
        os.path.join(dest_dir, "results", "htseq-count.txt"),
        os.path.join(dest_dir, "results", "Pass1", "Log.final.out"),
        os.path.join(dest_dir, "results", "Pass1", "SJ.out.tab"),
        os.path.join(dest_dir, "results", "Pass1", "Aligned.out.sorted.bam"),
        os.path.join(dest_dir, "results", "Pass1",
                     "Aligned.out.sorted.bam.bai"),
    ]

    dest_names = [
        "{}.{}.htseq-count.txt".format(sample_name, taxon),
        "{}.{}.log.final.out".format(sample_name, taxon),
        "{}.{}.SJ.out.tab".format(sample_name, taxon),
        "{}.{}.Aligned.out.sorted.bam".format(sample_name, taxon),
        "{}.{}.Aligned.out.sorted.bam.bai".format(sample_name, taxon),
    ]

    for src_file, dest_name in zip(src_files, dest_names):
        logger.info("Uploading {}".format(dest_name))
        s3c.upload_file(
            Filename=src_file,
            Bucket=s3_output_bucket,
            Key=os.path.join(s3_output_prefix, dest_name),
            Config=t_config,
        )
def main(logger):
    """ Download reference genome, run alignment jobs, and upload results to S3.

        logger - Logger object that exposes the interface the code directly uses
    """

    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        root_dir = os.path.join("/mnt", os.environ["AWS_BATCH_JOB_ID"])
    else:
        root_dir = "/mnt"

    # strip a trailing slash from the S3 input path, if present
    if args.s3_input_path.endswith("/"):
        args.s3_input_path = args.s3_input_path[:-1]

    # local working directory
    run_dir = os.path.join(root_dir, "data")
    os.makedirs(run_dir)

    # check if the input genome and region are valid
    if args.taxon in reference_genomes:
        if args.taxon in deprecated:
            logger.warning(
                f"The name '{args.taxon}' will be removed in the future;"
                f" start using '{deprecated[args.taxon]}'")

        genome_name = reference_genomes[args.taxon]
    else:
        raise ValueError(f"unknown taxon {args.taxon}")

    if args.taxon == "gencode.vM19" or args.taxon == "gencode.vM19.ERCC":
        id_attr = "gene_name"
    else:
        id_attr = "gene_id"

    genome_dir = os.path.join(root_dir, "genome", "STAR", genome_name)
    ref_genome_star_file = f"STAR/{genome_name}.tgz"
    sjdb_gtf = os.path.join(root_dir, f"{genome_name}.gtf")

    if args.region != "west" and genome_name not in ("HG38-PLUS", "MM10-PLUS"):
        raise ValueError(f"you must use --region west for {genome_name}")

    if args.region == "east":
        ref_genome_star_file = os.path.join("ref-genome", ref_genome_star_file)

    s3_input_bucket, s3_input_prefix = s3u.s3_bucket_and_key(
        args.s3_input_path)

    logger.info(
        f"""Run Info: partition {args.partition_id} out of {args.num_partitions}
                   genome_dir:\t{genome_dir}
         ref_genome_star_file:\t{ref_genome_star_file}
                     sjdb_gtf:\t{sjdb_gtf}
                      id_attr:\t{id_attr}
                        taxon:\t{args.taxon}
                s3_input_path:\t{args.s3_input_path}""")

    s3 = boto3.resource("s3")

    # download the reference genome data
    os.mkdir(os.path.join(root_dir, "genome"))
    logger.info("Downloading gtf data {}".format(sjdb_gtf))

    s3c.download_file(
        Bucket=S3_REFERENCE[
            "west"],  # just always download this from us-west-2...
        Key=f"velocyto/{genome_name}.gtf",
        Filename=sjdb_gtf,
    )

    os.mkdir(os.path.join(root_dir, "genome", "STAR"))
    logger.info(
        "Downloading and extracting STAR data {}".format(ref_genome_star_file))

    s3_object = s3.Object(S3_REFERENCE[args.region], ref_genome_star_file)

    with tarfile.open(fileobj=s3_object.get()["Body"], mode="r|gz") as tf:
        tf.extractall(path=os.path.join(root_dir, "genome", "STAR"))
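
    # mode="r|gz" streams the tarball straight from the S3 response body without
    # writing it to disk first. The "--genomeLoad LoadAndExit" call below loads
    # the STAR genome index into shared memory and exits, so each per-sample
    # alignment run can attach to it instead of re-loading it.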

    # Load Genome Into Memory
    command = [STAR, "--genomeDir", genome_dir, "--genomeLoad", "LoadAndExit"]
    if ut_log.log_command(logger,
                          command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          shell=True):
        raise RuntimeError("Failed to load genome into memory")

    sample_re = re.compile(r"([^/]+)_R\d(?:_\d+)?\.fastq\.gz$")
    s3_output_bucket, s3_output_prefix = s3u.s3_bucket_and_key(
        args.s3_output_path)

    logger.info("Running partition {} of {}".format(args.partition_id,
                                                    args.num_partitions))

    # Check the input folder for existing runs
    if not args.force_realign:
        output = s3u.prefix_gen(s3_output_bucket, s3_output_prefix, lambda r:
                                (r["LastModified"], r["Key"]))
    else:
        output = []

    output_files = {
        tuple(os.path.basename(fn).rsplit(".", 2)[0].split(".", 1)[:2])
        for dt, fn in output
        if fn.endswith(".htseq-count.txt") and dt > CURR_MIN_VER
    }
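
    # output_files now holds (sample_name, taxon) pairs that already have an
    # htseq-count result newer than CURR_MIN_VER; those samples are skipped
    # below unless args.force_realign is set.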

    logger.info("Skipping {} existing results".format(len(output_files)))

    sample_files = [(fn, s)
                    for fn, s in s3u.get_size(s3_input_bucket, s3_input_prefix)
                    if fn.endswith("fastq.gz")]

    sample_lists = defaultdict(list)
    sample_sizes = defaultdict(list)

    for fn, s in sample_files:
        matched = sample_re.search(os.path.basename(fn))
        if matched:
            sample_lists[matched.group(1)].append(fn)
            sample_sizes[matched.group(1)].append(s)

    logger.info(f"number of samples: {len(sample_lists)}")

    for sample_name in sorted(
            sample_lists)[args.partition_id::args.num_partitions]:
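        # samples are sharded across jobs by taking every num_partitions-th
        # name from the sorted sample list, starting at partition_id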
        if (sample_name, args.taxon) in output_files:
            logger.debug(f"{sample_name} already exists, skipping")
            continue

        if sum(sample_sizes[sample_name]) < args.min_size:
            logger.info(f"{sample_name} is below min_size, skipping")
            continue

        failed, dest_dir = run_sample(
            s3_input_bucket,
            sample_name,
            sorted(sample_lists[sample_name]),
            genome_dir,
            run_dir,
            args.star_proc,
            logger,
        )

        failed = failed or run_htseq(dest_dir, sjdb_gtf, id_attr, logger)

        if not failed:
            upload_results(sample_name, args.taxon, dest_dir,
                           args.s3_output_path, logger)

        command = ["rm", "-rf", dest_dir]
        ut_log.log_command(logger, command, shell=True)

        time.sleep(30)

    logger.info("Job completed")
def gene_cell_table(args, logger, dryrun):
    logger.info("Starting")

    h5ad_out = False

    if args.output_file.endswith(".txt"):
        sep = "\t"
    elif args.output_file.endswith(".csv"):
        sep = ","
    elif args.output_file.endswith(".h5ad"):
        try:
            import anndata
            import pandas as pd
            import scipy.sparse as sp
        except ImportError:
            raise ImportError(
                "Please install the anndata package for h5ad output\n"
                "    conda install -c bioconda anndata"
            )

        h5ad_out = True
        sep = ","  # the STAR log table below is still written with csv.writer
    else:
        raise ValueError(
            "Unfamiliar file format {}".format(os.path.splitext(args.output_file)[1])
        )

    logger.info("Starting S3 client")
    client = boto3.client("s3")
    paginator = client.get_paginator("list_objects")

    htseq_files = []
    log_files = []

    s3_input_bucket, s3_input_prefix = s3_bucket_and_key(args.s3_input_path)

    logger.info("Getting htseq file list")
    response_iterator = paginator.paginate(
        Bucket=s3_input_bucket, Prefix=s3_input_prefix
    )
    for result in response_iterator:
        if "Contents" in result:
            htseq_files.extend(
                r["Key"]
                for r in result["Contents"]
                if r["Key"].endswith("htseq-count.txt")
            )
            if not args.no_log:
                log_files.extend(
                    r["Key"]
                    for r in result["Contents"]
                    if r["Key"].endswith("log.final.out")
                )
    logger.info("{} htseq files found".format(len(htseq_files)))

    # file names look like "<sample>.<taxon>.htseq-count.txt"; stripping the
    # 16-character ".htseq-count.txt" suffix leaves "<sample>.<taxon>"
    sample_names = tuple(os.path.basename(fn)[:-16] for fn in htseq_files)

    gene_lists = set()
    gene_counts = list()

    for htseq_file in htseq_files:
        logger.debug("Downloading {}".format(htseq_file))
        if not dryrun:
            gene_list, gene_count = get_htseq_counts(
                client, s3_input_bucket, htseq_file
            )
            gene_lists.add(gene_list)
            gene_counts.append(gene_count)

    logger.info("Downloaded {} files".format(len(htseq_files)))
    if not dryrun:
        assert len(gene_lists) == 1
        gene_list = gene_lists.pop()

        logger.info("Writing to {}".format(args.output_file))

        if h5ad_out:
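            # build a sparse samples-by-genes count matrix: one CSR row of
            # integer counts per sample, matching AnnData's obs-by-var layout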
            gene_cell_counts = sp.vstack(
                [sp.csr_matrix(list(map(int, gc))) for gc in gene_counts]
            )

            adata = anndata.AnnData(
                gene_cell_counts,
                obs=pd.DataFrame(index=sample_names),
                var=pd.DataFrame(index=gene_list),
            )

            adata.write_h5ad(args.output_file)
        else:
            with open(args.output_file, "w") as OUT:
                wtr = csv.writer(OUT, delimiter=sep)
                wtr.writerow(("gene",) + sample_names)
                for i, g in enumerate(gene_list):
                    wtr.writerow((g,) + tuple(gc[i] for gc in gene_counts))

    if args.no_log:
        logger.info("Done!")
        return

    log_metrics = set()
    log_values = list()

    for log_file in log_files:
        logger.debug("Downloading {}".format(log_file))
        if not dryrun:
            metric_names, values = get_log_file(client, s3_input_bucket, log_file)
            log_metrics.add(metric_names)
            log_values.append(values)

    logger.info("Downloaded {} files".format(len(log_files)))
    if not dryrun:
        assert len(log_metrics) == 1
        log_metrics = log_metrics.pop()

    log_file = ".log".join(os.path.splitext(args.output_file))

    logger.info("Writing to {}".format(log_file))
    if not dryrun:
        with open(log_file, "w") as OUT:
            wtr = csv.writer(OUT, delimiter=sep)
            wtr.writerow(("metric",) + sample_names)
            for i, m in enumerate(log_metrics):
                wtr.writerow((m,) + tuple(mv[i] for mv in log_values))

    logger.info("Done!")
def main():
    parser = argparse.ArgumentParser(
        description="Create a shell to run alignment jobs"
        " with 10x for multiple samples all together"
    )
    # required arguments
    requiredNamed = parser.add_argument_group("required arguments")

    requiredNamed.add_argument(
        "--taxon",
        choices=list(reference_genomes.keys()),
        required=True,
        help="Reference genome for the alignment run, "
        "selected from the reference_genomes dictionary keys from "
        "alignment.run_10x_count.py",
    )

    requiredNamed.add_argument(
        "--s3_input_path",
        required=True,
        help="The folder containing sample folders, "
        "each of which have fastq.gz files to align",
    )

    requiredNamed.add_argument(
        "--s3_output_path",
        required=True,
        help="The folder to store the alignment results",
    )

    # optional arguments
    parser.add_argument(
        "--branch", default="master", help="Branch of utilities repo to use"
    )

    parser.add_argument(
        "script_args",
        nargs=argparse.REMAINDER,
        help="Extra arguments are passed to run_10x_count",
    )

    args = parser.parse_args()

    # check if the input genome is valid
    if args.taxon in reference_genomes:
        if args.taxon in deprecated:
            warnings.warn(
                f"The name '{args.taxon}' will be removed in the future;"
                f" start using '{deprecated[args.taxon]}'"
            )
    else:
        raise ValueError(f"unknown taxon {args.taxon}")

    # get the list of sample folder paths under the input folder
    s3_input_bucket, s3_input_prefix = s3u.s3_bucket_and_key(args.s3_input_path)
    s3_input_prefix += "/"
    sample_folder_paths = list(s3u.get_folders(s3_input_bucket, s3_input_prefix))
    complete_input_paths = [
        "s3://" + s3_input_bucket + "/" + path for path in sample_folder_paths
    ]

    # print input arguments of running alignment.run_10x_count for each sample folder
    num_partitions = len(complete_input_paths)
    for i in range(num_partitions):
        s3_input_path = complete_input_paths[i]
        s3_output_path = posixpath.join(
            args.s3_output_path, s3_input_path.rsplit("/", 2)[1]
        )
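        # rsplit("/", 2)[1] picks out the sample folder name (assuming the
        # folder paths end with "/"), so each sample's results land in their
        # own subfolder of s3_output_path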
        print(
            " ".join(
                (
                    "evros",
                    f"--branch {args.branch}",
                    "alignment.run_10x_count",
                    f"--taxon {args.taxon}",
                    f"--num_partitions {num_partitions}",
                    f"--partition_id {i}",
                    f"--s3_input_path {s3_input_path}",
                    f"--s3_output_path {s3_output_path}",
                    " ".join(args.script_args),
                )
            )
        )
        print("sleep 10")
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get('AWS_BATCH_JOB_ID'):
        root_dir = os.path.join('/mnt', os.environ['AWS_BATCH_JOB_ID'])
    else:
        root_dir = '/mnt'

    run_dir = os.path.join(root_dir, 'data', 'hca')
    os.makedirs(run_dir)

    if args.taxon == 'homo':
        genome_dir = os.path.join(root_dir, "genome/STAR/HG38-PLUS/")
        ref_genome_file = 'hg38-plus.tgz'
        ref_genome_star_file = 'STAR/HG38-PLUS.tgz'
        sjdb_gtf = os.path.join(root_dir, 'genome', 'hg38-plus', 'hg38-plus.gtf')
    elif args.taxon == 'mus':
        genome_dir = os.path.join(root_dir, "genome/STAR/MM10-PLUS/")
        ref_genome_file = 'mm10-plus.tgz'
        ref_genome_star_file = 'STAR/MM10-PLUS.tgz'
        sjdb_gtf = os.path.join(root_dir, 'genome', 'mm10-plus', 'mm10-plus.gtf')

    else:
        raise ValueError('Invalid taxon {}'.format(args.taxon))

    if args.star_proc > mp.cpu_count():
        raise ValueError('Not enough CPUs to give {} processes to STAR'.format(
                args.star_proc))

    s3_input_bucket, s3_input_prefix = s3u.s3_bucket_and_key(args.s3_input_path)

    logger.info(
            '''Run Info: partition {} out of {}
                   star_proc:\t{}
                  htseq_proc:\t{}
                  genome_dir:\t{}
             ref_genome_file:\t{}
        ref_genome_star_file:\t{}
                    sjdb_gtf:\t{}
                       taxon:\t{}
               s3_input_path:\t{}
                  input_dirs:\t{}'''.format(
                    args.partition_id, args.num_partitions,
                    args.star_proc, args.htseq_proc,
                    genome_dir, ref_genome_file,
                    ref_genome_star_file, sjdb_gtf,
                    args.taxon, args.s3_input_path,
                    ', '.join(args.input_dirs)
            )
    )


    s3 = boto3.resource('s3')

    # download the genome data
    os.mkdir(os.path.join(root_dir, 'genome'))
    logger.info('Downloading and extracting genome data {}'.format(ref_genome_file))

    s3_object = s3.Object('czbiohub-reference', ref_genome_file)

    with tarfile.open(fileobj=s3_object.get()['Body'], mode='r:gz') as tf:
        tf.extractall(path=os.path.join(root_dir, 'genome'))


    # download STAR stuff
    os.mkdir(os.path.join(root_dir, 'genome', 'STAR'))
    logger.info('Downloading and extracting STAR data {}'.format(ref_genome_star_file))

    s3_object = s3.Object('czbiohub-reference', ref_genome_star_file)

    with tarfile.open(fileobj=s3_object.get()['Body'], mode='r:gz') as tf:
        tf.extractall(path=os.path.join(root_dir, 'genome', 'STAR'))


    # Load Genome Into Memory
    command = [STAR, '--genomeDir', genome_dir, '--genomeLoad', 'LoadAndExit']
    ut_log.log_command(logger, command, shell=True)

    log_queue, log_thread = ut_log.get_thread_logger(logger)

    star_queue = mp.Queue()
    htseq_queue = mp.Queue()

    n_star_procs = mp.cpu_count() // args.star_proc
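
    # one STAR worker process per group of star_proc CPUs; each worker pulls
    # samples from star_queue and (presumably) hands finished alignments to
    # the htseq workers via htseq_queue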

    star_args = (star_queue, htseq_queue, log_queue, s3_input_bucket,
                 genome_dir, run_dir, args.star_proc)
    star_procs = [mp.Process(target=run_sample, args=star_args)
                  for i in range(n_star_procs)]

    for p in star_procs:
        p.start()

    htseq_args = (htseq_queue, log_queue,
                  args.s3_input_path, args.s3_output_path,
                  args.taxon, sjdb_gtf)
    htseq_procs = [mp.Process(target=run_htseq, args=htseq_args)
                   for i in range(args.htseq_proc)]

    for p in htseq_procs:
        p.start()


    sample_re = re.compile(r"([^/]+)_R\d_\d+\.fastq\.gz$")

    for input_dir in args.input_dirs:
        if args.s3_output_path is None:
            s3_output_path = os.path.join(args.s3_input_path, input_dir, 'results')
        else:
            s3_output_path = args.s3_output_path

        s3_output_bucket, s3_output_prefix = s3u.s3_bucket_and_key(s3_output_path)

        # Check the input_dir folder for existing runs
        if not args.force_realign:
            output = s3u.prefix_gen(s3_output_bucket, s3_output_prefix,
                                    lambda r: (r['LastModified'], r['Key']))
        else:
            output = []

        output_files = {tuple(os.path.basename(fn).split('.')[:2]) for dt, fn in output
                        if fn.endswith('htseq-count.txt') and dt > CURR_MIN_VER}

        logger.info("Skipping {} existing results".format(len(output_files)))

        logger.info("Running partition {} of {} for {}".format(
                args.partition_id, args.num_partitions, input_dir)
        )

        output = [
            fn for fn in s3u.get_files(s3_input_bucket,
                                       os.path.join(s3_input_prefix, input_dir))
            if fn.endswith('fastq.gz')
        ]

        logger.info("number of fastq.gz files: {}".format(len(output)))

        sample_lists = defaultdict(list)

        for fn in output:
            matched = sample_re.search(os.path.basename(fn))
            if matched:
                sample_lists[matched.group(1)].append(fn)

        for sample_name in sorted(sample_lists)[args.partition_id::args.num_partitions]:
            if (sample_name, args.taxon) in output_files:
                logger.info("{} already exists, skipping".format(sample_name))
                continue

            logger.info("Adding sample {} to queue".format(sample_name))
            star_queue.put((input_dir, sample_name, sorted(sample_lists[sample_name])))

    for i in range(n_star_procs):
        star_queue.put('STOP')

    for p in star_procs:
        p.join()

    for i in range(args.htseq_proc):
        htseq_queue.put('STOP')

    for p in htseq_procs:
        p.join()

    log_queue.put('STOP')
    log_thread.join()

    # Remove Genome from Memory
    command = [STAR, '--genomeDir', genome_dir, '--genomeLoad', 'Remove']
    ut_log.log_command(logger, command, shell=True)

    logger.info('Job completed')
def run_htseq(htseq_queue, log_queue, s3_input_path, s3_output_path, taxon, sjdb_gtf):
    s3c = boto3.client('s3')

    for input_dir, sample_name, dest_dir in iter(htseq_queue.get, 'STOP'):
        # run htseq-count on the name-sorted BAM; the '>' redirect assumes the
        # joined command is executed through a shell (shell=True)
        command = [HTSEQ,
                   '-r', 'name', '-s', 'no', '-f', 'bam',
                   '-m', 'intersection-nonempty',
                   os.path.join(dest_dir, 'results', 'Pass1',
                                'Aligned.out.sorted-byname.bam'),
                   sjdb_gtf, '>', 'htseq-count.txt']
        failed = ut_log.log_command_to_queue(
            log_queue, command, shell=True, cwd=os.path.join(dest_dir, 'results')
        )
        if failed:
            command = ['rm', '-rf', dest_dir]
            ut_log.log_command_to_queue(log_queue, command, shell=True)
            continue

        os.remove(os.path.join(dest_dir, 'results', 'Pass1',
                               'Aligned.out.sorted-byname.bam'))

        # compress the results dir and move it to s3
        command = ['tar', '-cvzf',
                   '{}.{}.tgz'.format(sample_name, taxon),
                   'results']
        ut_log.log_command_to_queue(log_queue, command, shell=True, cwd=dest_dir)

        # don't overwrite the s3_output_path argument inside the loop, or every
        # later sample would inherit the first sample's default output location
        if s3_output_path is None:
            sample_output_path = os.path.join(s3_input_path, input_dir, 'results')
        else:
            sample_output_path = s3_output_path

        s3_output_bucket, s3_output_prefix = s3u.s3_bucket_and_key(sample_output_path)

        src_files = [
            os.path.join(dest_dir, '{}.{}.tgz'.format(sample_name, taxon)),
            os.path.join(dest_dir, 'results', 'htseq-count.txt'),
            os.path.join(dest_dir, 'results', 'Pass1', 'Log.final.out'),
            os.path.join(dest_dir, 'results', 'Pass1', 'SJ.out.tab'),
            os.path.join(dest_dir, 'results', 'Pass1',
                         'Aligned.out.sorted.bam'),
            os.path.join(dest_dir, 'results', 'Pass1',
                         'Aligned.out.sorted.bam.bai')
        ]

        dest_names = [
            '{}.{}.tgz'.format(sample_name, taxon),
            '{}.{}.htseq-count.txt'.format(sample_name, taxon),
            '{}.{}.log.final.out'.format(sample_name, taxon),
            '{}.{}.SJ.out.tab'.format(sample_name, taxon),
            '{}.{}.Aligned.out.sorted.bam'.format(sample_name, taxon),
            '{}.{}.Aligned.out.sorted.bam.bai'.format(sample_name, taxon)
        ]

        for src_file, dest_name in zip(src_files, dest_names):
            log_queue.put(('Uploading {}'.format(dest_name), logging.INFO))
            s3c.upload_file(Filename=src_file, Bucket=s3_output_bucket,
                            Key=os.path.join(s3_output_prefix, dest_name))

        # rm all the files
        command = ['rm', '-rf', dest_dir]
        ut_log.log_command_to_queue(log_queue, command, shell=True)
def main(logger):
    parser = get_parser()

    args = parser.parse_args()

    if os.environ.get("AWS_BATCH_JOB_ID"):
        root_dir = os.path.join("/mnt", os.environ["AWS_BATCH_JOB_ID"])
    else:
        root_dir = "/mnt"

    run_dir = os.path.join(root_dir, "data")
    os.makedirs(run_dir)

    os.mkdir(os.path.join(run_dir, "reference"))
    os.mkdir(os.path.join(run_dir, "input"))

    if args.taxon == "homo":
        gtf_file = "HG38-PLUS.gtf"
        mask_file = "hg38_rmsk.gtf"
    elif args.taxon == "mus":
        gtf_file = "MM10-PLUS.gtf"
        mask_file = "mm10_rmsk.gtf"
    else:
        raise ValueError("Invalid taxon {}".format(args.taxon))

    s3_input_bucket, s3_input_prefix = s3u.s3_bucket_and_key(
        args.s3_input_path)

    logger.info(
        f"""Run Info: partition {args.partition_id} out of {args.num_partitions}
                     gtf_file:\t{gtf_file}
                    mask_file:\t{mask_file}
                        taxon:\t{args.taxon}
                s3_input_path:\t{args.s3_input_path}
                   input_dirs:\t{', '.join(args.input_dirs)}""")

    gtf_path = os.path.join(run_dir, "reference", gtf_file)
    mask_path = os.path.join(run_dir, "reference", mask_file)
    s3u.download_files(
        [f"velocyto/{gtf_file}", f"velocyto/{mask_file}"],
        [gtf_path, mask_path],
        b="czbiohub-reference",
        n_proc=2,
    )
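
    # the GTF and repeat-mask annotations for velocyto are fetched in parallel
    # (n_proc=2) from the czbiohub-reference bucket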

    sample_re = re.compile(
        rf"([^/]+)\.{re.escape(args.taxon)}\.Aligned\.out\.sorted\.bam$")
    plate_set = set(args.plates)

    s3_output_bucket, s3_output_prefix = s3u.s3_bucket_and_key(
        args.s3_output_path)

    for input_dir in args.input_dirs:
        logger.info("Running partition {} of {} for {}".format(
            args.partition_id, args.num_partitions, input_dir))

        # Check the output folder for existing runs
        if not args.force_redo:
            output = s3u.prefix_gen(
                s3_output_bucket,
                s3_output_prefix,
                lambda r: (r["LastModified"], r["Key"]),
            )
        else:
            output = []

        output_files = {
            os.path.basename(fn).split(".")[0]
            for dt, fn in output if fn.endswith(".loom") and dt > CURR_MIN_VER
        }

        sample_files = [
            fn for fn in s3u.get_files(
                s3_input_bucket, os.path.join(s3_input_prefix, input_dir))
            if fn.endswith(f"{args.taxon}.Aligned.out.sorted.bam")
        ]

        plate_samples = []

        for fn in sample_files:
            matched = sample_re.search(os.path.basename(fn))
            if matched and matched.group(1) not in output_files:
                if len(plate_set) == 0 or matched.group(1).split(
                        "_")[1] in plate_set:
                    plate_samples.append(fn)

        logger.info(f"number of bam files: {len(plate_samples)}")

        for sample_name in sorted(
                plate_samples)[args.partition_id::args.num_partitions]:
            run_sample(
                sample_name,
                mask_path,
                gtf_path,
                s3_input_bucket,
                s3_output_bucket,
                os.path.join(s3_output_prefix, input_dir),
                run_dir,
                logger,
            )
            time.sleep(30)

    logger.info("Job completed")