def download_fastq_files(fastq1_s3_path, fastq2_s3_path, working_dir):
    """Download the fastq files.

    :param fastq1_s3_path: S3 path containing FASTQ with read1
    :param fastq2_s3_path: S3 path containing FASTQ with read2
    :param working_dir: working directory
    :return: local path to the folder containing the fastq
    """

    fastq_folder = os.path.join(working_dir, 'fastq')

    try:
        os.mkdir(fastq_folder)
    except OSError:
        # The folder may already exist from a previous run
        pass

    local_fastq1_path = download_file(fastq1_s3_path, fastq_folder)
    local_fastq2_path = download_file(fastq2_s3_path, fastq_folder)

    # Isaac requires the fastqs to be symlinked as lane1_read1.fastq.gz and lane1_read2.fastq.gz
    os.symlink(local_fastq1_path,
               os.path.join(fastq_folder, 'lane1_read1.fastq.gz'))
    os.symlink(local_fastq2_path,
               os.path.join(fastq_folder, 'lane1_read2.fastq.gz'))

    return fastq_folder
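
# The S3 helpers used throughout these examples (download_file, upload_file,
# generate_working_dir, ...) are not part of this listing. Below is a minimal
# sketch of what download_file might look like, assuming plain boto3 and that
# the local filename mirrors the S3 key; it is an illustration, not the
# original implementation.
import os
from urllib.parse import urlparse

import boto3


def download_file_sketch(s3_path, local_dir):
    """Download s3://bucket/key into local_dir and return the local path."""
    parsed = urlparse(s3_path)
    bucket, key = parsed.netloc, parsed.path.lstrip('/')
    local_path = os.path.join(local_dir, os.path.basename(key))
    boto3.client('s3').download_file(bucket, key, local_path)
    return local_path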
Example #2
def main():
    argparser = ArgumentParser()

    file_path_group = argparser.add_argument_group(title='File paths')
    file_path_group.add_argument('--bam_s3_path',
                                 type=str,
                                 help='BAM s3 path',
                                 required=True)
    file_path_group.add_argument('--bai_s3_path',
                                 type=str,
                                 help='BAM Index s3 path',
                                 required=True)
    file_path_group.add_argument('--vcf_s3_path',
                                 type=str,
                                 help='VCF s3 path',
                                 required=True)
    file_path_group.add_argument('--reference_s3_path',
                                 type=str,
                                 help='Reference file s3 path',
                                 required=True)
    file_path_group.add_argument('--reference_index_s3_path',
                                 type=str,
                                 help='Reference file index s3 path',
                                 required=True)

    run_group = argparser.add_argument_group(title='Run command args')
    run_group.add_argument('--memory',
                           type=int,
                           help='Memory (in GB) for Strelka to use',
                           default=28)
    run_group.add_argument('--cmd_args',
                           type=str,
                           help='Additional arguments for Strelka',
                           default='')

    argparser.add_argument('--working_dir', type=str, default='/scratch')

    args = argparser.parse_args()

    working_dir = generate_working_dir(args.working_dir)

    print("Downloading bam")
    local_bam_path = download_file(args.bam_s3_path, working_dir)
    local_bai_path = download_file(args.bai_s3_path, working_dir)
    print("BAM and index donwloaded to %s and %s" %
          (local_bam_path, local_bai_path))
    print("Downloading reference")
    local_reference_path = download_file(args.reference_s3_path, working_dir)
    local_reference_index_path = download_file(args.reference_index_s3_path,
                                               working_dir)
    print("Reference downloaded to %s. Index to %s" %
          (local_reference_path, local_reference_index_path))
    print("Running Strelka")
    local_vcf_path = run_strelka(local_bam_path, local_reference_path,
                                 args.memory, args.cmd_args, working_dir)
    print("Uploading %s to %s" % (local_vcf_path, args.vcf_s3_path))
    upload_folder(args.vcf_s3_path, local_vcf_path)
    print('Cleaning up working dir')
    delete_working_dir(working_dir)
    print("Completed")
Example #3
def main():
    argparser = ArgumentParser()

    argparser.add_argument('--vcf_s3_path',
                           type=str,
                           help='VCF s3 path',
                           required=True)
    argparser.add_argument('--annotated_vcf_s3_path',
                           type=str,
                           help='Annotated vcf s3 path',
                           required=True)
    argparser.add_argument('--working_dir', type=str, default='/scratch')
    argparser.add_argument('--cmd_args',
                           type=str,
                           help='Arguments/options for snpEff',
                           default='-t')

    args = argparser.parse_args()

    working_dir = generate_working_dir(args.working_dir)

    print('Downloading vcf')
    local_vcf_path = download_file(args.vcf_s3_path, working_dir)
    print('Running snpeff')
    annotated_vcf_path = run_snpeff(local_vcf_path, args.cmd_args, working_dir)
    print('Uploading %s to %s' %
          (annotated_vcf_path, args.annotated_vcf_s3_path))
    upload_file(args.annotated_vcf_s3_path, annotated_vcf_path)
    print('Cleaning up working dir')
    delete_working_dir(working_dir)
    print('Completed')
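
# run_snpeff is not shown in this listing. A minimal sketch of the shape it
# probably takes, assuming snpEff is invoked as a jar; the jar location and
# genome database name below are placeholders, not values from the original.
import os
import subprocess


def run_snpeff_sketch(vcf_path, cmd_args, working_dir,
                      snpeff_jar='/opt/snpEff/snpEff.jar', genome='GRCh37.75'):
    annotated_vcf_path = os.path.join(working_dir, 'annotated.vcf')
    cmd = 'java -Xmx4g -jar %s %s %s %s > %s' % (
        snpeff_jar, cmd_args, genome, vcf_path, annotated_vcf_path)
    subprocess.check_call(cmd, shell=True)
    return annotated_vcf_path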
def download_required_files(*args):
    """Download each S3 path given in ``args`` and return the list of local paths."""

    fList = []
    for f in args:
        print("Downloading {}".format(f))
        downloaded_path = download_file(f, '/')
        print("file downloaded to {}".format(downloaded_path))
        fList.append(downloaded_path)

    return fList
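
# Example usage with hypothetical S3 paths: fetch a reference and its index in
# one call and unpack the returned local paths.
# ref_path, ref_index_path = download_required_files(
#     's3://my-bucket/reference/genome.fa',
#     's3://my-bucket/reference/genome.fa.fai')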
Example #5
def main():
    argparser = ArgumentParser()

    file_path_group = argparser.add_argument_group(title='File paths')
    file_path_group.add_argument('--bam_s3_path',
                                 type=str,
                                 help='BAM s3 path',
                                 required=True)
    file_path_group.add_argument('--reference_s3_path',
                                 type=str,
                                 help='Reference file s3 path',
                                 required=True)
    file_path_group.add_argument('--bam_stats_s3_path',
                                 type=str,
                                 help='S3 Path to upload stats',
                                 required=True)

    run_group = argparser.add_argument_group(title='Run command args')
    run_group.add_argument('--cmd_args',
                           type=str,
                           help='Additional arguments for samtools stats',
                           default='')

    argparser.add_argument('--working_dir', type=str, default='/scratch')

    args = argparser.parse_args()

    working_dir = generate_working_dir(args.working_dir)

    print("Downloading bam")
    local_bam_path = download_file(args.bam_s3_path, working_dir)
    print("BAM downloaded to %s" % local_bam_path)
    print("Downloading reference")
    local_reference_path = download_file(args.reference_s3_path, working_dir)
    print("Reference downloaded to %s." % local_reference_path)
    print("Running samtools stats")
    local_stats_path = run_samtools_stats(local_bam_path, local_reference_path,
                                          args.cmd_args, working_dir)
    print("Uploading %s to %s" % (local_stats_path, args.bam_stats_s3_path))
    upload_file(args.bam_stats_s3_path, local_stats_path)
    print('Cleaning up working dir')
    delete_working_dir(working_dir)
    print("Completed")
Example #6
def download_fastq_file(fastq1_s3_path, working_dir):
    """
    Downlodas the fastq files
    :param fastq1_s3_path: S3 path containing FASTQ with read1
    :param fastq2_s3_path: S3 path containing FASTQ with read2
    :param working_dir: working directory
    :return: local path to the folder containing the fastq
    """
    fastq_folder = os.path.join(working_dir, 'fastq')

    try:
        os.mkdir(fastq_folder)
    except Exception as e:
        pass

    local_fastq_path = download_file(fastq1_s3_path, fastq_folder)

    return local_fastq_path
Example #7
def download_fastq_file(fastq1_s3_path, working_dir):
    """
    Downloads the input file
    :param fastq1_s3_path: S3 path containing our FASTQ file
    :param working_dir: working directory
    :return: local path to the folder containing the fastq
    """
    fastq_folder = os.path.join(working_dir, 'fastq')

    try:
        os.mkdir(fastq_folder)
    except Exception as e:
        print('Error occured while creating the fastq download folder')
        pass

    local_fastq_path = download_file(fastq1_s3_path, fastq_folder)

    return local_fastq_path
Example #8
def main():
    args = parseArguments()
    logging.basicConfig(level=args.log_level)
    logger.info("Run cohort-matcher Docker CLI v%s", __version__)
    logger.info(args)

    working_dir = generate_working_dir(args.working_dir)

    # Download fastq files and reference files
    logger.info('Downloading bam sheets')
    set1_bamsheet = download_file(args.set1_s3_path, working_dir)
    set2_bamsheet = download_file(args.set2_s3_path, working_dir)

    # Download reference bundles
    if args.set1_reference == 'hg19' or args.set2_reference == 'hg19':
        logger.info("Downloading hg19 reference bundle")
        download_file('s3://bmsrd-ngs-repo/reference/hg19-cohort-matcher.tar.bz2', working_dir)
        logger.info("Uncompressing hg19 reference bundle")
        uncompress(os.path.join(working_dir, 'hg19-cohort-matcher.tar.bz2'), working_dir)
    if args.set1_reference == 'GRCh37' or args.set2_reference == 'GRCh37':
        logger.info("Downloading GRCh37 reference bundle")
        download_file('s3://bmsrd-ngs-repo/reference/GRCh37-cohort-matcher.tar.bz2', working_dir)
        logger.info("Uncompressing GRCh37 reference bundle")
        uncompress(os.path.join(working_dir, 'GRCh37-cohort-matcher.tar.bz2'), working_dir)

    # Run cohort-matcher
    logger.info('Running cohort-matcher')
    if args.max_jobs is None:
        max_jobs = multiprocessing.cpu_count()
    else:
        max_jobs = args.max_jobs
    output_folder_path = run_cohort_matcher(args.log_level, set1_bamsheet, set2_bamsheet,
                                            args.set1_reference, args.set2_reference,
                                            working_dir, args.output_prefix, max_jobs)
    logger.info('Uploading results to %s', args.s3_output_folder_path)
    #upload_bam(args.bam_s3_folder_path, bam_folder_path)
    logger.info('Cleaning up working dir')
    delete_working_dir(working_dir)
    logger.info('Completed')
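
# The uncompress helper used above is not included in this listing. A minimal
# sketch using the standard-library tarfile module, assuming the bundles are
# .tar.bz2 archives as the filenames suggest:
import tarfile


def uncompress_sketch(archive_path, dest_dir):
    with tarfile.open(archive_path, 'r:bz2') as tar:
        tar.extractall(path=dest_dir)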
Example #9
def run_vcf2tiledb_no_s3(workdir, idx, loader_path, callset_path, vid_path, contig):
    """
    Fallback that downloads each file: tabix reads the requested interval
    from each remote VCF and the slice is saved as a local bgzipped file.
    """
    print("Performing no-s3-callset vcf2tiledb")
    exportSession()

    with open(vid_path) as vid_file:
        hg = json.load(vid_file)

    offset = hg['contigs'][contig]['tiledb_column_offset']

    # edit loader file to point to correct callset filename
    srchStr = subprocess.check_output('grep -Po "/callset.+.json" %s' % (loader_path), shell=True)
    subprocess.check_call('sed -i "s|%s|%s|" %s' % (srchStr.rstrip(), callset_path, loader_path), shell=True)

    # extract start/end for this partition
    with open(loader_path) as loader_file:
        ldr = json.load(loader_file)

    start = ldr['column_partitions'][idx]['begin'] - offset - 1
    end   = ldr['column_partitions'][idx]['end']   - offset + 1
    del ldr

    if start < 0: start = 0

    pos = "%s:%s-%s" % (contig, start, end) # tabix region

    # Download VCF slices to workdir
    # assumes callset has S3 paths for filenames
    with open(callset_path) as callset_fp:
        fList = json.load(callset_fp)
        fListNew = fList

    print("Downloading slices")
    for SM in fList['callsets']:
        #download_file(fList['callsets'][SM]['filename'], workdir)
        s3path = fList['callsets'][SM]['filename']
        fName = os.path.basename(s3path)
        gzFile = '%s/%s' % (workdir, fName)

        retries = 0
        while True:
            if retries > 0:
                print("Retrying download for {}".format(fName))

            if retries > 3:
                print('Downloading entire file')
                download_file(s3path, workdir)
                break

            cmd = "/bin/bash -o pipefail -c 'timeout 30 tabix -h %s %s | bgzip > %s'" % (s3path, pos, gzFile)

            try:
                subprocess.check_call(cmd, shell=True)

                if not os.path.exists(gzFile) or os.stat(gzFile).st_size < 29:
                    print("Download file size error")
                    retries += 1
                else:
                    break

            except subprocess.CalledProcessError as e:
                print("Caught Exception: CalledProcessError")
                status = e.returncode
                print("status:={}".format(status))
                retries += 1
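
# Example invocation with hypothetical paths: slice column partition 0 of
# contig '1' from the callsets listed in callset.json into /scratch/vcf_slices.
# run_vcf2tiledb_no_s3('/scratch/vcf_slices', 0, '/scratch/loader.json',
#                      '/scratch/callset.json', '/scratch/vid.json', '1')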
Example #10
def main():
    argparser = ArgumentParser()

    file_path_group = argparser.add_argument_group(title='File paths')
    file_path_group.add_argument('--vcf_s3_path',
                                 type=str,
                                 help='VCF s3 path',
                                 required=True)
    file_path_group.add_argument('--bam_s3_path',
                                 type=str,
                                 help='BAM s3 path',
                                 required=True)
    file_path_group.add_argument('--bai_s3_path',
                                 type=str,
                                 help='BAI s3 path',
                                 required=True)
    file_path_group.add_argument('--results_s3_path',
                                 type=str,
                                 help='S3 Path to upload stats',
                                 required=True)

    run_group = argparser.add_argument_group(title='Run command args')
    run_group.add_argument('--cmd_args',
                           type=str,
                           help='Additional Arguments',
                           default=None,
                           nargs='*',
                           action='store',
                           dest='opt_list')

    #argparser.add_argument('--working_dir', type=str, default='/scratch')

    args = argparser.parse_args()

    total_size = 0
    for obj in [args.vcf_s3_path, args.bam_s3_path, args.bai_s3_path]:
        total_size += get_size(obj)

    # Add extra headroom for filesystem formatting overhead
    total_size += 2e9
    total_size = int(total_size)

    print("Total Size := {0}".format(total_size))

    # Declare expected disk usage; this triggers the host's EBS provisioning script (ecs-ebs-manager)
    with open("/TOTAL_SIZE", "w") as text_file:
        text_file.write("{0}".format(total_size))

    print("Waiting EBS")

    # Wait for EBS to appear
    while not os.path.isdir('/scratch'):
        time.sleep(5)

    # Wait for mount verification
    while not os.path.ismount('/scratch'):
        time.sleep(1)

    working_dir = generate_working_dir('/scratch')

    print("Downloading vcf")
    local_vcf_path = download_file(args.vcf_s3_path, working_dir)
    print("VCF downloaded to %s" % local_vcf_path)

    print("Downloading bam")
    local_bam_path = download_file(args.bam_s3_path, working_dir)
    print("BAM downloaded to %s" % local_bam_path)

    print("Downloading bam index")
    local_bam_index_path = download_file(args.bai_s3_path, working_dir)
    print("BAM index downloaded to %s" % local_bam_index_path)

    print("Running verifybamid")
    local_stats_path = run_verifybamid_basic(local_vcf_path, local_bam_path,
                                             local_bam_index_path,
                                             args.opt_list, working_dir)

    for ext in ['.selfSM', '.bestSM', '.depthSM', '.log']:
        if os.path.exists(local_stats_path + ext):
            print("Uploading %s to %s" %
                  (local_stats_path + ext, args.results_s3_path + ext))
            upload_file(args.results_s3_path + ext, local_stats_path + ext)

    print('Cleaning up working dir')
    delete_working_dir(working_dir)

    print("Completed")