def upload_data(job, master_ip, inputs, hdfs_name, upload_name, spark_on_toil):
    """
    Transfer the file hdfs_name to upload_name, then remove hdfs_name.

    The transfer is delegated to call_conductor; the original description
    of this step is "upload from HDFS to S3".

    :param job: Forwarded to call_conductor.
    :param master_ip: Forwarded to call_conductor and the file helpers.
    :param inputs: Object whose ``memory`` attribute is handed to
        call_conductor.
    :param hdfs_name: Path of the file to upload.
    :param upload_name: Destination of the upload.
    :param spark_on_toil: Forwarded to truncate_file and remove_file.
    """
    # When mock mode is active the file is truncated before being uploaded.
    if mock_mode():
        truncate_file(master_ip, hdfs_name, spark_on_toil)

    endpoints = (hdfs_name, upload_name)
    log.info("Uploading output BAM %s to %s.", *endpoints)
    call_conductor(job, master_ip, *endpoints, memory=inputs.memory)

    # The source copy is removed once the transfer call has returned.
    remove_file(master_ip, hdfs_name, spark_on_toil)
def download_data(job, master_ip, inputs, known_snps, bam, hdfs_snps, hdfs_bam):
    """
    Downloads the input data files (described upstream as coming from S3).

    Two transfers are run in order through call_conductor: the known sites
    file first, then the input BAM.

    :param job: Forwarded to call_conductor.
    :param master_ip: Forwarded to call_conductor.
    :param inputs: Object whose ``memory`` attribute is handed to
        call_conductor.
    :param known_snps: Source location of the known sites file.
    :param bam: Source location of the input BAM.
    :param hdfs_snps: Destination for the known sites file.
    :param hdfs_bam: Destination for the input BAM.

    :type master_ip: MasterAddress
    """
    transfers = (
        ("Downloading known sites file %s to %s.", known_snps, hdfs_snps),
        ("Downloading input BAM %s to %s.", bam, hdfs_bam),
    )

    for message, source, destination in transfers:
        log.info(message, source, destination)
        call_conductor(job, master_ip, source, destination,
                       memory=inputs.memory)
def download_data(job, master_ip, inputs, known_snps, bam, hdfs_snps,
                  hdfs_bam):
    """
    Downloads the input data files (described upstream as coming from S3).

    The known sites file is transferred first, followed by the input BAM;
    each transfer is logged and then delegated to call_conductor.

    :param job: Forwarded to call_conductor.
    :param master_ip: Forwarded to call_conductor.
    :param inputs: Object whose ``memory`` attribute is handed to
        call_conductor.
    :param known_snps: Source location of the known sites file.
    :param bam: Source location of the input BAM.
    :param hdfs_snps: Destination for the known sites file.
    :param hdfs_bam: Destination for the input BAM.

    :type master_ip: MasterAddress
    """

    def fetch(message, source, destination):
        # Log the transfer, then hand it to the conductor.
        log.info(message, source, destination)
        call_conductor(job, master_ip, source, destination,
                       memory=inputs.memory)

    fetch("Downloading known sites file %s to %s.", known_snps, hdfs_snps)
    fetch("Downloading input BAM %s to %s.", bam, hdfs_bam)
Example #4
0
def upload_data(job, master_ip, inputs, hdfs_name, upload_name, spark_on_toil):
    """
    Transfer the file hdfs_name to upload_name, then remove hdfs_name.

    The transfer is delegated to call_conductor using the
    'fnothaft/conductor' container; the original description of this step
    is "upload from HDFS to S3".

    :param job: Forwarded to call_conductor.
    :param master_ip: Forwarded to call_conductor and remove_file.
    :param inputs: Object whose ``memory`` attribute is handed to
        call_conductor.
    :param hdfs_name: Path of the file to upload.
    :param upload_name: Destination of the upload.
    :param spark_on_toil: Forwarded to remove_file.
    """
    _log.info("Uploading output BAM %s to %s.", hdfs_name, upload_name)

    conductor_options = {
        'memory': inputs.memory,
        'container': 'fnothaft/conductor',
    }
    call_conductor(job, master_ip, hdfs_name, upload_name, **conductor_options)

    # The source copy is removed once the transfer call has returned.
    remove_file(master_ip, hdfs_name, spark_on_toil)
def upload_data(job, master_ip, inputs, hdfs_name, upload_name, spark_on_toil):
    """
    Transfer the file hdfs_name to upload_name, then remove hdfs_name.

    The transfer is delegated to call_conductor; the original description
    of this step is "upload from HDFS to S3".

    :param job: Forwarded to call_conductor.
    :param master_ip: Forwarded to call_conductor and the file helpers.
    :param inputs: Object whose ``memory`` attribute is handed to
        call_conductor.
    :param hdfs_name: Path of the file to upload.
    :param upload_name: Destination of the upload.
    :param spark_on_toil: Forwarded to truncate_file and remove_file.
    """
    # Mock runs truncate the file before it is transferred.
    mocking = mock_mode()
    if mocking:
        truncate_file(master_ip, hdfs_name, spark_on_toil)

    log.info("Uploading output BAM %s to %s.", hdfs_name, upload_name)
    call_conductor(job, master_ip, hdfs_name, upload_name, memory=inputs.memory)

    # Clean up the source copy after the transfer call has returned.
    remove_file(master_ip, hdfs_name, spark_on_toil)
Example #6
0
def download_count_upload(job,
                          master_ip,
                          input_file,
                          output_file,
                          kmer_length,
                          spark_conf,
                          memory,
                          sudo):
    '''
    Runs k-mer counting.

    1. If the input file is located in S3, the file is copied into HDFS.
    2. If the input file is a BAM/SAM/FASTQ file (.bam, .sam, .fq, .fastq),
       it is converted into ADAM format first.
    3. The k-mers are counted and saved as text.
    4. If the output path is not under the HDFS directory, the counts are
       written to HDFS and then copied to the output path.

    :param job: Toil job (not used directly in this function).
    :param master_ip: Address of the Spark/HDFS master, or None to rely on
        the default filesystem.
    :param input_file: URL/path to input file to count k-mers on
    :param output_file: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, memory should
        not be set.
    :param memory: Amount of memory to provided to Spark workers. Must be set
        if spark_conf is not set.
    :param sudo: Whether or not to run Spark containers with sudo (not used
        directly in this function).

    :type job: toil.Job
    :type input_file: string
    :type output_file: string
    :type kmer_length: int or string
    :type spark_conf: list of string or None
    :type memory: int or None
    :type sudo: boolean
    '''

    if master_ip is not None:
        hdfs_dir = "hdfs://{0}:{1}/".format(master_ip, HDFS_MASTER_PORT)
    else:
        _log.warning('Master IP is not set. If default filesystem is not set, jobs may fail.')
        hdfs_dir = ""

    # if the file isn't already in hdfs, copy it in
    if input_file.startswith("s3://"):

        # stage the file under the hdfs dir, keeping the s3 file name
        hdfs_input_file = hdfs_dir + input_file.split("/")[-1]

        # run the download
        _log.info("Downloading input file %s to %s.", input_file, hdfs_input_file)
        call_conductor(master_ip, input_file, hdfs_input_file,
                       memory=memory, override_parameters=spark_conf)

    else:
        if not input_file.startswith("hdfs://"):
            _log.warning("If not in S3, input file (%s) expected to be in HDFS (%s).",
                         input_file, hdfs_dir)

        # The file is used where it is. Previously the bare hdfs directory
        # was used here, so the actual input path was silently dropped.
        hdfs_input_file = input_file

    # where are we writing the output to? is it going to a location in hdfs or not?
    # NOTE(review): when master_ip is None, hdfs_dir is "" and startswith("")
    # is always true, so no upload ever runs in that configuration -- confirm
    # this is intended.
    run_upload = True
    hdfs_output_file = hdfs_dir + "kmer_output.txt"
    if output_file.startswith(hdfs_dir):
        run_upload = False
        hdfs_output_file = output_file

    # do we need to convert to adam?
    if hdfs_input_file.endswith(('.bam', '.sam', '.fq', '.fastq')):

        hdfs_tmp_file = hdfs_input_file

        # change the file extension to adam; list.append() returns None, so
        # the new name is built by replacing the last extension directly
        hdfs_input_file = hdfs_input_file.rsplit('.', 1)[0] + '.adam'

        # convert the file
        _log.info('Converting %s into ADAM format at %s.', hdfs_tmp_file, hdfs_input_file)
        call_adam(master_ip,
                  ['transform',
                   hdfs_tmp_file, hdfs_input_file],
                  memory=memory, override_parameters=spark_conf)

    # run k-mer counting
    # %s rather than %d: kmer_length is documented as int or string
    _log.info('Counting %s-mers in %s, and saving to %s.',
              kmer_length, hdfs_input_file, hdfs_output_file)
    call_adam(master_ip,
              ['count_kmers',
               hdfs_input_file, hdfs_output_file,
               str(kmer_length)],
              memory=memory, override_parameters=spark_conf)

    # do we need to upload the file back? if so, run upload
    if run_upload:
        _log.info("Uploading output file %s to %s.", hdfs_output_file, output_file)
        call_conductor(master_ip, hdfs_output_file, output_file,
                       memory=memory, override_parameters=spark_conf)
Example #7
0
def download_run_and_upload(job, master_ip, inputs, spark_on_toil):
    """
    Monolithic job that calls data download, conversion, call, upload.

    Steps, in order:

    1. Fetch the sample (call_conductor when not running locally, otherwise
       copy_files into a local temp dir).
    2. Convert the BAM to ADAM with ``transform``.
    3. Call variants with avocado's ``biallelicGenotyper``.
    4. Convert the genotypes to VCF with ``adam2vcf``.
    5. Upload the result (or move it locally) and clean up.

    The working sub-directory is removed whether or not the steps succeed.

    :param job: Toil job; used for the local temp dir and passed to the
        tool wrappers.
    :param master_ip: Address of the master; wrapped in MasterAddress.
    :param inputs: Options object. Reads ``sample``, ``run_local``,
        ``memory``, ``output_dir`` and ``suffix``; sets ``local_dir``.
    :param spark_on_toil: Forwarded to remove_file.
    """
    master_ip = MasterAddress(master_ip)

    # Base name of the sample URL/path, and that name minus its extension.
    bam_name = inputs.sample.split('://')[-1].split('/')[-1]
    sample_name = os.path.splitext(bam_name)[0]

    hdfs_subdir = sample_name + "-dir"

    if inputs.run_local:
        inputs.local_dir = job.fileStore.getLocalTempDir()
        hdfs_dir = inputs.local_dir
    else:
        inputs.local_dir = None
        hdfs_dir = "hdfs://{0}:{1}/{2}".format(master_ip, HDFS_MASTER_PORT,
                                               hdfs_subdir)

    try:
        hdfs_prefix = hdfs_dir + "/" + sample_name
        hdfs_bam = hdfs_dir + "/" + bam_name

        if not inputs.run_local:
            _log.info("Downloading input BAM %s to %s.", bam_name, hdfs_bam)
            call_conductor(job,
                           master_ip,
                           inputs.sample,
                           hdfs_bam,
                           container='fnothaft/conductor',
                           memory=inputs.memory)
        else:
            copy_files([inputs.sample], inputs.local_dir)

        adam_input = hdfs_prefix + ".adam"
        _log.info("Converting input BAM to ADAM.")
        call_adam(job,
                  master_ip, ["transform", hdfs_bam, adam_input],
                  memory=inputs.memory,
                  run_local=inputs.run_local,
                  container='fnothaft/adam')

        avocado_output = hdfs_prefix + ".gt.adam"
        _log.info("Calling variants with avocado.")
        call_avocado(
            job,
            master_ip,
            ["biallelicGenotyper", "-is_not_grc", adam_input, avocado_output],
            memory=inputs.memory,
            container='fnothaft/avocado')

        output_vcf = hdfs_prefix + ".vcf"
        _log.info("Converting output ADAM Genotypes to VCF.")
        call_adam(job,
                  master_ip, [
                      "adam2vcf", avocado_output, output_vcf, "-single",
                      "-sort_on_save", "-stringency", "LENIENT"
                  ],
                  memory=inputs.memory,
                  run_local=inputs.run_local,
                  container='fnothaft/adam')

        # NOTE(review): the uploaded artifact is a VCF but the destination
        # name ends in ".bam" -- looks like a copy/paste leftover; confirm
        # before renaming since downstream consumers may rely on it.
        out_file = inputs.output_dir + "/" + sample_name + inputs.suffix + ".bam"

        if not inputs.run_local:
            _log.info("Uploading output VCF %s to %s.", output_vcf, out_file)
            call_conductor(job,
                           master_ip,
                           output_vcf,
                           out_file,
                           memory=inputs.memory,
                           container='fnothaft/conductor')
            remove_file(master_ip, output_vcf, spark_on_toil)
        else:
            # NOTE(review): no step above writes "<sample>.processed.bam";
            # confirm this is the intended local output file.
            local_adam_output = "%s/%s.processed.bam" % (inputs.local_dir,
                                                         sample_name)
            move_files([local_adam_output], inputs.output_dir)
    finally:
        # Runs exactly once on both success and failure; any in-flight
        # exception keeps propagating after the cleanup.
        remove_file(master_ip, hdfs_subdir, spark_on_toil)
Example #8
0
def download_count_upload(job, master_ip, input_file, output_file, kmer_length,
                          spark_conf, memory, sudo):
    '''
    Runs k-mer counting.

    1. If the input file is located in S3, the file is copied into HDFS.
    2. If the input file is a BAM/SAM/FASTQ file (.bam, .sam, .fq, .fastq),
       it is converted into ADAM format first.
    3. The k-mers are counted and saved as text.
    4. If the output path is not under the HDFS directory, the counts are
       written to HDFS and then copied to the output path.

    :param job: Toil job; passed to the conductor and ADAM wrappers.
    :param master_ip: Address of the Spark/HDFS master, or None to rely on
        the default filesystem.
    :param input_file: URL/path to input file to count k-mers on
    :param output_file: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, memory should
        not be set.
    :param memory: Amount of memory to provided to Spark workers. Must be set
        if spark_conf is not set.
    :param sudo: Whether or not to run Spark containers with sudo (not used
        directly in this function).

    :type job: toil.Job
    :type input_file: string
    :type output_file: string
    :type kmer_length: int or string
    :type spark_conf: list of string or None
    :type memory: int or None
    :type sudo: boolean
    '''

    if master_ip is not None:
        hdfs_dir = "hdfs://{0}:{1}/".format(master_ip, HDFS_MASTER_PORT)
    else:
        _log.warning(
            'Master IP is not set. If default filesystem is not set, jobs may fail.'
        )
        hdfs_dir = ""

    # if the file isn't already in hdfs, copy it in
    if input_file.startswith("s3://"):

        # stage the file under the hdfs dir, keeping the s3 file name
        hdfs_input_file = hdfs_dir + input_file.split("/")[-1]

        # run the download
        _log.info("Downloading input file %s to %s.", input_file,
                  hdfs_input_file)
        call_conductor(job,
                       master_ip,
                       input_file,
                       hdfs_input_file,
                       memory=memory,
                       override_parameters=spark_conf)

    else:
        if not input_file.startswith("hdfs://"):
            _log.warning(
                "If not in S3, input file (%s) expected to be in HDFS (%s).",
                input_file, hdfs_dir)

        # The file is used where it is. Previously the bare hdfs directory
        # was used here, so the actual input path was silently dropped.
        hdfs_input_file = input_file

    # where are we writing the output to? is it going to a location in hdfs or not?
    # NOTE(review): when master_ip is None, hdfs_dir is "" and startswith("")
    # is always true, so no upload ever runs in that configuration -- confirm
    # this is intended.
    run_upload = True
    hdfs_output_file = hdfs_dir + "kmer_output.txt"
    if output_file.startswith(hdfs_dir):
        run_upload = False
        hdfs_output_file = output_file

    # do we need to convert to adam?
    if hdfs_input_file.endswith(('.bam', '.sam', '.fq', '.fastq')):

        hdfs_tmp_file = hdfs_input_file

        # change the file extension to adam; list.append() returns None, so
        # the new name is built by replacing the last extension directly
        hdfs_input_file = hdfs_input_file.rsplit('.', 1)[0] + '.adam'

        # convert the file
        _log.info('Converting %s into ADAM format at %s.', hdfs_tmp_file,
                  hdfs_input_file)
        call_adam(job,
                  master_ip, ['transform', hdfs_tmp_file, hdfs_input_file],
                  memory=memory,
                  override_parameters=spark_conf)

    # run k-mer counting
    # %s rather than %d: kmer_length is documented as int or string
    _log.info('Counting %s-mers in %s, and saving to %s.', kmer_length,
              hdfs_input_file, hdfs_output_file)
    call_adam(
        job,
        master_ip,
        ['count_kmers', hdfs_input_file, hdfs_output_file,
         str(kmer_length)],
        memory=memory,
        override_parameters=spark_conf)

    # do we need to upload the file back? if so, run upload
    if run_upload:
        _log.info("Uploading output file %s to %s.", hdfs_output_file,
                  output_file)
        call_conductor(job,
                       master_ip,
                       hdfs_output_file,
                       output_file,
                       memory=memory,
                       override_parameters=spark_conf)