def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCAN_TwoColor")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "AGILENT_TWOCOLOR"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample,
    # sync it to S3 and save it.
    try:
        computed_file = ComputedFile()
        computed_file.absolute_file_path = job_context["output_file_path"]
        computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.save()
        job_context["computed_files"].append(computed_file)
    except Exception:
        logger.exception(
            "Exception caught while moving file %s to S3",
            computed_file.filename,
            processor_job=job_context["job_id"],
        )
        failure_reason = "Exception caught while moving file to S3"
        job_context["job"].failure_reason = failure_reason
        job_context["success"] = False
        return job_context

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.info("Created %s", result)
    job_context["success"] = True

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context['target_file']
    computed_file.filename = job_context['target_file'].split('/')[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples']['ALL'][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context['samples']['ALL'][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"]
    }
    annotation.save()

    # TODO: upload this to a public read bucket.
    # https://github.com/AlexsLemonade/refinebio/issues/586
    job_context['result'] = result
    job_context['computed_files'] = [computed_file]
    job_context['annotation'] = annotation
    job_context['success'] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    if not job_context["create_results"]:
        return job_context

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["target_file"]
    computed_file.filename = job_context["target_file"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"]["ALL"][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context["samples"]["ALL"][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"],
    }
    annotation.save()

    job_context["result"] = result
    job_context["computed_files"] = [computed_file]
    job_context["annotation"] = annotation
    job_context["success"] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append('SCAN.UPC::SCANfast')
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "AFFYMETRIX_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Create a ComputedFile for the sample
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()
    job_context['computed_files'].append(computed_file)

    for sample in job_context['samples']:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.debug("Created %s", result, processor_job=job_context["job_id"])
    job_context["success"] = True

    return job_context
def _create_result(job_context: Dict) -> Dict:
    """ Create the actual Result object"""
    # This is a NO-OP, but we make a ComputationalResult regardless.
    result = ComputationalResult()
    result.commands.append(job_context["script_name"])
    result.is_ccdl = True

    try:
        processor_key = "SUBMITTER_PROCESSED"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the computed file,
    # sync it to S3 and save it.
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = job_context["output_file_path"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()

    # utils.end_job will sync this to S3 for us.
    job_context["computed_files"] = [computed_file]

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.debug("Created %s", result)
    job_context["success"] = True

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    # Write the compendia dataframe to a file, overwriting the previous smash
    job_context['merged_qn'].to_csv(job_context['smash_outfile'], sep='\t', encoding='utf-8')

    compendia_tsv_computed_file = ComputedFile()
    compendia_tsv_computed_file.absolute_file_path = job_context['smash_outfile']
    compendia_tsv_computed_file.filename = job_context['smash_outfile'].split('/')[-1]
    compendia_tsv_computed_file.calculate_sha1()
    compendia_tsv_computed_file.calculate_size()
    compendia_tsv_computed_file.is_smashable = False
    compendia_tsv_computed_file.is_qn_target = False
    compendia_tsv_computed_file.result = result
    compendia_tsv_computed_file.save()

    organism_key = list(job_context['samples'].keys())[0]

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples'][organism_key][0].organism_id,
        "organism_name": job_context['samples'][organism_key][0].organism.name,
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context['experiments']]
    }
    annotation.save()

    # Save the related metadata file
    metadata_computed_file = ComputedFile()
    metadata_computed_file.absolute_file_path = job_context['metadata_tsv_paths'][0]
    metadata_computed_file.filename = job_context['metadata_tsv_paths'][0].split('/')[-1]
    metadata_computed_file.calculate_sha1()
    metadata_computed_file.calculate_size()
    metadata_computed_file.is_smashable = False
    metadata_computed_file.is_qn_target = False
    metadata_computed_file.result = result
    metadata_computed_file.save()

    # Create the resulting archive
    final_zip_base = "/home/user/data_store/smashed/" + str(job_context["dataset"].pk) + "_compendia"
    archive_path = shutil.make_archive(final_zip_base, 'zip', job_context["output_dir"])

    # Determine the next compendia version for this organism.
    organism = job_context['samples'][organism_key][0].organism
    try:
        # order_by('-compendia_version') sorts descending, so index 0 is the
        # latest version. (Negative indexing is not supported on querysets.)
        last_compendia = ComputedFile.objects.filter(
            is_compendia=True, compendia_organism=organism).order_by('-compendia_version')[0]
        compendia_version = last_compendia.compendia_version + 1
    except Exception:
        # This is the first compendia for this Organism
        compendia_version = 1

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split('/')[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.compendia_organism = organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    logger.info("Compendia created!",
                archive_path=archive_path,
                organism_name=organism.name)

    # Upload the result to S3
    key = organism.name + "_" + str(compendia_version) + "_" + str(int(time.time())) + ".zip"
    archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    job_context['result'] = result
    job_context['computed_files'] = [compendia_tsv_computed_file,
                                     metadata_computed_file,
                                     archive_computed_file]
    job_context['success'] = True
    return job_context
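
# --- Illustrative sketch (not part of the pipeline) ---------------------
# The version bump above, isolated into a hypothetical helper: take the
# highest existing compendia_version for the organism and add one, defaulting
# to 1 for the first compendium. Using .first() on a descending order_by
# means an empty queryset yields None instead of raising, so no broad
# try/except is needed.
def _next_compendia_version_sketch(organism) -> int:
    last = (ComputedFile.objects
            .filter(is_compendia=True, compendia_organism=organism)
            .order_by('-compendia_version')
            .first())
    return last.compendia_version + 1 if last else 1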
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append(job_context['formatted_command'])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']

    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Split the result into smashable subfiles
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        filename = frame.columns.values[0].replace('&', '').replace(
            "*", '').replace(";", '') + '.tsv'
        frame_path = job_context["work_dir"] + filename
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # This needs to be the same as the ones in the job context!
        try:
            sample = job_context['samples'].get(title=frame.columns.values[0])
        except Sample.DoesNotExist:
            logger.error(
                "Could not find sample for column while splitting Illumina file.",
                title=frame.columns.values[0],
                processor_job=job_context["job_id"],
                file_path=big_tsv,
            )
            continue

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split('/')[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result

    return job_context
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    if "input_file_path_2" in job_context:
        second_read_str = " -2 {}".format(job_context["input_file_path_2"])

        # Rob recommends 16 threads/process, which fits snugly on an x1 at
        # 8GB RAM per Salmon container:
        # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
        command_str = ("salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                       " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                       " --seqBias --gcBias --dumpEq --writeUnmappedNames")
        formatted_command = command_str.format(index=job_context["index_directory"],
                                               input_one=job_context["input_file_path"],
                                               second_read_str=second_read_str,
                                               output_directory=job_context["output_directory"])
    else:
        # Related: https://github.com/COMBINE-lab/salmon/issues/83
        command_str = ("salmon --no-version-check quant -l A -i {index}"
                       " -r {input_one} -p 16 -o {output_directory}"
                       " --seqBias --dumpEq --writeUnmappedNames")
        formatted_command = command_str.format(index=job_context["index_directory"],
                                               input_one=job_context["input_file_path"],
                                               output_directory=job_context["output_directory"])

    logger.debug("Running Salmon Quant using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])

    job_context['time_start'] = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    job_context['time_end'] = timezone.now()

    ## To me, this looks broken: error codes are anything non-zero.
    ## However, Salmon (seems) to output with negative status codes
    ## even with successful executions.
    ## Possibly related: https://github.com/COMBINE-lab/salmon/issues/55
    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to salmon failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])
        job_context["job"].failure_reason = ("Shell call to salmon failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context['time_start']
        result.time_end = job_context['time_end']
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context['output_archive'], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception("Exception caught while zipping processed directory %s",
                             job_context["output_directory"],
                             processor_job=job_context["job_id"])
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context['output_archive'])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        quant_file.s3_key = "quant_files/sample_" + str(job_context["sample"].id) + "_quant.sf"
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3 though because we
        # have to sync it before we can save the file so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        'ACL': 'public-read',
                        'StorageClass': 'STANDARD_IA'
                    }
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically any rows in any table can be locked here; we're
        # locking all existing rows in the ComputationalResult table.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result
            job_context['pipeline'].steps.append(result.id)

            SampleResultAssociation.objects.get_or_create(sample=job_context['sample'],
                                                          result=result)

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context['computed_files'].append(salmon_quant_archive)

            tximport_inputs = _get_tximport_inputs(job_context)

        # tximport analysis is done outside of the transaction so that
        # the mutex wouldn't hold the other jobs too long.
        for experiment, quant_files in tximport_inputs.items():
            _tximport(job_context, experiment, quant_files)
            # If `tximport` on any related experiment fails, exit immediately.
            if not job_context["success"]:
                return job_context

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": job_context["index_length"]}
        kv.result = result
        kv.is_public = True
        kv.save()

        with open(os.path.join(job_context['output_directory'],
                               'lib_format_counts.json')) as lfc_file:
            format_count_data = json.load(lfc_file)
            kv = ComputationalResultAnnotation()
            kv.data = format_count_data
            kv.result = result
            kv.is_public = True
            kv.save()

        with open(os.path.join(job_context['output_directory'],
                               'aux_info', 'meta_info.json')) as mi_file:
            meta_info = json.load(mi_file)
            kv = ComputationalResultAnnotation()
            kv.data = meta_info
            kv.result = result
            kv.is_public = True
            kv.save()

        job_context["success"] = True

    return job_context
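
# --- Illustrative sketch (not part of the pipeline) ---------------------
# How the select_for_update() mutex above behaves, as a hypothetical helper.
# Note that Django querysets are lazy: the SELECT ... FOR UPDATE statement is
# only issued when the queryset is evaluated, so this sketch forces
# evaluation with list() to actually take the row locks.
def _serialized_save_sketch(result):
    from django.db import transaction

    with transaction.atomic():
        # Concurrent transactions entering this block wait here until the
        # current lock holder commits or rolls back.
        list(ComputationalResult.objects.select_for_update())
        result.save()  # the critical section runs serially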
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    result_start = log_state("start create result object", job_context["job"].id)
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context["organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"], sep="\t", encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(job_context["dataset"].pk) + "_compendia"

    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"

    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt", job_context["output_dir"] + "/LICENSE.TXT")

    archive_path = shutil.make_archive(final_zip_base, "zip", job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism)
        for organism in job_context["all_organisms"]
    ]
    compendium_version = (CompendiumResult.objects.filter(
        primary_organism=primary_organism, quant_sf_only=False).count() + 1)

    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # Create relations to all organisms contained in the compendia.
    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        compendium_result_organism_association = CompendiumResultOrganismAssociation()
        compendium_result_organism_association.compendium_result = compendium_result
        compendium_result_organism_association.organism = compendium_organism
        compendium_result_organism_associations.append(compendium_result_organism_association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!",
                archive_path=archive_path,
                organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True
    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append(job_context["formatted_command"])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Split the result into smashable subfiles
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        filename = (
            frame.columns.values[0].replace("&", "").replace("*", "").replace(";", "") + ".tsv"
        )
        frame_path = job_context["work_dir"] + filename
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # This needs to be the same as the ones in the job context!
        sample = _get_sample_for_column(frame.columns.values[0], job_context)
        if sample is None:
            job_context["job"].failure_reason = (
                "Could not find sample for column "
                + frame.columns.values[0]
                + " while splitting Illumina file "
                + big_tsv
            )
            job_context["success"] = False
            job_context["job"].no_retry = True
            return job_context

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split("/")[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

        individual_files.append(computed_file)

    logger.debug("Created %s", result)
    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result

    return job_context
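
# --- Illustrative sketch (not part of the pipeline) ---------------------
# The per-column split used above, in isolation: np.split along axis=1 turns
# an N-column expression matrix into N single-column DataFrames, each keeping
# the shared row index (probe/gene IDs) and its original column header, which
# is what lets each subfile be traced back to a sample.
import numpy as np
import pandas as pd

def _split_columns_sketch(df: pd.DataFrame) -> list:
    return np.split(df, len(df.columns), axis=1)

# e.g. a matrix with columns ["Sample_A", "Sample_B"] yields two one-column
# DataFrames whose headers name the originating samples.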
def _run_salmontools(job_context: Dict) -> Dict:
    """ Run Salmontools to extract unmapped genes. """
    logger.debug("Running SalmonTools ...")

    unmapped_filename = job_context["output_directory"] + "aux_info/unmapped_names.txt"
    command_str = "salmontools extract-unmapped -u {unmapped_file} -o {output} "
    output_prefix = job_context["salmontools_directory"] + "unmapped_by_salmon"
    command_str = command_str.format(unmapped_file=unmapped_filename,
                                     output=output_prefix)
    if "input_file_path_2" in job_context:
        command_str += "-1 {input_1} -2 {input_2}"
        command_str = command_str.format(input_1=job_context["input_file_path"],
                                         input_2=job_context["input_file_path_2"])
    else:
        command_str += "-r {input_1}"
        command_str = command_str.format(input_1=job_context["input_file_path"])

    start_time = timezone.now()
    logger.debug(
        "Running the following SalmonTools command: %s",
        command_str,
        processor_job=job_context["job_id"],
    )
    completed_command = subprocess.run(command_str.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    end_time = timezone.now()

    # As of SalmonTools 0.1.0, completed_command.returncode is always 0,
    # (even if error happens). completed_command.stderr is not totally
    # reliable either, because it will output the following line even
    # when the execution succeeds:
    #   "There were <N> unmapped reads\n"
    # in which "<N>" is the number of lines in input unmapped_names.txt.
    #
    # As a workaround, we are using a regular expression here to test
    # the status of SalmonTools execution. Any text in stderr that is
    # not in the above format is treated as error message.
    status_str = completed_command.stderr.decode().strip()
    success_pattern = r"^There were \d+ unmapped reads$"
    if re.match(success_pattern, status_str):
        # Zip up the output of salmontools
        try:
            with tarfile.open(job_context["salmontools_archive"], "w:gz") as tar:
                tar.add(job_context["salmontools_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["salmontools_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping salmontools directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["salmontools_archive"])
            job_context["success"] = False
            return job_context

        result = ComputationalResult()
        result.commands.append(command_str)
        result.time_start = start_time
        result.time_end = end_time
        result.is_ccdl = True

        try:
            processor_key = "SALMONTOOLS"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        result.save()
        job_context["pipeline"].steps.append(result.id)

        assoc = SampleResultAssociation()
        assoc.sample = job_context["sample"]
        assoc.result = result
        assoc.save()

        computed_file = ComputedFile()
        computed_file.filename = job_context["salmontools_archive"].split("/")[-1]
        computed_file.absolute_file_path = job_context["salmontools_archive"]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.is_public = True
        computed_file.is_smashable = False
        computed_file.is_qc = True
        computed_file.result = result
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        assoc = SampleComputedFileAssociation()
        assoc.sample = job_context["sample"]
        assoc.computed_file = computed_file
        assoc.save()

        job_context["result"] = result
        job_context["success"] = True
    else:  # error in salmontools
        logger.error(
            "Shell call to salmontools failed with error message: %s",
            status_str,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = ("Shell call to salmontools failed because: "
                                             + status_str)
        job_context["success"] = False

    return job_context
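
# --- Illustrative sketch (not part of the pipeline) ---------------------
# The stderr-based success check above, in isolation: SalmonTools 0.1.0 exits
# 0 even on failure, so the only reliable success signal is stderr matching
# the "There were <N> unmapped reads" line exactly.
import re

def _salmontools_succeeded_sketch(stderr_text: str) -> bool:
    return re.match(r"^There were \d+ unmapped reads$", stderr_text.strip()) is not None

# _salmontools_succeeded_sketch("There were 42 unmapped reads")  -> True
# _salmontools_succeeded_sketch("Error: couldn't open index")    -> False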
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    # SRA files also get processed differently as we don't want to use fasterq-dump
    # to extract them to disk.
    if job_context.get("sra_input_file_path", None):
        # Single reads
        if job_context["sra_num_reads"] == 1:
            fifo = "/tmp/barney"
            os.mkfifo(fifo)

            dump_str = "fastq-dump --stdout {input_sra_file} > {fifo} &"
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"], fifo=fifo)
            subprocess.Popen(formatted_dump_command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)

            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-r {fifo} -p 16 -o {output_directory} --seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                fifo=fifo,
                output_directory=job_context["output_directory"],
            )
        # Paired are trickier
        else:
            # Okay, for some reason I can't explain, this only works in the temp
            # directory, otherwise the `tee` part will only output to one or the
            # other of the streams (non-deterministically), but not both. This
            # doesn't appear to happen if the fifos are in tmp.
            alpha = "/tmp/alpha"
            os.mkfifo(alpha)
            beta = "/tmp/beta"
            os.mkfifo(beta)

            dump_str = (
                "fastq-dump --stdout --split-files -I {input_sra_file}"
                r" | tee >(grep '@.*\.1\s' -A3 --no-group-separator > {fifo_alpha})"
                r" >(grep '@.*\.2\s' -A3 --no-group-separator > {fifo_beta}) > /dev/null &"
            )
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"],
                fifo_alpha=alpha,
                fifo_beta=beta)
            subprocess.Popen(
                formatted_dump_command,
                shell=True,
                executable="/bin/bash",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )

            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-1 {fifo_alpha} -2 {fifo_beta} -p 16 -o {output_directory} "
                "--seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                fifo_alpha=alpha,
                fifo_beta=beta,
                output_directory=job_context["output_directory"],
            )
    else:
        if "input_file_path_2" in job_context:
            second_read_str = " -2 {}".format(job_context["input_file_path_2"])

            # Rob recommends 16 threads/process, which fits snugly on an x1 at
            # 8GB RAM per Salmon container:
            # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
            command_str = (
                "salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                " --seqBias --gcBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                second_read_str=second_read_str,
                output_directory=job_context["output_directory"],
            )
        else:
            # Related: https://github.com/COMBINE-lab/salmon/issues/83
            command_str = ("salmon --no-version-check quant -l A -i {index}"
                           " -r {input_one} -p 16 -o {output_directory}"
                           " --seqBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                output_directory=job_context["output_directory"],
            )

    logger.debug(
        "Running Salmon Quant using the following shell command: %s",
        formatted_command,
        processor_job=job_context["job_id"],
    )

    # Salmon probably shouldn't take longer than three hours.
    timeout = 60 * 60 * 3
    job_context["time_start"] = timezone.now()
    try:
        completed_command = subprocess.run(
            formatted_command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        failure_reason = "Salmon timed out because it failed to complete within 3 hours."
        logger.error(
            failure_reason,
            sample_accession_code=job_context["sample"].accession_code,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = failure_reason
        job_context["job"].no_retry = True
        job_context["success"] = False
        return job_context

    job_context["time_end"] = timezone.now()

    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error(
            "Shell call to salmon failed with error message: %s",
            stderr[error_start:],
            processor_job=job_context["job_id"],
        )

        # If salmon has an error exit code then we don't want to retry it.
        job_context["job"].no_retry = True
        job_context["job"].failure_reason = ("Shell call to salmon failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context["time_start"]
        result.time_end = job_context["time_end"]
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context["output_archive"], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["output_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["output_archive"])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        quant_file.s3_key = "quant_files/sample_{0}_{1}_quant.sf".format(
            job_context["sample"].id, timestamp)
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3 though because we
        # have to sync it before we can save the file so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        "ACL": "public-read",
                        "StorageClass": "STANDARD_IA"
                    },
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically any rows in any table can be locked here; we're
        # locking all existing rows in the ComputationalResult table.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result
            job_context["pipeline"].steps.append(result.id)

            SampleResultAssociation.objects.get_or_create(
                sample=job_context["sample"], result=result)

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context["computed_files"].append(salmon_quant_archive)

        kv = ComputationalResultAnnotation()
        kv.data = {
            "index_length": job_context["index_length"],
            "index_length_get": job_context.get("index_length_raw", None),
        }
        kv.result = result
        kv.is_public = True
        kv.save()

        try:
            with open(os.path.join(job_context["output_directory"],
                                   "lib_format_counts.json")) as lfc_file:
                format_count_data = json.load(lfc_file)
                kv = ComputationalResultAnnotation()
                kv.data = format_count_data
                kv.result = result
                kv.is_public = True
                kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception(
                "Error parsing Salmon lib_format_counts JSON output!",
                processor_job=job_context["job_id"],
            )

        try:
            with open(os.path.join(job_context["output_directory"], "aux_info",
                                   "meta_info.json")) as mi_file:
                meta_info = json.load(mi_file)
                kv = ComputationalResultAnnotation()
                kv.data = meta_info
                kv.result = result
                kv.is_public = True
                kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception("Error parsing Salmon meta_info JSON output!",
                             processor_job=job_context["job_id"])

        job_context["success"] = True

    return job_context
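
# --- Illustrative sketch (not part of the pipeline) ---------------------
# The named-pipe streaming used above, reduced to its essentials: a producer
# process writes into a FIFO while a reader consumes it, so the decompressed
# data never lands on disk. The command and path below are examples only.
import os
import subprocess

def _stream_through_fifo_sketch(fifo_path: str = "/tmp/example_fifo") -> bytes:
    if not os.path.exists(fifo_path):
        os.mkfifo(fifo_path)
    # The producer blocks on opening the FIFO until a reader attaches.
    producer = subprocess.Popen("echo streamed-data > " + fifo_path, shell=True)
    with open(fifo_path, "rb") as reader:
        data = reader.read()  # reads until the producer closes its end
    producer.wait()
    os.remove(fifo_path)
    return data  # b"streamed-data\n"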
def _run_tximport_for_experiment(job_context: Dict,
                                 experiment: Experiment,
                                 quant_files: List[ComputedFile]) -> Dict:
    # Download all the quant.sf files for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    quant_file_paths = {}
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            # We create a directory in the work directory for each (quant.sf) file, as
            # tximport assigns column names based on the parent directory name,
            # and we need those names so that we can reassociate with the samples later.
            # e.g., a file with absolute_file_path: /processor_job_1/SRR123_output/quant.sf
            # downloads to: /processor_job_2/SRR123_output/quant.sf
            # So the result file has frame "SRR123_output", which we can associate
            # with sample SRR123.
            sample_output = (job_context["work_dir"]
                             + str(quant_file.absolute_file_path.split("/")[-2]) + "/")
            os.makedirs(sample_output, exist_ok=True)
            quant_work_path = sample_output + quant_file.filename
            quant_file_path = quant_file.get_synced_file_path(path=quant_work_path)
            input_list.write(quant_file_path + "\n")
            quant_file_paths[quant_file_path] = os.stat(quant_file_path).st_size

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename

    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list", tximport_path_list_file,
        "--gene2txmap", job_context["genes_to_transcripts_path"],
        "--rds_file", rds_file_path,
        "--tpm_file", tpm_file_path,
    ]
    result.time_start = timezone.now()

    logger.debug(
        "Running tximport with: %s",
        str(cmd_tokens),
        processor_job=job_context["job_id"],
        experiment=experiment.id,
    )

    try:
        tximport_result = subprocess.run(cmd_tokens,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Encountered error in R code while running tximport.R: {}".format(str(e)),
            success=False,
            experiment=experiment.id,
        )

    if tximport_result.returncode != 0:
        raise utils.ProcessorJobError(
            "Found non-zero exit code from R code while running tximport.R: {}".format(
                tximport_result.stderr.decode().strip()),
            success=False,
            experiment=experiment.id,
            quant_files=quant_files,
            cmd_tokens=cmd_tokens,
            quant_file_paths=quant_file_paths,
        )

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True

    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        raise utils.ProcessorJobError("Failed to set processor: {}".format(e),
                                      success=False,
                                      processor_key=processor_key)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context["computed_files"].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + "_" + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # The frame column header is based off of the path, which includes _output.
        sample_accession_code = frame.columns.values[0].replace("_output", "")
        sample = Sample.objects.get(accession_code=sample_accession_code)

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)
        job_context["smashable_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)
        job_context["samples"].append(sample)

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported; this value sets that
    # for the end_job function.
    job_context["tximported"] = True
    job_context["individual_files"] = individual_files
    return job_context
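
# --- Illustrative sketch (not part of the pipeline) ---------------------
# How tximport column names map back to samples, as a hypothetical helper:
# each quant.sf is synced into a "<accession>_output/" directory, tximport
# names the output column after that parent directory, and stripping the
# "_output" suffix recovers the sample accession code.
def _accession_from_tximport_column_sketch(column_name: str) -> str:
    return column_name.replace("_output", "")

# _accession_from_tximport_column_sketch("SRR123_output") -> "SRR123"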
def _run_multiqc(job_context: Dict) -> Dict:
    """Runs the `MultiQC` package to generate the QC report.

    TODO: These seem to consume a lot of RAM, even for small files.
    We should consider tuning these or breaking them out into their
    own processors. JVM settings may reduce RAM footprint.
    """
    command_str = "multiqc {input_directory} --outdir {qc_directory} --zip-data-dir"
    formatted_command = command_str.format(input_directory=job_context["qc_input_directory"],
                                           qc_directory=job_context["qc_directory"])

    logger.debug("Running MultiQC using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])

    qc_env = os.environ.copy()
    qc_env["LC_ALL"] = "C.UTF-8"
    qc_env["LANG"] = "C.UTF-8"

    time_start = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=qc_env)
    time_end = timezone.now()

    if completed_command.returncode != 0:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to MultiQC failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])
        job_context["job"].failure_reason = ("Shell call to MultiQC failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
        # Bail out: the QC files below won't exist if MultiQC failed.
        return job_context

    result = ComputationalResult()
    result.commands.append(formatted_command)
    result.time_start = time_start
    result.time_end = time_end
    result.is_ccdl = True

    try:
        processor_key = "MULTIQC"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    assoc = SampleResultAssociation()
    assoc.sample = job_context["sample"]
    assoc.result = result
    assoc.save()

    job_context['qc_result'] = result

    data_file = ComputedFile()
    data_file.filename = "multiqc_data.zip"  # This is deterministic
    data_file.absolute_file_path = os.path.join(job_context["qc_directory"], data_file.filename)
    data_file.calculate_sha1()
    data_file.calculate_size()
    data_file.is_public = True
    data_file.result = job_context['qc_result']
    data_file.is_smashable = False
    data_file.is_qc = True
    data_file.save()
    job_context['computed_files'].append(data_file)

    SampleComputedFileAssociation.objects.get_or_create(
        sample=job_context["sample"], computed_file=data_file)

    report_file = ComputedFile()
    report_file.filename = "multiqc_report.html"  # This is deterministic
    report_file.absolute_file_path = os.path.join(job_context["qc_directory"],
                                                  report_file.filename)
    report_file.calculate_sha1()
    report_file.calculate_size()
    report_file.is_public = True
    report_file.is_smashable = False
    report_file.is_qc = True
    report_file.result = job_context['qc_result']
    report_file.save()
    job_context['computed_files'].append(report_file)

    job_context['qc_files'] = [data_file, report_file]

    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    archive_path = job_context["archive_path"]
    compendia_organism = _get_organisms(job_context["samples"]).first()
    compendia_version = _get_next_compendia_version(compendia_organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendia_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True
    return job_context
def _populate_index_object(job_context: Dict) -> Dict:
    """ """
    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])

    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context['pipeline'].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(job_context['organism_name'])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.source_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context['length'].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = (LOCAL_ROOT_DIR + "/TRANSCRIPTOME_INDEX/"
                                            + organism_object.name + "/"
                                            + job_context['length'])
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info("Uploading %s %s to s3",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])
        timestamp = str(timezone.now().timestamp()).split('.')[0]
        s3_key = organism_object.name + '_' + index_object.index_type + "_" + timestamp + '.tar.gz'
        sync_result = computed_file.sync_to_s3(S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key)
        if sync_result:
            computed_file.delete_local_file()
    else:
        logger.warn("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, "
                    "therefore %s %s will not be uploaded.",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context['computed_files'] = []

    job_context['result'] = result
    job_context['computed_file'] = computed_file
    job_context['index'] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                 index_type="TRANSCRIPTOME_SHORT",
                                                 source_version=job_context["assembly_version"])
    long_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                index_type="TRANSCRIPTOME_LONG",
                                                source_version=job_context["assembly_version"])
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context
def _tximport(job_context: Dict, experiment: Experiment,
              quant_files: List[ComputedFile]) -> Dict:
    """Run tximport R script based on input quant files and the path
    of genes_to_transcripts.txt.
    """
    # Download all the quant.sf files for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            input_list.write(quant_file.get_synced_file_path() + "\n")

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename

    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list", tximport_path_list_file,
        "--gene2txmap", job_context["genes_to_transcripts_path"],
        "--rds_file", rds_file_path,
        "--tpm_file", tpm_file_path
    ]
    result.time_start = timezone.now()

    logger.debug("Running tximport with: %s",
                 str(cmd_tokens),
                 processor_job=job_context['job_id'],
                 experiment=experiment.id)

    try:
        tximport_result = subprocess.run(cmd_tokens,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except Exception as e:
        error_template = "Encountered error in R code while running tximport.R: {}"
        error_message = error_template.format(str(e))
        logger.error(error_message,
                     processor_job=job_context["job_id"],
                     experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    if tximport_result.returncode != 0:
        error_template = "Found non-zero exit code from R code while running tximport.R: {}"
        error_message = error_template.format(tximport_result.stderr.decode().strip())
        logger.error(error_message,
                     processor_job=job_context["job_id"],
                     experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True

    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Associate this result with all samples in this experiment.
    # TODO: This may not be completely sensible, because `tximport` is
    # done at experiment level, not at sample level.
    # Could be very problematic if SRA's data model allows many
    # Experiments to one Run.
    # https://github.com/AlexsLemonade/refinebio/issues/297
    for sample in experiment.samples.all():
        s_r = SampleResultAssociation(sample=sample, result=result)
        s_r.save()

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context['computed_files'].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + '_' + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # The frame column header is based off of the path, which includes _output.
        sample = Sample.objects.get(
            accession_code=frame.columns.values[0].replace("_output", ""))

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)
        job_context['smashable_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)
        job_context['samples'].append(sample)

    # Clean up quant.sf files that were created just for this.
    for quant_file in quant_files:
        quant_file.delete_s3_file()
        # It's only okay to delete the local file because the full
        # output directory has already been zipped up.
        quant_file.delete_local_file()
        quant_file.delete()

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported; this value sets that
    # for the end_job function.
    job_context['tximported'] = True
    job_context['individual_files'] = individual_files
    return job_context