Example #1
def _smash_all(job_context: Dict) -> Dict:
    """Perform smashing on all species/experiments in the dataset.
    """
    start_smash = log_state("start smash", job_context["job"].id)

    job_context["unsmashable_files"] = []
    job_context["num_samples"] = 0

    # Smash all of the sample sets
    logger.debug(
        "About to smash!",
        dataset_count=len(job_context["dataset"].data),
        job_id=job_context["job"].id,
    )

    try:
        # Once again, `key` is either a species name or an experiment accession
        for key, input_files in job_context.pop("input_files").items():
            job_context = _smash_key(job_context, key, input_files)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Could not smash dataset: " + str(e),
            success=False,
            dataset_id=job_context["dataset"].id,
            num_input_files=job_context["num_input_files"],
        )

    smashing_utils.write_non_data_files(job_context)

    # Finally, compress all files into a zip
    final_zip_base = "/home/user/data_store/smashed/" + str(
        job_context["dataset"].pk)
    try:
        shutil.make_archive(final_zip_base, "zip", job_context["output_dir"])
    except OSError:
        raise utils.ProcessorJobError("Smash Error while generating zip file",
                                      success=False)

    job_context["output_file"] = final_zip_base + ".zip"

    job_context["dataset"].success = True
    job_context["dataset"].save()

    logger.debug("Created smash output!",
                 archive_location=job_context["output_file"])

    log_state("end smash", job_context["job"].id, start_smash)
    return job_context
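
A minimal, standalone sketch of the final archiving step above, using only the standard library: shutil.make_archive compresses the output directory into "<base>.zip" and returns the archive path with the extension appended. The paths below are placeholders, not the job's real directories.

import shutil
import tempfile
from pathlib import Path

# Stand-in for job_context["output_dir"] with one example file in it.
output_dir = Path(tempfile.mkdtemp())
(output_dir / "example.tsv").write_text("gene\tvalue\nA1BG\t0.5\n")

# make_archive takes the base name WITHOUT ".zip" and appends the extension itself.
final_zip_base = str(output_dir) + "_smashed"
archive_path = shutil.make_archive(final_zip_base, "zip", str(output_dir))
print(archive_path)  # e.g. /tmp/tmpabc123_smashed.zip
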
Example #2
def _upload(job_context: Dict) -> Dict:
    """Uploads the result file to S3 and notifies user."""
    if not job_context.get("upload", True) or not settings.RUNNING_IN_CLOUD:
        return job_context

    s3_client = boto3.client("s3")
    output_filename = job_context["output_file"].split("/")[-1]

    try:
        # Note that file expiry is handled by the S3 object lifecycle,
        # managed by terraform.
        s3_client.upload_file(
            job_context["output_file"],
            RESULTS_BUCKET,
            output_filename,
        )
    except Exception:
        raise utils.ProcessorJobError("Failed to upload smash result file.",
                                      success=False,
                                      file=job_context["output_file"])

    result_url = "https://s3.amazonaws.com/" + RESULTS_BUCKET + "/" + output_filename

    job_context["result_url"] = result_url

    logger.debug("Result uploaded!", result_url=job_context["result_url"])

    return job_context
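
A hedged sketch of the same upload call with a made-up bucket name and local path; boto3's upload_file raises on failure, which is why _upload wraps it in a try/except.

import boto3

RESULTS_BUCKET = "my-results-bucket"      # assumption: replace with a bucket you own
local_path = "/tmp/smashed_result.zip"    # assumption: an existing local file

s3_client = boto3.client("s3")
output_filename = local_path.split("/")[-1]
try:
    s3_client.upload_file(local_path, RESULTS_BUCKET, output_filename)
except Exception as e:  # mirrors the broad catch in _upload above
    raise RuntimeError("Failed to upload " + local_path + ": " + str(e))

result_url = "https://s3.amazonaws.com/" + RESULTS_BUCKET + "/" + output_filename
print(result_url)
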
Example #3
def write_non_data_files(job_context: Dict) -> Dict:
    """Writes the files that are not the actual data of the dataset.

    This includes the LICENSE.txt and README.md files and the metadata.

    Adds the key `metadata` to job_context and populates it with all
    the metadata that needs to be written.
    """
    job_context["metadata"] = compile_metadata(job_context)

    shutil.copy("README_DATASET.md", job_context["output_dir"] + "README.md")
    shutil.copy("LICENSE_DATASET.txt",
                job_context["output_dir"] + "LICENSE.TXT")

    # Write samples metadata to TSV
    try:
        write_tsv_json(job_context)
        # Metadata to JSON
        job_context["metadata"]["created_at"] = timezone.now().strftime(
            "%Y-%m-%dT%H:%M:%S")
        aggregated_metadata_path = os.path.join(job_context["output_dir"],
                                                "aggregated_metadata.json")
        with open(aggregated_metadata_path, "w",
                  encoding="utf-8") as metadata_file:
            json.dump(job_context["metadata"],
                      metadata_file,
                      indent=4,
                      sort_keys=True)

        if job_context["filtered_samples"]:
            # generate filtered samples file only if some samples were skipped
            filtered_samples_path = os.path.join(
                job_context["output_dir"], "filtered_samples_metadata.json")
            with open(filtered_samples_path, "w",
                      encoding="utf-8") as metadata_file:
                json.dump(job_context["filtered_samples"],
                          metadata_file,
                          indent=4,
                          sort_keys=True)

            columns = get_tsv_columns(job_context["filtered_samples"])
            filtered_samples_tsv_path = os.path.join(
                job_context["output_dir"], "filtered_samples_metadata.tsv")
            with open(filtered_samples_tsv_path, "w",
                      encoding="utf-8") as tsv_file:
                dw = csv.DictWriter(tsv_file,
                                    columns,
                                    delimiter="\t",
                                    extrasaction="ignore")
                dw.writeheader()
                for sample_metadata in job_context["filtered_samples"].values(
                ):
                    dw.writerow(
                        get_tsv_row_data(sample_metadata,
                                         job_context["dataset"].data))
    except Exception:
        raise utils.ProcessorJobError("Failed to write metadata TSV!",
                                      success=False)

    return job_context
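
A standalone sketch of the metadata-writing pattern above, using made-up sample data: json.dump with indent/sort_keys for the JSON file, and csv.DictWriter with delimiter="\t" and extrasaction="ignore" so that metadata keys outside the chosen column list are silently dropped.

import csv
import json

filtered_samples = {
    "GSM000001": {"refinebio_title": "sample 1", "reason": "no processed file", "internal_id": 42},
    "GSM000002": {"refinebio_title": "sample 2", "reason": "no processed file", "internal_id": 43},
}

with open("filtered_samples_metadata.json", "w", encoding="utf-8") as metadata_file:
    json.dump(filtered_samples, metadata_file, indent=4, sort_keys=True)

columns = ["refinebio_title", "reason"]  # "internal_id" is ignored by the writer below
with open("filtered_samples_metadata.tsv", "w", encoding="utf-8") as tsv_file:
    dw = csv.DictWriter(tsv_file, columns, delimiter="\t", extrasaction="ignore")
    dw.writeheader()
    for sample_metadata in filtered_samples.values():
        dw.writerow(sample_metadata)
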
Example #4
def prepare_files(job_context: Dict) -> Dict:
    """
    Fetches and prepares the files to smash.
    """
    start_prepare_files = log_state("start prepare files",
                                    job_context["job"].id)
    found_files = False
    job_context["filtered_samples"] = {}
    job_context["input_files"] = {}

    # `key` can either be the species name or experiment accession.
    for key, samples in job_context["samples"].items():
        smashable_files = []
        seen_files = set()
        for sample in samples:
            if job_context["dataset"].quant_sf_only:
                # For quant.sf only jobs, just check that they have a quant.sf file
                smashable_file = sample.get_most_recent_quant_sf_file()
            else:
                smashable_file = sample.get_most_recent_smashable_result_file()

            if smashable_file is not None and smashable_file not in seen_files:
                smashable_files = smashable_files + [(smashable_file, sample)]
                seen_files.add(smashable_file)
                found_files = True
            else:
                sample_metadata = sample.to_metadata_dict()
                job_context["filtered_samples"][sample.accession_code] = {
                    **sample_metadata,
                    "reason":
                    "This sample did not have a processed file associated with it in our database.",
                    "experiment_accession_code":
                    get_experiment_accession(sample.accession_code,
                                             job_context["dataset"].data),
                }

        job_context["input_files"][key] = smashable_files

    job_context["num_input_files"] = len(job_context["input_files"])
    job_context["group_by_keys"] = list(job_context["input_files"].keys())

    if not found_files:
        raise utils.ProcessorJobError(
            "Couldn't get any files to smash for Smash job!!",
            success=False,
            dataset_id=job_context["dataset"].id,
            num_samples=len(job_context["samples"]),
        )

    dataset_id = str(job_context["dataset"].pk)
    job_context[
        "work_dir"] = "/home/user/data_store/smashed/" + dataset_id + "/"
    # Ensure we have a fresh smash directory
    shutil.rmtree(job_context["work_dir"], ignore_errors=True)
    os.makedirs(job_context["work_dir"])

    job_context["output_dir"] = job_context["work_dir"] + "output/"
    os.makedirs(job_context["output_dir"])
    log_state("end prepare files", job_context["job"].id, start_prepare_files)
    return job_context
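
A small sketch of the "fresh smash directory" pattern at the end of prepare_files; the dataset id below is a placeholder.

import os
import shutil

dataset_id = "1234"  # assumption: example dataset pk
work_dir = "/tmp/data_store/smashed/" + dataset_id + "/"

# Ensure we start from a clean directory tree.
shutil.rmtree(work_dir, ignore_errors=True)  # no error if it doesn't exist yet
os.makedirs(work_dir)

output_dir = work_dir + "output/"
os.makedirs(output_dir)
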
Example #5
def quantile_normalize(job_context: Dict, ks_stat=0.001) -> Dict:
    """
    Apply quantile normalization.
    """
    # Prepare our QN target file
    organism = job_context["organism"]

    if not organism.qn_target:
        raise utils.ProcessorJobError(
            "Could not find QN target for Organism: " + str(organism),
            success=False,
            organism=organism,
            dataset_id=job_context["dataset"].id,
        )

    qn_target_path = organism.qn_target.computedfile_set.latest().sync_from_s3()
    qn_target_frame = pd.read_csv(
        qn_target_path, sep="\t", header=None, index_col=None, error_bad_lines=False
    )

    # Prepare our RPy2 bridge
    pandas2ri.activate()

    # Remove the un-quantile-normalized matrix from job_context
    # because we no longer need it.
    merged_no_qn = job_context.pop("merged_no_qn")

    # Perform the Actual QN
    new_merged = _quantile_normalize_matrix(qn_target_frame[0], merged_no_qn)

    # And add the quantile normalized matrix to job_context.
    job_context["merged_qn"] = new_merged

    ks_res = _test_qn(new_merged)
    if ks_res:
        for (statistic, pvalue) in ks_res:
            job_context["ks_statistic"] = statistic
            job_context["ks_pvalue"] = pvalue

            # We're unsure of how stringent to be about
            # the pvalue just yet, so we're extra lax
            # rather than failing tons of tests. This may need tuning.
            if statistic > ks_stat or pvalue < 0.8:
                job_context["ks_warning"] = (
                    "Failed Kolmogorov Smirnov test! Stat: "
                    + str(statistic)
                    + ", PVal: "
                    + str(pvalue)
                )
    else:
        logger.warning(
            "Not enough columns to perform KS test - either bad smash or single sample smash.",
            dataset_id=job_context["dataset"].id,
        )

    return job_context
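
The helpers _quantile_normalize_matrix and _test_qn are defined elsewhere in this codebase; below is a hedged, self-contained sketch of the underlying idea on random data: map each column's ranks onto the sorted QN target, then check the fit with a Kolmogorov-Smirnov test.

import numpy as np
import pandas as pd
from scipy import stats

rng = np.random.default_rng(0)
target = np.sort(rng.normal(size=1000))  # stands in for the organism's QN target column
matrix = pd.DataFrame(rng.normal(size=(1000, 3)), columns=["s1", "s2", "s3"])

normalized = matrix.copy()
for col in matrix.columns:
    # Rank each value, then replace it with the target value at that rank.
    ranks = matrix[col].rank(method="first").astype(int) - 1
    normalized[col] = target[ranks.to_numpy()]

# Each normalized column should now be indistinguishable from the target.
ks_res = stats.ks_2samp(normalized["s1"], target)
print(ks_res.statistic, ks_res.pvalue)
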
Example #6
def _notify(job_context: Dict) -> Dict:
    """Use AWS SES to notify a user of a smash result.."""

    if not job_context.get("upload", True) or not settings.RUNNING_IN_CLOUD:
        return job_context

    # Send a notification to slack when a dataset fails to be processed
    if job_context["job"].success is False:
        try:
            _notify_slack_failed_dataset(job_context)
        except Exception as e:
            logger.warn(e)  # It doesn't really matter if this didn't work

    # Don't send an email if we don't have an address or the user doesn't want one.
    if job_context["dataset"].email_address and job_context[
            "dataset"].notify_me:
        # Try to send the email.
        try:
            _notify_send_email(job_context)
        # Display an error if something goes wrong.
        except ClientError as e:
            raise utils.ProcessorJobError(
                "ClientError while notifying",
                success=False,
                exc_info=1,
                client_error_message=e.response["Error"]["Message"],
            )
        except Exception:
            raise utils.ProcessorJobError(
                "General failure when trying to send email.",
                success=False,
                exc_info=1,
                result_url=job_context["result_url"],
            )

    # We don't want to retry this dataset after we send a notification to users
    # https://github.com/alexslemonade/refinebio/issues/1944
    job_context["job"].no_retry = True
    job_context["job"].save()

    return job_context
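
_notify_send_email is defined elsewhere; this is a hedged sketch of what an SES notification like it could look like with boto3, using placeholder addresses and region rather than the pipeline's real values.

import boto3
from botocore.exceptions import ClientError

ses_client = boto3.client("ses", region_name="us-east-1")  # assumption: SES region
try:
    ses_client.send_email(
        Source="noreply@example.com",                       # placeholder; must be SES-verified
        Destination={"ToAddresses": ["user@example.com"]},  # placeholder recipient
        Message={
            "Subject": {"Data": "Your dataset is ready"},
            "Body": {"Text": {"Data": "Download it here: https://example.com/result.zip"}},
        },
    )
except ClientError as e:
    # Same error surface as the ClientError branch above.
    print(e.response["Error"]["Message"])
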
Example #7
def set_tximport_inputs(job_context: Dict) -> Dict:
    """Adds to the job_context a mapping from experiments to a list of their quant files.

    Checks all the experiments which contain a sample from the current
    experiment. If any of them are fully processed (at least with
    salmon-quant) then the return dict will include the experiment
    mapping to a list of paths to the quant.sf file for each sample in
    that experiment.
    """
    experiments = job_context["sample"].experiments.all()

    quantified_experiments = {}
    for experiment in experiments:
        # We only want to consider samples that we actually can run salmon on.
        eligible_samples = experiment.samples.filter(source_database="SRA",
                                                     technology="RNA-SEQ")
        if not eligible_samples.exists():
            continue

        is_tximport_job = "is_tximport_only" in job_context and job_context[
            "is_tximport_only"]
        salmon_quant_files = get_tximport_inputs_if_eligible(
            experiment, is_tximport_job)

        if is_tximport_job and salmon_quant_files:
            # If the job is only running tximport, then index_length
            # hasn't been set on the job context because we don't have
            # a raw file to run it on. Therefore pull it from one of
            # the result annotations.

            # Can't just do salmon_quant_results[0] because it's a set.
            index_length = salmon_quant_files[0].result.get_index_length()
            if index_length:
                job_context["index_length"] = index_length
            elif "index_length" not in job_context:
                raise utils.ProcessorJobError(
                    ("Found quant result without an annotation specifying its index length."
                     " Why did this happen?!?"),
                    success=False,
                    no_retry=True,
                )

        if salmon_quant_files:
            quantified_experiments[experiment] = salmon_quant_files

    job_context["tximport_inputs"] = quantified_experiments

    return job_context
Example #8
def _prepare_frames(job_context: Dict) -> Dict:
    """
    Takes the inputs and places them into matrices
     - Combine all microarray samples with a full join to form a
       DataFrame `microarray_expression_matrix`.
     - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join
       to form a DataFrame `rnaseq_expression_matrix`.
     - Adds unsmashable files to `job_context["unsmashable_files"]`
    """
    start_prepare_frames = log_state("start _prepare_frames",
                                     job_context["job"].id)

    job_context["unsmashable_files"] = []
    job_context["num_samples"] = 0

    # Smash all of the sample sets
    logger.debug(
        "About to smash!",
        dataset_count=len(job_context["dataset"].data),
        job_id=job_context["job"].id,
    )

    try:
        # Once again, `key` is either a species name or an experiment accession
        for key, input_files in job_context.pop("input_files").items():
            job_context = smashing_utils.process_frames_for_key(
                key, input_files, job_context)
            # if len(job_context['all_frames']) < 1:
            # TODO: Enable this check?
    except Exception:
        raise utils.ProcessorJobError(
            "Could not prepare frames for compendia.",
            success=False,
            dataset_id=job_context["dataset"].id,
            processor_job_id=job_context["job_id"],
            num_input_files=job_context["num_input_files"],
        )

    job_context["dataset"].success = True
    job_context["dataset"].save()

    log_state("end _prepare_frames", job_context["job"].id,
              start_prepare_frames)
    return job_context
Example #9
def tximport(job_context: Dict) -> Dict:
    """Run tximport R script based on input quant files and the path
    of genes_to_transcripts.txt.
    """
    tximport_inputs = job_context["tximport_inputs"]

    quantified_experiments = 0
    for experiment, quant_files in tximport_inputs.items():
        job_context = _run_tximport_for_experiment(job_context, experiment,
                                                   quant_files)
        quantified_experiments += 1

    if (quantified_experiments == 0 and "is_tximport_only" in job_context
            and job_context["is_tximport_only"]):
        raise utils.ProcessorJobError(
            "Tximport job ran on no experiments... Why?!?!?",
            success=False,
            no_retry=True)

    return job_context
Example #10
def _prepare_frames(job_context: Dict) -> Dict:
    start_prepare_frames = log_state("start _prepare_frames",
                                     job_context["job"].id)

    job_context["unsmashable_files"] = []
    job_context["num_samples"] = 0

    # Smash all of the sample sets
    logger.debug(
        "About to smash!",
        dataset_count=len(job_context["dataset"].data),
        job_id=job_context["job"].id,
    )

    try:
        # Once again, `key` is either a species name or an experiment accession
        for key, input_files in job_context.pop("input_files").items():
            job_context = smashing_utils.process_frames_for_key(
                key, input_files, job_context)
            # if len(job_context['all_frames']) < 1:
            # TODO: Enable this check?
    except Exception:
        raise utils.ProcessorJobError(
            "Could not prepare frames for compendia.",
            success=False,
            dataset_id=job_context["dataset"].id,
            processor_job_id=job_context["job_id"],
            num_input_files=job_context["num_input_files"],
        )

    job_context["dataset"].success = True
    job_context["dataset"].save()

    log_state("end _prepare_frames", job_context["job"].id,
              start_prepare_frames)
    return job_context
Example #11
def _create_result_objects(job_context: Dict) -> Dict:
    """
    Store and host the result as a ComputationalResult object.
    """
    archive_path = job_context["archive_path"]
    compendia_organism = _get_organisms(job_context["samples"]).first()
    compendia_version = _get_next_compendia_version(compendia_organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendia_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    return job_context
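
A tiny sketch of the timestamped S3 key naming used for the quantpendia archive above, with placeholder organism and version values.

import time

organism_name = "HOMO_SAPIENS"   # placeholder for compendia_organism.name
compendia_version = 3            # placeholder version number

timestamp = str(int(time.time()))
s3_key = organism_name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
print(s3_key)  # e.g. HOMO_SAPIENS_3_1700000000.zip
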
Example #12
def _create_result_objects(job_context: Dict) -> Dict:
    """
    Store and host the result as a ComputationalResult object.
    """
    result_start = log_state("start create result object",
                             job_context["job"].id)
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context[
        "organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"],
                                    sep="\t",
                                    encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result

    annotation.data = {
        "organism_id":
        job_context["samples"][organism_key][0].organism_id,
        "organism_name":
        job_context["organism_name"],
        "is_qn":
        False,
        "is_compendia":
        True,
        "samples": [
            sample.accession_code
            for sample in job_context["samples"][organism_key]
        ],
        "num_samples":
        len(job_context["samples"][organism_key]),
        "experiment_accessions":
        [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed":
        job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(
        job_context["dataset"].pk) + "_compendia"
    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"

    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt",
                job_context["output_dir"] + "/LICENSE.TXT")
    archive_path = shutil.make_archive(final_zip_base, "zip",
                                       job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(
        job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism)
        for organism in job_context["all_organisms"]
    ]
    compendium_version = (CompendiumResult.objects.filter(
        primary_organism=primary_organism, quant_sf_only=False).count() + 1)
    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # create relations to all organisms contained in the compendia

    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        compendium_result_organism_association = CompendiumResultOrganismAssociation(
        )
        compendium_result_organism_association.compendium_result = compendium_result
        compendium_result_organism_association.organism = compendium_organism
        compendium_result_organism_associations.append(
            compendium_result_organism_association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!",
                archive_path=archive_path,
                organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(
        compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME,
                                                      key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
Example #13
def _run_tximport_for_experiment(job_context: Dict, experiment: Experiment,
                                 quant_files: List[ComputedFile]) -> Dict:

    # Download all the quant.sf files for this experiment. Write all
    # their paths to a file so we can pass that file's path to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    quant_file_paths = {}
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            # We create a directory in the work directory for each quant.sf file, as
            # tximport assigns column names based on the parent directory name,
            # and we need those names so that we can reassociate with the samples later.
            # e.g., a file with absolute_file_path: /processor_job_1/SRR123_output/quant.sf
            # downloads to: /processor_job_2/SRR123_output/quant.sf
            # So the resulting column is named "SRR123_output", which we can associate with sample SRR123.
            sample_output = (
                job_context["work_dir"] +
                str(quant_file.absolute_file_path.split("/")[-2]) + "/")
            os.makedirs(sample_output, exist_ok=True)
            quant_work_path = sample_output + quant_file.filename
            quant_file_path = quant_file.get_synced_file_path(
                path=quant_work_path)
            input_list.write(quant_file_path + "\n")
            quant_file_paths[quant_file_path] = os.stat(
                quant_file_path).st_size

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename
    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript",
        "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list",
        tximport_path_list_file,
        "--gene2txmap",
        job_context["genes_to_transcripts_path"],
        "--rds_file",
        rds_file_path,
        "--tpm_file",
        tpm_file_path,
    ]
    result.time_start = timezone.now()

    logger.debug(
        "Running tximport with: %s",
        str(cmd_tokens),
        processor_job=job_context["job_id"],
        experiment=experiment.id,
    )

    try:
        tximport_result = subprocess.run(cmd_tokens,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Encountered error in R code while running tximport.R: {}".format(
                str(e)),
            success=False,
            experiment=experiment.id,
        )

    if tximport_result.returncode != 0:
        raise utils.ProcessorJobError(
            "Found non-zero exit code from R code while running tximport.R: {}"
            .format(tximport_result.stderr.decode().strip()),
            success=False,
            experiment=experiment.id,
            quant_files=quant_files,
            cmd_tokens=cmd_tokens,
            quant_file_paths=quant_file_paths,
        )

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True
    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        raise utils.ProcessorJobError("Failed to set processor: {}".format(e),
                                      success=False,
                                      processor_key=processor_key)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context["computed_files"].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + "_" + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # The frame column header is based off of the path, which includes _output.
        sample_accession_code = frame.columns.values[0].replace("_output", "")
        sample = Sample.objects.get(accession_code=sample_accession_code)

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)
        job_context["smashable_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)
        job_context["samples"].append(sample)

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported; this value sets that
    # for the end_job function.
    job_context["tximported"] = True
    job_context["individual_files"] = individual_files
    return job_context
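
A standalone sketch of the per-sample split at the end of _run_tximport_for_experiment, done here with plain column selection instead of np.split; the tiny matrix is made up, while the real one is read from gene_lengthScaledTPM.tsv.

import os
import pandas as pd

data = pd.DataFrame(
    {"SRR0001_output": [1.0, 2.0], "SRR0002_output": [3.0, 4.0]},
    index=["GENE1", "GENE2"],
)

work_dir = "/tmp/tximport_example/"
os.makedirs(work_dir, exist_ok=True)

for column in data.columns:
    frame = data[[column]]  # keep a one-column DataFrame so the header is preserved
    # Column names come from the quant.sf parent directory, e.g. "SRR0001_output".
    sample_accession_code = column.replace("_output", "")
    frame_path = os.path.join(work_dir, sample_accession_code + "_gene_lengthScaledTPM.tsv")
    frame.to_csv(frame_path, sep="\t", encoding="utf-8")
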
Example #14
def _populate_index_object(job_context: Dict) -> Dict:
    """ """
    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])
    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context["pipeline"].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(
        job_context["organism_name"])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.database_name = job_context["database_name"]
    index_object.release_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context["length"].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = (LOCAL_ROOT_DIR +
                                            "/TRANSCRIPTOME_INDEX/" +
                                            organism_object.name + "/" +
                                            job_context["length"])
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info(
            "Uploading %s %s to s3",
            job_context["organism_name"],
            job_context["length"],
            processor_job=job_context["job_id"],
        )
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        s3_key = organism_object.name + "_" + index_object.index_type + "_" + timestamp + ".tar.gz"
        sync_result = computed_file.sync_to_s3(
            S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key, public=True)
        if sync_result:
            computed_file.delete_local_file()
        else:
            computed_file.delete()

            raise utils.ProcessorJobError(
                "Failed to upload transcriptome index to S3",
                success=False,
                computed_file_id=computed_file.id,
            )
    else:
        logger.warn(
            "S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.",
            job_context["organism_name"],
            job_context["length"],
            processor_job=job_context["job_id"],
        )

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context["computed_files"] = []

    job_context["result"] = result
    job_context["computed_file"] = computed_file
    job_context["index"] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(
        organism=organism_object,
        index_type="TRANSCRIPTOME_SHORT",
        release_version=job_context["assembly_version"],
    )
    long_indices = OrganismIndex.objects.filter(
        organism=organism_object,
        index_type="TRANSCRIPTOME_LONG",
        release_version=job_context["assembly_version"],
    )
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context