Code Example #1
File: test_salmon.py Project: Quiltomics/refinebio
def prepare_organism_indices():
    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # This is a lie, but this image doesn't have the dependencies for TX_IMPORT
    computational_result_short = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = c_elegans
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT/celgans_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.save()

    # This is a lie, but this image doesn't have the dependencies for TX_IMPORT
    computational_result_long = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_long.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_LONG"
    organism_index.organism = c_elegans
    organism_index.result = computational_result_long
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/LONG"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/LONG/celgans_long.tar.gz"
    comp_file.result = computational_result_long
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.save()
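
Usage note: a test built on this helper can resolve the prepared index through the ORM, much as the Salmon processor would. A minimal sketch, assuming only the refinebio Django models shown above:

def test_short_index_is_prepared():
    prepare_organism_indices()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")
    organism_index = OrganismIndex.objects.get(
        organism=c_elegans,
        index_type="TRANSCRIPTOME_SHORT",
    )
    assert organism_index.absolute_directory_path.endswith("/SHORT")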
Code Example #2
    def handle(self, *args, **options):
        """ For every (or a supplied) organism, fetch all of the experiments and compile large but normally formated Dataset.

        Send all of them to the Smasher. Smash them. Retrieve manually as desired.
        """

        dataset_ids = []

        if options["organism"] is None:
            all_organisms = Organism.objects.all()
        else:
            all_organisms = [Organism.get_object_for_name(options["organism"].upper())]

        for organism in all_organisms:
            data = {}
            experiments = Experiment.objects.filter(id__in=(ExperimentOrganismAssociation.objects.filter(organism=organism)).values('experiment'))
            for experiment in experiments:
                data[experiment.accession_code] = list(experiment.samples.values_list('accession_code', flat=True))

            job = ProcessorJob()
            job.pipeline_applied = "COMPENDIA"
            job.save()

            dset = Dataset()
            dset.data = data
            dset.scale_by = 'NONE'
            dset.aggregate_by = 'SPECIES'
            dset.quantile_normalize = False
            dset.save()

            pjda = ProcessorJobDatasetAssociation()
            pjda.processor_job = job
            pjda.dataset = dset
            pjda.save()

            final_context = create_compendia.create_compendia(job.id)

        sys.exit(0)
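
Usage note: as a Django management command, handle() is normally driven through call_command. A hedged sketch; the command name "create_compendia" is my assumption, and because handle() ends with sys.exit(0) the call will terminate the calling process unless that exit is patched out:

from django.core.management import call_command

# Build a species-wide compendium for one organism.
# "create_compendia" is a guessed command name; substitute the real one.
call_command("create_compendia", organism="HOMO_SAPIENS")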
Code Example #3
File: test_salmon.py Project: Quiltomics/refinebio
    def test_double_reads(self):
        """Test outputs when the sample has both left and right reads."""
        job_context = {
            'job_id': 123,
            'job': ProcessorJob(),
            'pipeline': Pipeline(name="Salmon"),
            'input_file_path': self.test_dir + 'double_input/reads_1.fastq',
            'input_file_path_2': self.test_dir + 'double_input/reads_2.fastq',
            'salmontools_directory': self.test_dir + 'double_salmontools/',
            'salmontools_archive': self.test_dir + 'salmontools-result.tar.gz',
            'output_directory': self.test_dir + 'double_output/',
            'computed_files': []
        }
        os.makedirs(job_context["salmontools_directory"], exist_ok=True)

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
        sample = Sample()
        sample.organism = homo_sapiens
        sample.save()
        job_context["sample"] = sample

        salmon._run_salmontools(job_context)

        # Confirm job status
        self.assertTrue(job_context["success"])

        # Unpack result for checking
        os.system('gunzip ' + job_context['salmontools_directory'] + "*.gz")

        # Check two output files
        output_file1 = job_context['salmontools_directory'] + 'unmapped_by_salmon_1.fa'
        expected_output_file1 = self.test_dir + 'expected_double_output/unmapped_by_salmon_1.fa'
        self.assertTrue(identical_checksum(output_file1, expected_output_file1))

        output_file2 = job_context['salmontools_directory'] + 'unmapped_by_salmon_2.fa'
        expected_output_file2 = self.test_dir + 'expected_double_output/unmapped_by_salmon_2.fa'
        self.assertTrue(identical_checksum(output_file2, expected_output_file2))
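
identical_checksum is defined elsewhere in this test module. A plausible reconstruction, assuming it simply compares SHA-1 digests of the two files (a sketch, not the project's verified helper):

import hashlib

def identical_checksum(file_path_1, file_path_2):
    """Return True when both files hash to the same SHA-1 digest."""
    def sha1_of(path):
        digest = hashlib.sha1()
        with open(path, "rb") as handle:
            for chunk in iter(lambda: handle.read(8192), b""):
                digest.update(chunk)
        return digest.hexdigest()

    return sha1_of(file_path_1) == sha1_of(file_path_2)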
Code Example #4
def prepare_job(length):
    pj = ProcessorJob()
    pj.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper()
    pj.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS",
                                                taxonomy_id=1001)

    samp = Sample()
    samp.organism = homo_sapiens
    samp.accession_code = "derp" + length
    samp.save()

    [og_file, og_file2] = prepare_original_files(length)

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    og_file_samp_assoc2 = OriginalFileSampleAssociation()
    og_file_samp_assoc2.original_file = og_file2
    og_file_samp_assoc2.sample = samp
    og_file_samp_assoc2.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    assoc2 = ProcessorJobOriginalFileAssociation()
    assoc2.original_file = og_file2
    assoc2.processor_job = pj
    assoc2.save()

    return pj
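
Usage note: a test consumes this helper by handing the returned job to the transcriptome-index processor. A minimal sketch; the build_transcriptome_index entry point and its signature are assumptions based on how the other processors in this project accept a job id:

pj = prepare_job("short")
# Hypothetical entry point; mirrors smasher.smash(pj.pk, ...) elsewhere.
final_context = transcriptome_index.build_transcriptome_index(pj.pk, length="short")
assert final_context["success"]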
Code Example #5
def _perform_imputation(job_context: Dict) -> Dict:
    """

    Take the inputs and perform the primary imputation.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283: 
     - Combine all microarray samples with a full join to form a microarray_expression_matrix (this may end up being a DataFrame)
     - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join to form a rnaseq_expression_matrix
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of the rnaseq_expression_matrix (rnaseq_row_sums)
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on the transposed_matrix; imputed_matrix
     - Untranspose imputed_matrix (genes are now rows, samples are now columns)
     - Quantile normalize imputed_matrix where genes are rows and samples are columns

    """
    job_context['time_start'] = timezone.now()

    # Combine all microarray samples with a full join to form a microarray_expression_matrix (this may end up being a DataFrame)
    microarray_expression_matrix = job_context['microarray_inputs']

    # Combine all RNA-seq samples (lengthScaledTPM) with a full outer join to form a rnaseq_expression_matrix
    rnaseq_expression_matrix = job_context['rnaseq_inputs']

    # Calculate the sum of the lengthScaledTPM values for each row (gene) of the rnaseq_expression_matrix (rnaseq_row_sums)
    rnaseq_row_sums = np.sum(rnaseq_expression_matrix, axis=1)

    # Calculate the 10th percentile of rnaseq_row_sums
    rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10)

    # Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix
    # TODO: There is probably a better way to do this with `np.where`
    rows_to_filter = []
    for (x, sum_val) in rnaseq_row_sums.items():
        if sum_val < rnaseq_tenth_percentile:
            rows_to_filter.append(x)

    filtered_rnaseq_matrix = rnaseq_expression_matrix.drop(rows_to_filter)

    # log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
    filtered_rnaseq_matrix_plus_one = filtered_rnaseq_matrix + 1
    log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix_plus_one)

    # Cache our RNA-Seq zero values
    cached_zeroes = {}
    for column in log2_rnaseq_matrix.columns:
        cached_zeroes[column] = np.where(log2_rnaseq_matrix[column] == 0)

    # Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of where these zeroes are
    log2_rnaseq_matrix[log2_rnaseq_matrix == 0] = np.nan

    # Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix
    combined_matrix = microarray_expression_matrix.merge(log2_rnaseq_matrix, how='outer', left_index=True, right_index=True)

    # Remove genes (rows) with >30% missing values in combined_matrix,
    # i.e. keep only rows where at least 70% of the values are present
    thresh = combined_matrix.shape[1] * .7 # (Rows, Columns)
    row_filtered_combined_matrix = combined_matrix.dropna(axis='index', thresh=thresh) # Rows with fewer than `thresh` non-NA values are dropped

    # Remove samples (columns) with <50% present values in combined_matrix
    # XXX: Find better test data for this!
    col_thresh = row_filtered_combined_matrix.shape[0] * .5
    row_col_filtered_combined_matrix_samples = row_filtered_combined_matrix.dropna(axis='columns', thresh=col_thresh)

    # "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix
    for column in cached_zeroes.keys():
        zeroes = cached_zeroes[column]

        # Skip purged columns
        if column not in row_col_filtered_combined_matrix_samples:
            continue
        
        # Place the zero
        try:
            np.put(row_col_filtered_combined_matrix_samples[column], zeroes, 0.0)
        except Exception as e:
            logger.exception("Error when replacing zero")
            continue

    # Label our new replaced data
    combined_matrix_zero = row_col_filtered_combined_matrix_samples

    # Transpose combined_matrix; transposed_matrix
    transposed_matrix = combined_matrix_zero.transpose()

    # Remove -inf and inf
    # This should never happen, but make sure it doesn't!
    transposed_matrix = transposed_matrix.replace([np.inf, -np.inf], np.nan)

    # Perform imputation of missing values with IterativeSVD (rank=10) on the transposed_matrix; imputed_matrix
    imputed_matrix = IterativeSVD(rank=10).fit_transform(transposed_matrix)

    # Untranspose imputed_matrix (genes are now rows, samples are now columns)
    untransposed_imputed_matrix = imputed_matrix.transpose()

    # Convert back to Pandas
    untransposed_imputed_matrix_df = pd.DataFrame.from_records(untransposed_imputed_matrix)
    untransposed_imputed_matrix_df.index = row_col_filtered_combined_matrix_samples.index
    untransposed_imputed_matrix_df.columns = row_col_filtered_combined_matrix_samples.columns

    # Quantile normalize imputed_matrix where genes are rows and samples are columns
    # XXX: Refactor QN target acquisition and application before doing this
    job_context['organism'] = Organism.get_object_for_name(list(job_context['input_files'].keys())[0])
    job_context['merged_no_qn'] = untransposed_imputed_matrix_df

    # Perform the Quantile Normalization
    job_context = smasher._quantile_normalize(job_context, ks_check=False)
    job_context['time_end'] = timezone.now()
    job_context['formatted_command'] = "create_compendia.py"

    return job_context
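
The dropna/thresh filtering and the zero-caching trick above are easy to get backwards, so here is a self-contained toy run of those two steps using pandas and numpy only. The data is made up, and unlike the positional np.put approach above, this sketch restores zeroes through a label-aligned boolean mask, which stays correct even after rows are dropped:

import numpy as np
import pandas as pd

matrix = pd.DataFrame(
    {
        "sample_a": [0.0, 2.0, np.nan],
        "sample_b": [1.0, np.nan, np.nan],
        "sample_c": [3.0, 0.0, np.nan],
        "sample_d": [2.0, np.nan, np.nan],
    },
    index=["gene_1", "gene_2", "gene_3"],
)

# Remember which cells were true zeroes before masking them to NA.
zero_mask = matrix == 0
matrix[matrix == 0] = np.nan

# dropna(thresh=N) KEEPS rows holding at least N non-NA values, so a
# threshold of 70% of the column count drops rows >30% missing.
thresh = matrix.shape[1] * 0.7  # 2.8 here
filtered = matrix.dropna(axis="index", thresh=thresh)
# gene_2 (three of four values missing after masking) and gene_3 (all
# missing) are dropped; gene_1 survives.

# Restore the cached zeroes in whatever rows survived.
filtered[zero_mask.reindex(filtered.index)] = 0.0
print(filtered)  # gene_1 keeps its zero in sample_a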
Code Example #6
def prepare_computed_files():
    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.num_processed_samples = 1
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS",
                                                 taxonomy_id=1001)

    sample = Sample()
    sample.accession_code = "GSM1487313"
    sample.title = "GSM1487313"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.is_processed = True
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.s3_key = "GSM1487313_liver.PCL"
    computed_file.s3_bucket = TEST_DATA_BUCKET
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRP332914"
    experiment2.num_processed_samples = 1
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = "SRR332914"
    sample2.title = "SRR332914"
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.is_processed = True
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.s3_key = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.s3_bucket = TEST_DATA_BUCKET
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()
Code Example #7
File: sra.py Project: arjunkrish/refinebio
    def _generate_experiment_and_samples(
            self,
            run_accession: str,
            study_accession: str = None) -> (Experiment, List[Sample]):
        """Generates Experiments and Samples for the provided run_accession."""
        metadata = SraSurveyor.gather_all_metadata(run_accession)

        if metadata == {}:
            if study_accession:
                logger.error(
                    "Could not discover any metadata for run.",
                    accession=run_accession,
                    study_accession=study_accession,
                )
            else:
                logger.error("Could not discover any metadata for run.",
                             accession=run_accession)
            return (None, None)  # This will cascade properly

        if DOWNLOAD_SOURCE == "ENA":
            if metadata["library_layout"] == "PAIRED":
                files_urls = [
                    _build_ena_file_url(run_accession, "_1"),
                    _build_ena_file_url(run_accession, "_2"),
                ]
            else:
                files_urls = [_build_ena_file_url(run_accession)]
        else:
            files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

        # Figure out the Organism for this sample
        organism_name = metadata.pop("organism_name", None)
        if not organism_name:
            logger.error("Could not discover organism type for run.",
                         accession=run_accession)
            return (None, None)  # This will cascade properly

        organism_name = organism_name.upper()
        organism = Organism.get_object_for_name(organism_name)

        ##
        # Experiment
        ##

        experiment_accession_code = metadata.get("study_accession")
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id,
            )
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            SraSurveyor._apply_metadata_to_experiment(experiment_object,
                                                      metadata)
            experiment_object.save()

            ##
            # Experiment Metadata
            ##
            json_xa = ExperimentAnnotation()
            json_xa.experiment = experiment_object
            json_xa.data = metadata
            json_xa.is_ccdl = False
            json_xa.save()

        ##
        # Samples
        ##

        sample_accession_code = metadata.pop("run_accession")
        # Create the sample object
        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            # If current experiment includes new protocol information,
            # merge it into the sample's existing protocol_info.
            protocol_info, is_updated = self.update_sample_protocol_info(
                sample_object.protocol_info,
                experiment_object.protocol_description,
                experiment_object.source_url,
            )
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment_object.accession_code,
                survey_job=self.survey_job.id,
            )
        except Sample.DoesNotExist:
            sample_object = Sample()
            sample_object.source_database = "SRA"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            sample_object.platform_name = metadata.get(
                "platform_instrument_model", "UNKNOWN")
            # The platform_name is human readable and contains spaces,
            # accession codes shouldn't have spaces though:
            sample_object.platform_accession_code = sample_object.platform_name.replace(
                " ", "")
            sample_object.technology = "RNA-SEQ"
            if ("ILLUMINA" in sample_object.platform_name.upper()
                    or "NEXTSEQ" in sample_object.platform_name.upper()):
                sample_object.manufacturer = "ILLUMINA"
            elif "ION TORRENT" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ION_TORRENT"
            else:
                sample_object.manufacturer = "UNKNOWN"

            SraSurveyor._apply_harmonized_metadata_to_sample(
                sample_object, metadata)

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment_object.protocol_description,
                experiment_url=experiment_object.source_url,
            )
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            for file_url in files_urls:
                original_file = OriginalFile.objects.get_or_create(
                    source_url=file_url,
                    source_filename=file_url.split("/")[-1],
                    has_raw=True)[0]
                OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment_object, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment_object, organism=organism)

        return experiment_object, [sample_object]
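
_build_ena_file_url is imported from elsewhere in this module. A sketch of the conventional ENA FASTQ FTP layout it presumably reproduces; the details below follow ENA's published directory scheme, not this project's verified helper:

def _build_ena_file_url(run_accession: str, read_suffix: str = "") -> str:
    """Build the ENA FTP URL for a run's FASTQ file (sketch)."""
    prefix = run_accession[:6]
    if len(run_accession) <= 9:
        # Short accessions (e.g. SRR123456) sit directly under the prefix.
        return "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{0}/{1}/{1}{2}.fastq.gz".format(
            prefix, run_accession, read_suffix)
    # Longer accessions add a subdirectory of their trailing digits, zero-padded to 3.
    subdir = run_accession[9:].zfill(3)
    return "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{0}/{1}/{2}/{2}{3}.fastq.gz".format(
        prefix, subdir, run_accession, read_suffix)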
Code Example #8
File: geo.py Project: erflynn/refinebio
    def create_experiment_and_samples_from_api(
            self, experiment_accession_code) -> (Experiment, List[Sample]):
        """ The main surveyor - find the Experiment and Samples from NCBI GEO.

        Uses the GEOParse library, for which docs can be found here: https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

        """
        # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
        gse = GEOparse.get_GEO(experiment_accession_code,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
        harmonized_samples = harmony.harmonize(preprocessed_samples)

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment %s already exists, skipping object creation.",
                experiment_accession_code,
                survey_job=self.survey_job.id,
            )
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            GeoSurveyor._apply_metadata_to_experiment(experiment_object, gse)
            experiment_object.save()

            experiment_annotation = ExperimentAnnotation()
            experiment_annotation.data = gse.metadata
            experiment_annotation.experiment = experiment_object
            experiment_annotation.is_ccdl = False
            experiment_annotation.save()

        # Okay, here's the situation!
        # Sometimes, samples have a direct single representation for themselves.
        # Other times, there is a single file with references to every sample in it.
        created_samples = []
        for sample_accession_code, sample in gse.gsms.items():

            try:
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)
                logger.debug(
                    "Sample %s from experiment %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_object.accession_code,
                    survey_job=self.survey_job.id,
                )

                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object,
                    organism=sample_object.organism)
            except Sample.DoesNotExist:
                organism = Organism.get_object_for_name(
                    sample.metadata["organism_ch1"][0].upper())

                sample_object = Sample()
                sample_object.source_database = "GEO"
                sample_object.accession_code = sample_accession_code
                sample_object.organism = organism

                # If there was a data processing step, it isn't raw.
                sample_object.has_raw = not sample.metadata.get(
                    "data_processing", None)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object, organism=organism)
                sample_object.title = sample.metadata["title"][0]

                self.set_platform_properties(sample_object, sample.metadata,
                                             gse)

                GeoSurveyor._apply_harmonized_metadata_to_sample(
                    sample_object, harmonized_samples[sample_object.title])

                # Sample-level protocol_info
                sample_object.protocol_info = self.get_sample_protocol_info(
                    sample.metadata, sample_accession_code)

                sample_object.save()
                logger.debug("Created Sample: " + str(sample_object))

                sample_annotation = SampleAnnotation()
                sample_annotation.sample = sample_object
                sample_annotation.data = sample.metadata
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                sample_supplements = sample.metadata.get(
                    "supplementary_file", [])
                for supplementary_file_url in sample_supplements:

                    # Why do they give us this?
                    if supplementary_file_url == "NONE":
                        break

                    # We never want these!
                    if "idat.gz" in supplementary_file_url.lower():
                        continue
                    if "chp.gz" in supplementary_file_url.lower():
                        continue
                    if "ndf.gz" in supplementary_file_url.lower():
                        continue
                    if "pos.gz" in supplementary_file_url.lower():
                        continue
                    if "pair.gz" in supplementary_file_url.lower():
                        continue
                    if "gff.gz" in supplementary_file_url.lower():
                        continue

                    # Sometimes, we are lied to about the data processing step.
                    lower_file_url = supplementary_file_url.lower()
                    if (".cel" in lower_file_url
                            or ("_non_normalized.txt" in lower_file_url)
                            or ("_non-normalized.txt" in lower_file_url)
                            or ("-non-normalized.txt" in lower_file_url)
                            or ("-non_normalized.txt" in lower_file_url)):
                        sample_object.has_raw = True
                        sample_object.save()

                    # filename and source_filename are the same for these
                    filename = FileUtils.get_filename(supplementary_file_url)
                    original_file = OriginalFile.objects.get_or_create(
                        source_url=supplementary_file_url,
                        filename=filename,
                        source_filename=filename,
                        has_raw=sample_object.has_raw,
                        is_archive=FileUtils.is_archive(filename),
                    )[0]

                    logger.debug("Created OriginalFile: " + str(original_file))

                    original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                        original_file=original_file, sample=sample_object)

                    if original_file.is_affy_data():
                        # Only Affymetrix Microarrays produce .CEL files
                        sample_object.technology = "MICROARRAY"
                        sample_object.manufacturer = "AFFYMETRIX"
                        sample_object.save()

                # It's okay to survey RNA-Seq samples from GEO, but we
                # don't actually want to download/process any RNA-Seq
                # data unless it comes from SRA.
                if sample_object.technology != "RNA-SEQ":
                    created_samples.append(sample_object)

                # Now that we've determined the technology at the
                # sample level, we can set it at the experiment level,
                # just gotta make sure to only do it once. There can
                # be more than one technology, this should be changed
                # as part of:
                # https://github.com/AlexsLemonade/refinebio/issues/1099
                if not experiment_object.technology:
                    experiment_object.technology = sample_object.technology
                    experiment_object.save()

                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

        # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
        for experiment_supplement_url in gse.metadata.get(
                "supplementary_file", []):

            # filename and source_filename are the same for these
            filename = experiment_supplement_url.split("/")[-1]
            original_file = OriginalFile.objects.get_or_create(
                source_url=experiment_supplement_url,
                filename=filename,
                source_filename=filename,
                has_raw=sample_object.has_raw,
                is_archive=True,
            )[0]

            logger.debug("Created OriginalFile: " + str(original_file))

            lower_supplement_url = experiment_supplement_url.lower()
            if (("_non_normalized.txt" in lower_supplement_url)
                    or ("_non-normalized.txt" in lower_supplement_url)
                    or ("-non-normalized.txt" in lower_supplement_url)
                    or ("-non_normalized.txt" in lower_supplement_url)):
                for sample_object in created_samples:
                    sample_object.has_raw = True
                    sample_object.save()

                    OriginalFileSampleAssociation.objects.get_or_create(
                        sample=sample_object, original_file=original_file)

            # Delete this Original file if it isn't being used.
            if (OriginalFileSampleAssociation.objects.filter(
                    original_file=original_file).count() == 0):
                original_file.delete()

        # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
        # GEO describes different types of data formatting as "families"
        family_url = self.get_miniml_url(experiment_accession_code)
        miniml_original_file = OriginalFile.objects.get_or_create(
            source_url=family_url,
            source_filename=family_url.split("/")[-1],
            has_raw=sample_object.has_raw,
            is_archive=True,
        )[0]
        for sample_object in created_samples:
            # We don't need a .txt if we have a .CEL
            if sample_object.has_raw:
                continue
            OriginalFileSampleAssociation.objects.get_or_create(
                sample=sample_object, original_file=miniml_original_file)

        # Delete this Original file if it isn't being used.
        if (OriginalFileSampleAssociation.objects.filter(
                original_file=miniml_original_file).count() == 0):
            miniml_original_file.delete()

        # Trash the temp path
        try:
            shutil.rmtree(self.get_temp_path())
        except Exception:
            # There was a problem during surveying so this didn't get created.
            # It's not a big deal.
            pass

        return experiment_object, created_samples
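
The run of six "if ... in supplementary_file_url.lower(): continue" checks above could be collapsed into one membership test. A behavior-equivalent refactoring sketch (the helper name is my own):

UNWANTED_SUFFIXES = ("idat.gz", "chp.gz", "ndf.gz", "pos.gz", "pair.gz", "gff.gz")

def is_unwanted_supplement(url: str) -> bool:
    """True for supplementary file types the surveyor never wants."""
    lower_url = url.lower()
    return any(suffix in lower_url for suffix in UNWANTED_SUFFIXES)

# In the loop body:
#     if is_unwanted_supplement(supplementary_file_url):
#         continue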
Code Example #9
File: test_smasher.py Project: Quiltomics/refinebio
    def test_log2(self):
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # Has non-log2 data:
        # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421
        # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz
        experiment = Experiment()
        experiment.accession_code = "GSE44421"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1084806'
        sample.title = 'GSM1084806'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1084806-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1084807'
        sample.title = 'GSM1084807'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1084807-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']}
        ds.aggregate_by = 'EXPERIMENT'
        ds.scale_by = 'MINMAX'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        self.assertTrue(final_context['success'])
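
This test feeds the smasher GSE44421, which ships non-log2 data. The smasher's own detection logic is not shown here; a common heuristic (an assumption, not this project's confirmed implementation) is to treat a large expression maximum as evidence of linear-scale data and transform accordingly:

import numpy as np
import pandas as pd

def ensure_log2(frame: pd.DataFrame, threshold: float = 40.0) -> pd.DataFrame:
    """Log2-transform a matrix that looks unlogged (heuristic sketch).

    Intensities already in log2 space rarely exceed ~40, so a larger
    maximum suggests the data is still on a linear scale.
    """
    if frame.max().max() > threshold:
        return np.log2(frame + 1)
    return frame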
Code Example #10
File: test_smasher.py Project: Quiltomics/refinebio
    def test_no_smash_dupe(self):
        """ """

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1237810_T09-1084.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1237811'
        sample.title = 'GSM1237811'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']}
        ds.aggregate_by = 'ALL'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        self.assertTrue(ds.success)
        for column in final_context['original_merged'].columns:
            self.assertTrue('_x' not in column)
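
The final assertion guards against pandas merge suffixes: when two frames share a column name, an index join emits _x/_y variants, which is exactly what a duplicated sample would produce. A tiny demonstration:

import pandas as pd

left = pd.DataFrame({"GSM1237810": [1.0]}, index=["gene_1"])
right = pd.DataFrame({"GSM1237810": [2.0]}, index=["gene_1"])

merged = left.merge(right, how="outer", left_index=True, right_index=True)
print(list(merged.columns))  # ['GSM1237810_x', 'GSM1237810_y']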
Code Example #11
File: test_smasher.py Project: Quiltomics/refinebio
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.DAT"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'EXPERIMENT'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'STANDARD'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    return pj
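
Usage note: tests in this module consume the helper the same way test_log2 above drives the smasher; a minimal sketch:

pj = prepare_job()
final_context = smasher.smash(pj.pk, upload=False)
assert final_context["success"]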
Code Example #12
File: test_compendia.py Project: erflynn/refinebio
    def test_create_compendia(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS",
                                                     taxonomy_id=1001)

        sample = Sample()
        sample.accession_code = "GSM1487313"
        sample.title = "GSM1487313"
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # Missing sample that will be filtered
        sample = Sample()
        sample.accession_code = "GSM1487222"
        sample.title = "this sample will be filtered"
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487222_empty.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/doesnt_exists.PCL"
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = "SRS332914"
        sample2.title = "SRS332914"
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        dset = Dataset()
        dset.data = {
            "GSE1487313": ["GSM1487313", "GSM1487222"],
            "SRX332914": ["SRS332914"]
        }
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertFalse(job.success)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"],
            "GSE1487313",
        )
Code Example #13
    def _generate_experiment_and_samples(
            self,
            run_accession: str,
            study_accession: str = None) -> (Experiment, List[Sample]):
        """Generates Experiments and Samples for the provided run_accession."""
        metadata = SraSurveyor.gather_all_metadata(run_accession)

        if metadata == {}:
            if study_accession:
                logger.error("Could not discover any metadata for run.",
                             accession=run_accession,
                             study_accession=study_accession)
            else:
                logger.error("Could not discover any metadata for run.",
                             accession=run_accession)
            return (None, None)  # This will cascade properly

        if DOWNLOAD_SOURCE == "ENA":
            if metadata["library_layout"] == "PAIRED":
                files_urls = [
                    SraSurveyor._build_ena_file_url(run_accession, "_1"),
                    SraSurveyor._build_ena_file_url(run_accession, "_2")
                ]
            else:
                files_urls = [SraSurveyor._build_ena_file_url(run_accession)]
        else:
            files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

        # Figure out the Organism for this sample
        organism_name = metadata.pop("organism_name", None)
        if not organism_name:
            logger.error("Could not discover organism type for run.",
                         accession=run_accession)
            return (None, None)  # This will cascade properly

        organism_name = organism_name.upper()
        organism = Organism.get_object_for_name(organism_name)

        ##
        # Experiment
        ##

        experiment_accession_code = metadata.get('study_accession')
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = ENA_URL_TEMPLATE.format(
                experiment_accession_code)
            experiment_object.source_database = "SRA"
            experiment_object.technology = "RNA-SEQ"

            # We don't get this value from the API, unfortunately.
            # experiment_object.platform_accession_code = experiment["platform_accession_code"]

            if not experiment_object.description:
                experiment_object.description = "No description."

            if "study_title" in metadata:
                experiment_object.title = metadata["study_title"]
            if "study_abstract" in metadata:
                experiment_object.description = metadata["study_abstract"]
            if "lab_name" in metadata:
                experiment_object.submitter_institution = metadata["lab_name"]
            if "experiment_design_description" in metadata:
                experiment_object.protocol_description = metadata[
                    "experiment_design_description"]
            if "pubmed_id" in metadata:
                experiment_object.pubmed_id = metadata["pubmed_id"]
                experiment_object.has_publication = True
            if "study_ena_first_public" in metadata:
                experiment_object.source_first_published = parse_datetime(
                    metadata["study_ena_first_public"])
            if "study_ena_last_update" in metadata:
                experiment_object.source_last_modified = parse_datetime(
                    metadata["study_ena_last_update"])

            # Rare, but it happens.
            if not experiment_object.protocol_description:
                experiment_object.protocol_description = metadata.get(
                    "library_construction_protocol",
                    "Protocol was never provided.")
            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

            ##
            # Experiment Metadata
            ##
            json_xa = ExperimentAnnotation()
            json_xa.experiment = experiment_object
            json_xa.data = metadata
            json_xa.is_ccdl = False
            json_xa.save()

        ##
        # Samples
        ##

        sample_accession_code = metadata.pop('run_accession')
        # Create the sample object
        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            # If current experiment includes new protocol information,
            # merge it into the sample's existing protocol_info.
            protocol_info, is_updated = self.update_sample_protocol_info(
                sample_object.protocol_info,
                experiment_object.protocol_description,
                experiment_object.source_url)
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment_object.accession_code,
                survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()
            sample_object.source_database = "SRA"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            sample_object.platform_name = metadata.get(
                "platform_instrument_model", "UNKNOWN")
            # The platform_name is human readable and contains spaces,
            # accession codes shouldn't have spaces though:
            sample_object.platform_accession_code = sample_object.platform_name.replace(
                " ", "")
            sample_object.technology = "RNA-SEQ"
            if "ILLUMINA" in sample_object.platform_name.upper() \
            or "NEXTSEQ" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ILLUMINA"
            elif "ION TORRENT" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ION_TORRENT"
            else:
                sample_object.manufacturer = "UNKNOWN"

            # Directly apply the harmonized values
            sample_object.title = harmony.extract_title(metadata)
            harmonized_sample = harmony.harmonize([metadata])
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment_object.protocol_description,
                experiment_url=experiment_object.source_url)
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            for file_url in files_urls:
                original_file = OriginalFile.objects.get_or_create(
                    source_url=file_url,
                    source_filename=file_url.split('/')[-1],
                    has_raw=True)[0]
                original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment_object, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment_object, organism=organism)

        return experiment_object, [sample_object]
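
Code Example #7 calls SraSurveyor._apply_metadata_to_experiment where this older version inlines the field mapping. A sketch of how that extraction might look on the SraSurveyor class, built only from the assignments shown above (the actual refactored helper may differ):

    @staticmethod
    def _apply_metadata_to_experiment(experiment_object, metadata):
        """Copy SRA study metadata onto an Experiment (refactoring sketch)."""
        experiment_object.source_database = "SRA"
        experiment_object.technology = "RNA-SEQ"
        if "study_title" in metadata:
            experiment_object.title = metadata["study_title"]
        if "study_abstract" in metadata:
            experiment_object.description = metadata["study_abstract"]
        if "lab_name" in metadata:
            experiment_object.submitter_institution = metadata["lab_name"]
        if "pubmed_id" in metadata:
            experiment_object.pubmed_id = metadata["pubmed_id"]
            experiment_object.has_publication = True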
Code Example #14
def setup_experiments() -> None:
    """Creates three experiments for testing purposes.

    One experiment will not have a GSE* accession code.
    Both experiments with GSE* accession codes will have two
    samples. One has a sample that has incorrect platform information
    so the experiment will need to be re-surveyed.
    """
    organism = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)

    # Experiment that needs to be re-surveyed
    experiment = Experiment.objects.create(accession_code="GSE12417",
                                           technology="MICROARRAY",
                                           source_database="GEO")

    # Correct platform
    sample = Sample.objects.create(
        accession_code="GSM311750",
        source_database="GEO",
        technology="MICROARRAY",
        platform_accession_code="hgu133a",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)

    # Incorrect Platform
    sample = Sample.objects.create(
        accession_code="GSM316652",
        organism=organism,
        source_database="GEO",
        technology="MICROARRAY",
        platform_accession_code="hgu133a",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)

    # Experiment that does not need to be re-surveyed
    experiment = Experiment.objects.create(accession_code="GSE9890",
                                           technology="MICROARRAY",
                                           source_database="GEO")

    sample = Sample.objects.create(
        accession_code="GSM249671",
        source_database="GEO",
        technology="MICROARRAY",
        platform_accession_code="hgu133plus2",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)

    sample = Sample.objects.create(
        accession_code="GSM249672",
        organism=organism,
        source_database="GEO",
        technology="MICROARRAY",
        platform_accession_code="hgu133plus2",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)

    # Experiment that isn't even the right source_database.
    experiment = Experiment.objects.create(accession_code="SRP12345",
                                           technology="RNA-SEQ",
                                           source_database="SRA")

    sample = Sample.objects.create(
        accession_code="SRR123145",
        organism=organism,
        source_database="SRA",
        technology="RNA-SEQ",
        platform_accession_code="IlluminaHiSeq1000",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)

    # This second sample is just to make checking things easy because
    # all experiments start with 2 samples in these tests.
    sample = Sample.objects.create(
        accession_code="SRR123146",
        organism=organism,
        source_database="SRA",
        technology="RNA-SEQ",
        platform_accession_code="IlluminaHiSeq1000",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)
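
For context, a fixture like this would typically feed a check that compares each GEO sample's stored platform against the platform GEO currently reports. The helper below is a hypothetical sketch of such a check, assuming refinebio's Django models (Experiment with its samples relation); the name find_experiments_to_resurvey and the expected_platforms mapping are illustrative and not part of refinebio:

from typing import Dict, List


def find_experiments_to_resurvey(expected_platforms: Dict[str, str]) -> List[str]:
    """Return accession codes of GEO experiments with a mismatched sample platform.

    expected_platforms maps sample accession codes to the platform GEO
    currently reports for them (hypothetical input, for illustration only).
    """
    to_resurvey = set()
    for experiment in Experiment.objects.filter(source_database="GEO"):
        for sample in experiment.samples.all():
            expected = expected_platforms.get(sample.accession_code)
            if expected and expected != sample.platform_accession_code:
                to_resurvey.add(experiment.accession_code)
    return sorted(to_resurvey)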
Code example #15
File: geo.py Project: modulexcite/refinebio
    def create_experiment_and_samples_from_api(
            self, experiment_accession_code) -> (Experiment, List[Sample]):
        """ The main surveyor - find the Experiment and Samples from NCBI GEO.

        Uses the GEOParse library, for which docs can be found here: https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

        """
        # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
        gse = GEOparse.get_GEO(experiment_accession_code,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
        harmonized_samples = harmony.harmonize(preprocessed_samples)

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment %s already exists, skipping object creation.",
                experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = (
                "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
                experiment_accession_code)
            experiment_object.source_database = "GEO"
            experiment_object.title = gse.metadata.get('title', [''])[0]
            experiment_object.description = gse.metadata.get('summary',
                                                             [''])[0]

            # Source doesn't provide time information, assume midnight.
            submission_date = (gse.metadata["submission_date"][0]
                               + " 00:00:00 UTC")
            experiment_object.source_first_published = dateutil.parser.parse(
                submission_date)
            last_updated_date = (gse.metadata["last_update_date"][0]
                                 + " 00:00:00 UTC")
            experiment_object.source_last_updated = dateutil.parser.parse(
                last_updated_date)

            unique_institutions = list(set(gse.metadata["contact_institute"]))
            experiment_object.submitter_institution = ", ".join(
                unique_institutions)
            experiment_object.pubmed_id = gse.metadata.get("pubmed_id",
                                                           [""])[0]

            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

            experiment_annotation = ExperimentAnnotation()
            experiment_annotation.data = gse.metadata
            experiment_annotation.experiment = experiment_object
            experiment_annotation.is_ccdl = False
            experiment_annotation.save()

        # Okay, here's the situation!
        # Sometimes, samples have a direct single representation for themselves.
        # Other times, there is a single file with references to every sample in it.
        created_samples = []
        for sample_accession_code, sample in gse.gsms.items():

            try:
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)
                logger.debug(
                    "Sample %s from experiment %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_object.accession_code,
                    survey_job=self.survey_job.id)

                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object,
                    organism=sample_object.organism)
            except Sample.DoesNotExist:
                organism = Organism.get_object_for_name(
                    sample.metadata['organism_ch1'][0].upper())

                sample_object = Sample()
                sample_object.source_database = "GEO"
                sample_object.accession_code = sample_accession_code
                sample_object.organism = organism

                # If there was a data processing step, the data isn't raw.
                sample_object.has_raw = not sample.metadata.get(
                    'data_processing', None)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object, organism=organism)
                sample_object.title = sample.metadata['title'][0]

                self.set_platform_properties(sample_object, sample.metadata,
                                             gse)

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[sample_object.title]
                for key, value in harmonized_sample.items():
                    setattr(sample_object, key, value)

                # Sample-level protocol_info
                sample_object.protocol_info = self.get_sample_protocol_info(
                    sample.metadata, sample_accession_code)

                sample_object.save()
                logger.debug("Created Sample: " + str(sample_object))

                sample_annotation = SampleAnnotation()
                sample_annotation.sample = sample_object
                sample_annotation.data = sample.metadata
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                sample_supplements = sample.metadata.get(
                    'supplementary_file', [])
                for supplementary_file_url in sample_supplements:

                    # Why do they give us this?
                    if supplementary_file_url == "NONE":
                        break

                    # We never want these file types.
                    unwanted_extensions = ("idat.gz", "chp.gz", "ndf.gz",
                                           "pos.gz", "pair.gz", "gff.gz")
                    if any(extension in supplementary_file_url.lower()
                           for extension in unwanted_extensions):
                        continue

                    # Sometimes, we are lied to about the data processing step.
                    lower_file_url = supplementary_file_url.lower()
                    raw_data_markers = ('.cel', '_non_normalized.txt',
                                        '_non-normalized.txt',
                                        '-non-normalized.txt',
                                        '-non_normalized.txt')
                    if any(marker in lower_file_url for marker in raw_data_markers):
                        sample_object.has_raw = True
                        sample_object.save()

                    # filename and source_filename are the same for these
                    filename = supplementary_file_url.split('/')[-1]
                    original_file = OriginalFile.objects.get_or_create(
                        source_url=supplementary_file_url,
                        filename=filename,
                        source_filename=filename,
                        has_raw=sample_object.has_raw,
                        is_archive=True)[0]

                    logger.debug("Created OriginalFile: " + str(original_file))

                    OriginalFileSampleAssociation.objects.get_or_create(
                        original_file=original_file, sample=sample_object)

                    if original_file.is_affy_data():
                        # Only Affymetrix Microarrays produce .CEL files
                        sample_object.technology = 'MICROARRAY'
                        sample_object.manufacturer = 'AFFYMETRIX'
                        sample_object.save()

                # It's okay to survey RNA-Seq samples from GEO, but we
                # don't actually want to download/process any RNA-Seq
                # data unless it comes from SRA.
                if sample_object.technology != 'RNA-SEQ':
                    created_samples.append(sample_object)

                # Now that we've determined the technology at the
                # sample level, we can set it at the experiment level,
                # just gotta make sure to only do it once. There can
                # be more than one technology, this should be changed
                # as part of:
                # https://github.com/AlexsLemonade/refinebio/issues/1099
                if not experiment_object.technology:
                    experiment_object.technology = sample_object.technology
                    experiment_object.save()

                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

        # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
        for experiment_supplement_url in gse.metadata.get(
                'supplementary_file', []):

            # filename and source_filename are the same for these
            filename = experiment_supplement_url.split('/')[-1]
            original_file = OriginalFile.objects.get_or_create(
                source_url=experiment_supplement_url,
                filename=filename,
                source_filename=filename,
                has_raw=sample_object.has_raw,
                is_archive=True)[0]

            logger.debug("Created OriginalFile: " + str(original_file))

            lower_supplement_url = experiment_supplement_url.lower()
            non_normalized_markers = ('_non_normalized.txt',
                                      '_non-normalized.txt',
                                      '-non-normalized.txt',
                                      '-non_normalized.txt')
            if any(marker in lower_supplement_url for marker in non_normalized_markers):
                for sample_object in created_samples:
                    sample_object.has_raw = True
                    sample_object.save()

                    OriginalFileSampleAssociation.objects.get_or_create(
                        sample=sample_object, original_file=original_file)

            # Delete this Original file if it isn't being used.
            if OriginalFileSampleAssociation.objects.filter(
                    original_file=original_file).count() == 0:
                original_file.delete()

        # These are the MINiML/SOFT/Matrix URLs that are always(?) provided.
        # GEO describes different types of data formatting as "families".
        family_url = self.get_miniml_url(experiment_accession_code)
        miniml_original_file = OriginalFile.objects.get_or_create(
            source_url=family_url,
            source_filename=family_url.split('/')[-1],
            has_raw=sample_object.has_raw,
            is_archive=True)[0]
        for sample_object in created_samples:
            # We don't need a .txt if we have a .CEL
            if sample_object.has_raw:
                continue
            OriginalFileSampleAssociation.objects.get_or_create(
                sample=sample_object, original_file=miniml_original_file)

        # Delete this Original file if it isn't being used.
        if OriginalFileSampleAssociation.objects.filter(
                original_file=miniml_original_file).count() == 0:
            miniml_original_file.delete()

        # Trash the temp path
        try:
            shutil.rmtree(self.get_temp_path())
        except Exception:
            # There was a problem during surveying so this didn't get created.
            # It's not a big deal.
            pass

        return experiment_object, created_samples
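
For quick reference, the GEOparse calls this surveyor leans on look roughly like the sketch below. The accession code and destination directory are illustrative, and how="brief" skips downloading the full per-sample data tables:

import GEOparse

# Fetch a GEO series record; "brief" avoids pulling full expression tables.
gse = GEOparse.get_GEO("GSE9890", destdir="/tmp", how="brief", silent=True)

# Series-level metadata is a dict mapping keys to lists of strings.
print(gse.metadata.get("title", [""])[0])
print(gse.metadata.get("submission_date", [""])[0])

# gse.gsms maps sample accession codes to GSM objects with their own metadata.
for accession, gsm in gse.gsms.items():
    print(accession, gsm.metadata.get("organism_ch1", ["?"])[0])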
Code example #16
    def test_create_compendia(self):
        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

        sample = Sample()
        sample.accession_code = 'GSM1487313'
        sample.title = 'GSM1487313'
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = 'SRS332914'
        sample2.title = 'SRS332914'
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        dset = Dataset()
        dset.data = {'GSE1487313': ['GSM1487313'], 'SRS332914': ['SRS332914']}
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)
Code example #17
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

        micros = []
        for file in os.listdir('/home/user/data_store/raw/TEST/MICROARRAY/'):

            if 'microarray.txt' in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "MICROARRAY"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir('/home/user/data_store/raw/TEST/RNASEQ/'):

            if 'rnaseq.txt' in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "RNASEQ"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            rnas.append(file)

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = '/home/user/data_store/QN/danio_target.tsv'
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data['organism_id'] = danio_rerio.id
        cra.data['is_qn'] = True
        cra.result = result
        cra.save()

        dset = Dataset()
        dset.data = {'GSE1234': micros, 'GSE5678': rnas}
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Verify result
        self.assertEqual(len(final_context['computed_files']), 3)
        for file in final_context['computed_files']:
            self.assertTrue(os.path.exists(file.absolute_file_path))
Code example #18
File: array_express.py Project: Quiltomics/refinebio
    def create_samples_from_api(self, experiment: Experiment,
                                platform_dict: Dict) -> List[Sample]:
        """Generates a Sample item for each sample in an AE experiment.

        There are many possible data situations for a sample:

            - If the sample only has raw data available:
                - If it is on a platform that we support:
                    Download this raw data and process it
                - If it is not on a platform we support:
                    Don't download anything, don't process anything
            - If the sample has both raw and derived data:
                - If the raw data is on a platform we support:
                    Download the raw data and process it, abandon the derived data
                - If the raw data is not on a platform we support
                    Download the derived data and no-op it, abandon the raw data
            - If the sample only has derived data:
                Download the derived data and no-op it.

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples

        """

        created_samples = []

        samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
        r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
        samples = r.json()["experiment"]["sample"]

        # The SDRF is the complete metadata record on a sample/property basis.
        # We run this through our harmonizer and then attach the properties
        # to our created samples.
        SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
        sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
        sdrf_samples = harmony.parse_sdrf(sdrf_url)
        harmonized_samples = harmony.harmonize(sdrf_samples)

        # An experiment can have many samples
        for sample_data in samples:

            # For some reason, this sample has no files associated with it.
            if "file" not in sample_data or len(sample_data['file']) == 0:
                continue

            # Each sample is given an experimentally unique title.
            flat_sample = utils.flatten(sample_data)
            title = harmony.extract_title(flat_sample)

            # A sample may actually have many sub files.
            # If there is raw data, take that.
            # If not, take the derived.
            has_raw = False
            for sub_file in sample_data['file']:

                # For ex: E-GEOD-15645
                if isinstance(sub_file['comment'], list):
                    sub_file_mod = sub_file
                    sub_file_mod['comment'] = sub_file['comment'][0]
                else:
                    sub_file_mod = sub_file

                # Some have the 'data' field, but not the actual data
                # Ex: E-GEOD-9656
                if sub_file_mod['type'] == "data" and sub_file_mod[
                        'comment'].get('value', None) != None:
                    has_raw = True
                if 'raw' in sub_file_mod['comment'].get('value', ''):
                    has_raw = True

            skip_sample = False
            for sub_file in sample_data['file']:

                # Don't get the raw data if it's only a 1-color sample.
                if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data):
                    has_raw = False

                # Skip derived data if we have it raw.
                if has_raw and "derived data" in sub_file['type']:
                    continue

                download_url = None
                filename = sub_file["name"]

                # sub_file["comment"] is only a list if there's
                # more than one comment...
                comments = sub_file["comment"]
                if isinstance(comments, list):
                    # Could be: "Derived ArrayExpress Data Matrix FTP
                    # file" or: "ArrayExpress FTP file". If there is
                    # no comment with a name including "FTP file" then
                    # we don't know where to download it so we need to
                    # mark this job as an error. Therefore don't catch
                    # the potential exception where download_url
                    # doesn't get defined.
                    for comment in comments:
                        if "FTP file" in comment["name"]:
                            download_url = comment["value"]
                            break
                else:
                    download_url = comments["value"]

                if not download_url:
                    # The sample accession code isn't determined until after
                    # this loop, so log the sample title instead.
                    logger.error(
                        "Sample %s did not specify a download url, skipping.",
                        title,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file)
                    skip_sample = True
                    continue

                if not filename:
                    logger.error(
                        "Sample %s did not specify a filename, skipping.",
                        title,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file)
                    skip_sample = True
                    continue

            if skip_sample:
                continue

            # The accession code is not a simple matter to determine.
            sample_source_name = sample_data["source"].get("name", "")
            sample_assay_name = sample_data["assay"].get("name", "")
            sample_accession_code = self.determine_sample_accession(
                experiment.accession_code, sample_source_name,
                sample_assay_name, filename)

            # Figure out the Organism for this sample
            organism_name = UNKNOWN
            for characteristic in sample_data["characteristic"]:
                if characteristic["category"].upper() == "ORGANISM":
                    organism_name = characteristic["value"].upper()

            if organism_name == UNKNOWN:
                logger.error(
                    "Sample %s did not specify the organism name.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id)
                continue

            organism = Organism.get_object_for_name(organism_name)

            # Create the sample object
            try:
                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)

                # If input experiment includes new protocol information,
                # update sample's protocol_info.
                existing_protocols = sample_object.protocol_info
                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols, experiment.protocol_description,
                    experiment.source_url + '/protocols')
                if is_updated:
                    sample_object.protocol_info = protocol_info
                    sample_object.save()

                logger.debug(
                    "Sample %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id)
            except Sample.DoesNotExist:
                sample_object = Sample()

                # The basics
                sample_object.source_database = "ARRAY_EXPRESS"
                sample_object.title = title
                sample_object.accession_code = sample_accession_code
                sample_object.source_archive_url = samples_endpoint
                sample_object.organism = organism
                sample_object.platform_name = platform_dict[
                    "platform_accession_name"]
                sample_object.platform_accession_code = platform_dict[
                    "platform_accession_code"]
                sample_object.manufacturer = platform_dict["manufacturer"]
                sample_object.technology = "MICROARRAY"

                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols=[],
                    experiment_protocol=experiment.protocol_description,
                    protocol_url=experiment.source_url + '/protocols')
                # Do not check is_updated the first time because we must
                # save a list so we can append to it later.
                sample_object.protocol_info = protocol_info

                sample_object.save()

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[title]
                for key, value in harmonized_sample.items():
                    setattr(sample_object, key, value)
                sample_object.save()

                sample_annotation = SampleAnnotation()
                sample_annotation.data = sample_data
                sample_annotation.sample = sample_object
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                original_file = OriginalFile()
                original_file.filename = filename
                original_file.source_filename = filename
                original_file.source_url = download_url
                original_file.is_downloaded = False
                original_file.is_archive = True
                original_file.has_raw = has_raw
                original_file.save()

                OriginalFileSampleAssociation.objects.create(
                    original_file=original_file, sample=sample_object)

                created_samples.append(sample_object)

                logger.debug(
                    "Created " + str(sample_object),
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sample=sample_object.id)

            # Create associations if they don't already exist
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment, organism=organism)

        return created_samples
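
The raw-versus-derived decision tree described in the docstring above can be restated as a small pure function. The sketch below is illustrative only; the function name and its boolean inputs are hypothetical and not part of refinebio:

from typing import Optional


def choose_download(has_raw: bool, raw_platform_supported: bool,
                    has_derived: bool) -> Optional[str]:
    """Decide which data to download for a sample, per the rules above.

    Returns "raw" (download and process), "derived" (download and no-op),
    or None (download nothing).
    """
    if has_raw and raw_platform_supported:
        return "raw"      # process the raw data, abandon any derived data
    if has_derived:
        return "derived"  # no-op the derived data, abandon unsupported raw
    return None           # raw-only on an unsupported platform: skip it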
Code example #19
File: test_smasher.py Project: Quiltomics/refinebio
    def test_bad_overlap(self):

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {'hi': 'friend'}
        sample_annotation.sample = sample
        sample_annotation.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "big.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1237812'
        sample.title = 'GSM1237812'
        sample.organism = homo_sapiens
        sample.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "small.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
        ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
        ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # Now, make sure the bad can't zero this out.
        sample = Sample()
        sample.accession_code = 'GSM999'
        sample.title = 'GSM999'
        sample.organism = homo_sapiens
        sample.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "bad.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']}
        ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
        ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        self.assertEqual(len(final_context['final_frame']), 4)
Code example #20
File: create_compendia.py Project: erflynn/refinebio
def _perform_imputation(job_context: Dict) -> Dict:
    """

    Take the inputs and perform the primary imputation.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283:
     - Combine all microarray samples with a full join to form a
       microarray_expression_matrix (this may end up being a DataFrame).
     - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join
       to form a rnaseq_expression_matrix.
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of
       the rnaseq_expression_matrix (rnaseq_row_sums).
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of
       rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of
       where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and
       log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero
       again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on
       the transposed_matrix; imputed_matrix
        -- with specified svd algorithm or skip
     - Untranspose imputed_matrix (genes are now rows, samples are now columns)
     - Quantile normalize imputed_matrix where genes are rows and samples are columns

    """
    imputation_start = log_state("start perform imputation",
                                 job_context["job"].id)
    job_context["time_start"] = timezone.now()
    rnaseq_row_sums_start = log_state("start rnaseq row sums",
                                      job_context["job"].id)

    # We can potentially have a microarray-only compendium, but not an RNA-Seq-only one.
    log2_rnaseq_matrix = None
    # Initialized here so the zero-restoration loop below is safe even when
    # there is no RNA-Seq data.
    cached_zeroes = {}
    if job_context["rnaseq_matrix"] is not None:
        # Drop any samples (columns) that are entirely NULL in the RNA-Seq matrix
        job_context["rnaseq_matrix"] = job_context["rnaseq_matrix"].dropna(
            axis="columns", how="all")

        # Calculate the sum of the lengthScaledTPM values for each row
        # (gene) of the rnaseq_matrix (rnaseq_row_sums)
        rnaseq_row_sums = np.sum(job_context["rnaseq_matrix"], axis=1)

        log_state("end rnaseq row sums", job_context["job"].id,
                  rnaseq_row_sums_start)
        rnaseq_decile_start = log_state("start rnaseq decile",
                                        job_context["job"].id)

        # Calculate the 10th percentile of rnaseq_row_sums
        rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10)

        log_state("end rnaseq decile", job_context["job"].id,
                  rnaseq_decile_start)
        drop_start = log_state("drop all rows", job_context["job"].id)
        # Drop all rows in rnaseq_matrix with a row sum < 10th
        # percentile of rnaseq_row_sums; this is now
        # filtered_rnaseq_matrix
        # TODO: There is probably a better way to do this with `np.where`.
        rows_to_filter = []
        for (x, sum_val) in rnaseq_row_sums.items():
            if sum_val < rnaseq_tenth_percentile:
                rows_to_filter.append(x)

        del rnaseq_row_sums

        log_state("actually calling drop()", job_context["job"].id)

        filtered_rnaseq_matrix = job_context.pop("rnaseq_matrix").drop(
            rows_to_filter)

        del rows_to_filter

        log_state("end drop all rows", job_context["job"].id, drop_start)
        log2_start = log_state("start log2", job_context["job"].id)

        # log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
        filtered_rnaseq_matrix_plus_one = filtered_rnaseq_matrix + 1
        log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix_plus_one)
        del filtered_rnaseq_matrix_plus_one
        del filtered_rnaseq_matrix

        log_state("end log2", job_context["job"].id, log2_start)
        cache_start = log_state("start caching zeroes", job_context["job"].id)

        # Cache the locations of our RNA-Seq zero values so they can be
        # restored after filtering and imputation.
        for column in log2_rnaseq_matrix.columns:
            cached_zeroes[column] = log2_rnaseq_matrix.index[np.where(
                log2_rnaseq_matrix[column] == 0)]

        # Set all zero values in log2_rnaseq_matrix to NA, but make sure
        # to keep track of where these zeroes are
        log2_rnaseq_matrix[log2_rnaseq_matrix == 0] = np.nan

        log_state("end caching zeroes", job_context["job"].id, cache_start)

    outer_merge_start = log_state("start outer merge", job_context["job"].id)

    # Perform a full outer join of microarray_matrix and
    # log2_rnaseq_matrix; combined_matrix
    if log2_rnaseq_matrix is not None:
        combined_matrix = job_context.pop("microarray_matrix").merge(
            log2_rnaseq_matrix, how="outer", left_index=True, right_index=True)
    else:
        logger.info("Building compendia with only microarray data.",
                    job_id=job_context["job"].id)
        combined_matrix = job_context.pop("microarray_matrix")

    log_state("ran outer merge, now deleteing log2_rnaseq_matrix",
              job_context["job"].id)

    del log2_rnaseq_matrix

    log_state("end outer merge", job_context["job"].id, outer_merge_start)
    drop_na_genes_start = log_state("start drop NA genes",
                                    job_context["job"].id)

    # # Visualize Prefiltered
    # output_path = job_context['output_dir'] + "pre_filtered_" + str(time.time()) + ".png"
    # visualized_prefilter = visualize.visualize(combined_matrix.copy(), output_path)

    # Remove genes (rows) with fewer than 70% present values (i.e., more than
    # 30% missing values) in combined_matrix
    thresh = combined_matrix.shape[1] * 0.7  # (Rows, Columns)
    # Everything below `thresh` is dropped
    row_filtered_matrix = combined_matrix.dropna(axis="index", thresh=thresh)

    del combined_matrix
    del thresh

    log_state("end drop NA genes", job_context["job"].id, drop_na_genes_start)
    drop_na_samples_start = log_state("start drop NA samples",
                                      job_context["job"].id)

    # # Visualize Row Filtered
    # output_path = job_context['output_dir'] + "row_filtered_" + str(time.time()) + ".png"
    # visualized_rowfilter = visualize.visualize(row_filtered_matrix.copy(), output_path)

    # Remove samples (columns) with <50% present values in combined_matrix
    # XXX: Find better test data for this!
    col_thresh = row_filtered_matrix.shape[0] * 0.5
    row_col_filtered_matrix_samples = row_filtered_matrix.dropna(
        axis="columns", thresh=col_thresh)
    row_col_filtered_matrix_samples_index = row_col_filtered_matrix_samples.index
    row_col_filtered_matrix_samples_columns = row_col_filtered_matrix_samples.columns

    log_state("end drop NA genes", job_context["job"].id,
              drop_na_samples_start)
    replace_zeroes_start = log_state("start replace zeroes",
                                     job_context["job"].id)

    for sample_accession_code in row_filtered_matrix.columns:
        if sample_accession_code not in row_col_filtered_matrix_samples_columns:
            sample = Sample.objects.get(accession_code=sample_accession_code)
            sample_metadata = sample.to_metadata_dict()
            job_context["filtered_samples"][sample_accession_code] = {
                **sample_metadata,
                "reason":
                "Sample was dropped because it had less than 50% present values.",
                "experiment_accession_code":
                smashing_utils.get_experiment_accession(
                    sample.accession_code, job_context["dataset"].data),
            }

    del row_filtered_matrix

    # # Visualize Row and Column Filtered
    # output_path = job_context['output_dir'] + "row_col_filtered_" + str(time.time()) + ".png"
    # visualized_rowcolfilter = visualize.visualize(row_col_filtered_matrix_samples.copy(),
    #                                               output_path)

    # "Reset" zero values that were set to NA in RNA-seq samples
    # (i.e., make these zero again) in combined_matrix
    for column in cached_zeroes.keys():
        zeroes = cached_zeroes[column]

        # Skip purged columns
        if column not in row_col_filtered_matrix_samples:
            continue

        # Place the zero
        try:
            # This generates a warning, so use loc[] instead
            # row_col_filtered_matrix_samples[column].replace(zeroes, 0.0, inplace=True)
            zeroes_list = zeroes.tolist()
            new_index_list = row_col_filtered_matrix_samples_index.tolist()
            new_zeroes = list(set(new_index_list) & set(zeroes_list))
            row_col_filtered_matrix_samples[column].loc[new_zeroes] = 0.0
        except Exception:
            logger.warn("Error when replacing zero")
            continue

    log_state("end replace zeroes", job_context["job"].id,
              replace_zeroes_start)
    transposed_zeroes_start = log_state("start replacing transposed zeroes",
                                        job_context["job"].id)

    # Label our new replaced data
    combined_matrix_zero = row_col_filtered_matrix_samples
    del row_col_filtered_matrix_samples

    transposed_matrix_with_zeros = combined_matrix_zero.T
    del combined_matrix_zero

    # Remove -inf and inf
    # This should never happen, but make sure it doesn't!
    transposed_matrix = transposed_matrix_with_zeros.replace([np.inf, -np.inf],
                                                             np.nan)
    del transposed_matrix_with_zeros

    log_state("end replacing transposed zeroes", job_context["job"].id,
              transposed_zeroes_start)

    # Store the absolute/percentages of imputed values
    matrix_sum = transposed_matrix.isnull().sum()
    percent = (matrix_sum /
               transposed_matrix.isnull().count()).sort_values(ascending=False)
    total_percent_imputed = sum(percent) / len(transposed_matrix.count())
    job_context["total_percent_imputed"] = total_percent_imputed
    logger.info("Total percentage of data to impute!",
                total_percent_imputed=total_percent_imputed)

    # Perform imputation of missing values with IterativeSVD (rank=10) on the
    # transposed_matrix; imputed_matrix
    svd_algorithm = job_context["dataset"].svd_algorithm
    if svd_algorithm != "NONE":
        svd_start = log_state("start SVD", job_context["job"].id)

        logger.info("IterativeSVD algorithm: %s" % svd_algorithm)
        svd_algorithm = str.lower(svd_algorithm)
        imputed_matrix = IterativeSVD(
            rank=10,
            svd_algorithm=svd_algorithm).fit_transform(transposed_matrix)

        svd_start = log_state("end SVD", job_context["job"].id, svd_start)
    else:
        imputed_matrix = transposed_matrix
        logger.info("Skipping IterativeSVD")
    del transposed_matrix

    untranspose_start = log_state("start untranspose", job_context["job"].id)

    # Untranspose imputed_matrix (genes are now rows, samples are now columns)
    untransposed_imputed_matrix = imputed_matrix.T
    del imputed_matrix

    # Convert back to Pandas
    untransposed_imputed_matrix_df = pd.DataFrame.from_records(
        untransposed_imputed_matrix)
    untransposed_imputed_matrix_df.index = row_col_filtered_matrix_samples_index
    untransposed_imputed_matrix_df.columns = row_col_filtered_matrix_samples_columns
    del untransposed_imputed_matrix
    del row_col_filtered_matrix_samples_index
    del row_col_filtered_matrix_samples_columns
    # Quantile normalize imputed_matrix where genes are rows and samples are columns
    job_context["organism"] = Organism.get_object_for_name(
        job_context["organism_name"])
    job_context["merged_no_qn"] = untransposed_imputed_matrix_df
    # output_path = job_context['output_dir'] + "compendia_no_qn_" + str(time.time()) + ".png"
    # visualized_merged_no_qn = visualize.visualize(untransposed_imputed_matrix_df.copy(),
    #                                               output_path)

    log_state("end untranspose", job_context["job"].id, untranspose_start)
    quantile_start = log_state("start quantile normalize",
                               job_context["job"].id)

    # Perform the Quantile Normalization
    job_context = smashing_utils.quantile_normalize(job_context,
                                                    ks_check=False)

    log_state("end quantile normalize", job_context["job"].id, quantile_start)

    # Visualize Final Compendia
    # output_path = job_context['output_dir'] + "compendia_with_qn_" + str(time.time()) + ".png"
    # visualized_merged_qn = visualize.visualize(job_context['merged_qn'].copy(), output_path)

    job_context["time_end"] = timezone.now()
    job_context["formatted_command"] = ["create_compendia.py"]
    log_state("end prepare imputation", job_context["job"].id,
              imputation_start)
    return job_context
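
To make the filtering and zero-masking steps above concrete, here is a self-contained toy version of the same pipeline on a tiny DataFrame. It is a minimal sketch, not refinebio code: it uses plain pandas/NumPy, and the IterativeSVD step is replaced by a simple column-mean fill so the example has no imputation-library dependency:

import numpy as np
import pandas as pd

# Toy RNA-seq matrix: genes are rows, samples are columns.
rnaseq = pd.DataFrame(
    {"sample_a": [0.0, 5.0, 80.0, 2.0], "sample_b": [1.0, 0.0, 90.0, 3.0]},
    index=["gene1", "gene2", "gene3", "gene4"],
)

# Drop low-expression genes: rows whose sum falls below the 10th percentile.
row_sums = rnaseq.sum(axis=1)
filtered = rnaseq.loc[row_sums >= np.percentile(row_sums, 10)]

# log2(x + 1) transform, then mask zeroes as NA, remembering where they were.
log2_matrix = np.log2(filtered + 1)
cached_zeroes = {column: log2_matrix.index[log2_matrix[column] == 0]
                 for column in log2_matrix.columns}
log2_matrix[log2_matrix == 0] = np.nan

# Stand-in for IterativeSVD: impute missing values with the column means.
imputed = log2_matrix.fillna(log2_matrix.mean())

# Restore the cached zeroes so imputation never invents expression there.
for column, zero_index in cached_zeroes.items():
    imputed.loc[zero_index.intersection(imputed.index), column] = 0.0

print(imputed)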
Code example #21
File: test_smasher.py Project: Quiltomics/refinebio
    def test_no_smash_all_diff_species(self):
        """ Smashing together with 'ALL' with different species is a really weird behavior. 
        This test isn't really testing a normal case, just make sure that it's marking the
        unsmashable files.
        """

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1237810_T09-1084.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51084"
        experiment.save()

        mus_mus = Organism.get_object_for_name("MUS_MUSCULUS")

        sample = Sample()
        sample.accession_code = 'GSM1238108'
        sample.title = 'GSM1238108'
        # This sample must belong to a different species for the test to
        # exercise the cross-species case.
        sample.organism = mus_mus
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1238108-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']}
        ds.aggregate_by = 'ALL'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)
        print(ds.failure_reason)
        print(final_context['dataset'].failure_reason)

        self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
Code example #22
File: create_compendia.py Project: erflynn/refinebio
def _create_result_objects(job_context: Dict) -> Dict:
    """
    Store and host the result as a ComputationalResult object.
    """
    result_start = log_state("start create result object",
                             job_context["job"].id)
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context[
        "organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"],
                                    sep="\t",
                                    encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result

    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code
                    for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code
                                  for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(
        job_context["dataset"].pk) + "_compendia"
    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"

    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt",
                job_context["output_dir"] + "/LICENSE.TXT")
    archive_path = shutil.make_archive(final_zip_base, "zip",
                                       job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(
        job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism)
        for organism in job_context["all_organisms"]
    ]
    compendium_version = (CompendiumResult.objects.filter(
        primary_organism=primary_organism, quant_sf_only=False).count() + 1)
    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # Create relations to all organisms contained in the compendium.
    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        association = CompendiumResultOrganismAssociation()
        association.compendium_result = compendium_result
        association.organism = compendium_organism
        compendium_result_organism_associations.append(association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!",
                archive_path=archive_path,
                organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(
        compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME,
                                                      key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
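The compendium matrix written by this step is a plain TSV, so it can be sanity-checked by loading it back with pandas. A minimal usage sketch (the path is hypothetical, and the gene-rows/sample-columns orientation is an assumption based on how merged_qn is written above):

import pandas as pd

# Hypothetical path; in practice this is job_context["csv_outfile"].
# Rows are assumed to be genes, columns sample accession codes.
compendium = pd.read_csv("/tmp/processor_job_1/DANIO_RERIO.tsv",
                         sep="\t", index_col=0)
print(compendium.shape)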
Code example #23
File: test_smasher.py Project: Quiltomics/refinebio
    def test_no_smash_dupe_two(self):
        """ Tests the SRP051449 case, where the titles collide. Also uses a real QN target file."""

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "SRP051449"
        experiment.save()

        result = ComputationalResult()
        result.save()

        danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

        sample = Sample()
        sample.accession_code = 'SRR1731761'
        sample.title = 'Danio rerio'
        sample.organism = danio_rerio
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'SRR1731762'
        sample.title = 'Danio rerio'
        sample.organism = danio_rerio
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'NONE'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = "danio_target.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = cr
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = False
        computed_file.save()

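        # Mark this result as the QN target for Danio rerio so that the
        # smasher can find it when quantile normalizing.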
        cra = ComputationalResultAnnotation()
        cra.data = {'organism_id': danio_rerio.id, 'is_qn': True}
        cra.result = cr
        cra.save()

        final_context = smasher.smash(job.pk, upload=False)
        self.assertTrue(final_context['success'])
Code example #24
File: test_salmon.py Project: Quiltomics/refinebio
    def test_salmon_quant_two_samples_single_read(self):
        """Test `salmon quant` outputs on two samples that have single
        read and that belong to same experiment.
        """
        prepare_organism_indices()

        # Create one experiment and two related samples, based on:
        #   https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623
        # (For testing purpose, only two of the four samples' data are included.)
        experiment_accession = 'PRJNA242809'
        experiment = Experiment.objects.create(accession_code=experiment_accession)

        c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

        ## Sample 1
        sample1_accession = 'SRR1206053'
        sample1 = Sample.objects.create(accession_code=sample1_accession,
                                        organism=c_elegans)
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1)

        experiment_dir = "/home/user/data_store/salmon_tests/PRJNA242809"

        og_file_1 = OriginalFile()
        og_file_1.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206053.fastq.gz")
        og_file_1.filename = "SRR1206053.fastq.gz"
        og_file_1.save()

        OriginalFileSampleAssociation.objects.create(original_file=og_file_1, sample=sample1)

        ## Sample 2
        sample2_accession = 'SRR1206054'
        sample2 = Sample.objects.create(accession_code=sample2_accession,
                                        organism=c_elegans)
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2)

        og_file_2 = OriginalFile()
        og_file_2.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206054.fastq.gz")
        og_file_2.filename = "SRR1206054.fastq.gz"
        og_file_2.save()

        OriginalFileSampleAssociation.objects.create(original_file=og_file_2, sample=sample2)

        # Test `salmon quant` on sample1 (SRR1206053)
        sample1_dir = os.path.join(experiment_dir, sample1_accession)

        job1_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                              "job_id": "TEST",
                                              'pipeline': Pipeline(name="Salmon"),
                                              'computed_files': [],
                                              "original_files": [og_file_1]})

        # Check quant.sf in `salmon quant` output dir of sample1
        self.check_salmon_quant(job1_context, sample1_dir)
        # Confirm that this experiment is not ready for tximport yet.
        experiments_ready = salmon._get_tximport_inputs(job1_context)
        self.assertEqual(len(experiments_ready), 0)
        # This job should not have produced any tximport output
        # because the other sample isn't ready yet.
        self.assertFalse(os.path.exists(os.path.join(job1_context["work_dir"], 'txi_out.RDS')))

        # Now run `salmon quant` on sample2 (SRR1206054) too
        sample2_dir = os.path.join(experiment_dir, sample2_accession)
        job2_context = salmon._prepare_files({"job_dir_prefix": "TEST2",
                                              "job_id": "TEST2",
                                              'pipeline': Pipeline(name="Salmon"),
                                              'computed_files': [],
                                              "original_files": [og_file_2]})

        # Clean up tximport output:
        rds_filename = os.path.join(job2_context["work_dir"], 'txi_out.RDS')
        if os.path.isfile(rds_filename):
            os.remove(rds_filename)

        # Check quant.sf in `salmon quant` output dir of sample2
        self.check_salmon_quant(job2_context, sample2_dir)

        # rds_filename should have been generated by tximport at this point.
        # Note: `tximport` step is launched by subprocess module in Python.
        # If input "quant.sf" files are too large, we may have to wait for
        # a few seconds before testing the existence of rds_filename.
        self.assertTrue(os.path.exists(rds_filename))

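        # Locate the RDS file produced by tximport among the computed files.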
        rds_file_path = None
        for computed_file in job2_context['computed_files']:
            if computed_file.filename.endswith('.RDS'):
                rds_file_path = computed_file.absolute_file_path

        cmd_tokens = [
            "/usr/bin/Rscript", "--vanilla",
            "/home/user/data_refinery_workers/processors/test_tximport.R",
            "--txi_out", rds_file_path,
            "--gene2txmap", job2_context["genes_to_transcripts_path"]
        ]

        tximport_test_result = subprocess.run(cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if tximport_test_result.returncode != 0:
            # A non-zero exit code means tximport failed, so fail the test.
            self.fail(tximport_test_result.stderr.decode())

        # Check the individual files
        self.assertEqual(len(job2_context['individual_files']), 2)
        for file in job2_context['individual_files']:
            self.assertTrue(os.path.isfile(file.absolute_file_path))
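As background for the quant.sf checks above: salmon writes quant.sf as a plain TSV with the columns Name, Length, EffectiveLength, TPM, and NumReads, so an output file can be inspected directly with pandas (a usage sketch; the path is hypothetical):

import pandas as pd

# quant.sf columns: Name, Length, EffectiveLength, TPM, NumReads.
quant = pd.read_csv("/path/to/sample_output/quant.sf", sep="\t")
print(quant[["Name", "TPM"]].head())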
Code example #25
File: test_smasher.py Project: Quiltomics/refinebio
    def test_dualtech_smash(self):
        """ """

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

        sample = Sample()
        sample.accession_code = 'GSM1487313'
        sample.title = 'GSM1487313'
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = 'SRS332914'
        sample2.title = 'SRS332914'
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        # CROSS-SMASH BY SPECIES
        ds = Dataset()
        ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        self.assertTrue(ds.is_cross_technology())
        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        os.remove(final_context['output_file'])
        self.assertEqual(len(final_context['final_frame'].columns), 2)

        # THEN BY EXPERIMENT
        ds.aggregate_by = 'EXPERIMENT'
        ds.save()

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        pj.start_time = None
        pj.end_time = None
        pj.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        os.remove(final_context['output_file'])
        self.assertEqual(len(final_context['final_frame'].columns), 1)

        # THEN BY ALL
        ds.aggregate_by = 'ALL'
        ds.save()

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        pj.start_time = None
        pj.end_time = None
        pj.save()
        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        self.assertEqual(len(final_context['final_frame'].columns), 2)
Code example #26
    def test_qn_reference(self):
        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()

        for code in ['1', '2', '3', '4', '5', '6']:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_accession_code = 'A-MEXP-1171'
            sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
            sample.organism = homo_sapiens
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            file = ComputedFile()
            file.filename = code + ".tsv"
            file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            file.size_in_bytes = int(code)
            file.result = cr
            file.is_smashable = True
            file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False  # We don't QN because we're creating the target now.
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context['success'])

        self.assertTrue(os.path.exists(final_context['target_file']))
        self.assertEqual(os.path.getsize(final_context['target_file']), 556)

        target = utils.get_most_recent_qn_target_for_organism(homo_sapiens)
        self.assertEqual(target.sha1, '636d72d5cbf4b9785b0bd271a1430b615feaa7ea')

        ###
        # Smasher with QN
        ###

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context['success'])

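        # Spot-check a single value before and after quantile normalization
        # against the target generated above.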
        self.assertEqual(final_context['merged_qn']['1'][0], -0.4379488528812934)
        self.assertEqual(final_context['original_merged']['1'][0], -0.576210936113982)

        ###
        # Test via management command
        ###

        from django.core.management import call_command
        from django.utils.six import StringIO

        out = StringIO()
        try:
            call_command('create_qn_target', organism='homo_sapiens', min=1, stdout=out)
        except SystemExit:  # this is okay!
            pass

        stdout = out.getvalue()
        self.assertTrue('Target file' in stdout)
        path = stdout.split('\n')[0].split(':')[1].strip()
        self.assertTrue(os.path.exists(path))
        self.assertEqual(path, utils.get_most_recent_qn_target_for_organism(homo_sapiens).absolute_file_path)
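For context on what the QN target does: quantile normalization replaces each sample's values with the values of a shared target distribution at the same ranks. A minimal NumPy sketch of that idea (illustrative only and ignoring ties; not the project's implementation):

import numpy as np

def apply_qn_target(sample_values, target_distribution):
    # Sort the target once, then place its values at the ranks of the
    # sample's own values. Both arrays must have the same length.
    order = np.argsort(sample_values)
    normalized = np.empty(len(sample_values), dtype=float)
    normalized[order] = np.sort(target_distribution)
    return normalized

print(apply_qn_target(np.array([5.0, 1.0, 3.0]),
                      np.array([0.0, 10.0, 20.0])))  # -> [20.  0. 10.]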
Code example #27
def setup_experiment(new_version_accessions: List[str],
                     old_version_accessions: List[str]) -> Experiment:
    """Create an experiment in which some samples were processed with the
    newest version of salmon and others with an older one.
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.9.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.9.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.9.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

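    # Samples quantified against the old (salmon 0.9.1) index: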
    for accession_code in old_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # associate with OLD organism index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    # Create another OrganismIndex built with a newer version of salmon:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"  # DIFFERENT SALMON VERSION
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

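    # Samples quantified against the newer (salmon 0.13.1) index: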
    for accession_code in new_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # NEWER VERSION
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    return experiment
Code example #28
    def test_make_experiment_result_associations(self):
        """Tests that the correct associations are made.

        The situation we're setting up is basically this:
          * tximport has been run for an experiment.
          * It made associations between the samples in
            the experiment and the ComputationalResult.
          * It didn't make associations between the
            experiment itself and the ComputationalResult.
          * There is a second experiment that hasn't had
            tximport run but shares a sample with the
            other experiment.
          * This second experiment has a sample which has
            not yet had tximport run on it.

        And what we're going to test for is:
          * An association is created between the tximport
            result and the first experiment.
          * An association is NOT created between the
            tximport result and the second experiment.
        """
        # Get an organism to set on samples:
        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS",
                                                    taxonomy_id=9606)

        # Create the tximport processor and result:
        processor = Processor()
        processor.name = "Tximport"
        processor.version = "v9.9.9"
        processor.docker_image = "dr_salmon"
        processor.environment = '{"some": "environment"}'
        processor.save()

        result = ComputationalResult()
        result.commands.append("tximport invocation")
        result.is_ccdl = True
        result.processor = processor
        result.save()

        # Create the first experiment and its samples:
        processed_experiment = Experiment()
        processed_experiment.accession_code = "SRP12345"
        processed_experiment.save()

        processed_sample_one = Sample()
        processed_sample_one.accession_code = "SRX12345"
        processed_sample_one.title = "SRX12345"
        processed_sample_one.organism = homo_sapiens
        processed_sample_one.save()

        sra = SampleResultAssociation()
        sra.sample = processed_sample_one
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = processed_experiment
        esa.sample = processed_sample_one
        esa.save()

        processed_sample_two = Sample()
        processed_sample_two.accession_code = "SRX12346"
        processed_sample_two.title = "SRX12346"
        processed_sample_two.organism = homo_sapiens
        processed_sample_two.save()

        sra = SampleResultAssociation()
        sra.sample = processed_sample_two
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = processed_experiment
        esa.sample = processed_sample_two
        esa.save()

        # Create the second experiment and its additional sample.
        unprocessed_experiment = Experiment()
        unprocessed_experiment.accession_code = "SRP6789"
        unprocessed_experiment.save()

        unprocessed_sample = Sample()
        unprocessed_sample.accession_code = "SRX6789"
        unprocessed_sample.title = "SRX6789"
        unprocessed_sample.organism = homo_sapiens
        unprocessed_sample.save()

        sra = SampleResultAssociation()
        sra.sample = unprocessed_sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = unprocessed_experiment
        esa.sample = unprocessed_sample
        esa.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = unprocessed_experiment
        esa.sample = processed_sample_two
        esa.save()

        # Run the function we're testing:
        make_experiment_result_associations()

        # Test that only one association was created and that it was
        # to the processed experiment:
        eras = ExperimentResultAssociation.objects.all()

        self.assertEqual(len(eras), 1)
        self.assertEqual(eras.first().experiment, processed_experiment)
Code example #29
def _populate_index_object(job_context: Dict) -> Dict:
    """ """

    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])
    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context['pipeline'].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(job_context['organism_name'])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.source_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
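    # "SHORT" vs. "LONG" distinguishes indices built for shorter and longer
    # read lengths.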
    index_object.index_type = "TRANSCRIPTOME_" + job_context['length'].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = (LOCAL_ROOT_DIR + "/TRANSCRIPTOME_INDEX/"
                                            + organism_object.name + "/" + job_context['length'])
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info("Uploading %s %s to s3", job_context['organism_name'], job_context['length'], processor_job=job_context["job_id"])
        timestamp = str(timezone.now().timestamp()).split('.')[0]
        s3_key = organism_object.name + '_' + index_object.index_type + "_" + timestamp + '.tar.gz'
        sync_result = computed_file.sync_to_s3(S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key)
        if sync_result:
            computed_file.delete_local_file()
    else:
        logger.warn("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context['computed_files'] = []

    job_context['result'] = result
    job_context['computed_file'] = computed_file
    job_context['index'] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                 index_type="TRANSCRIPTOME_SHORT",
                                                 source_version=job_context["assembly_version"])
    long_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                index_type="TRANSCRIPTOME_LONG",
                                                source_version=job_context["assembly_version"])
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context
Code example #30
    def handle(self, *args, **options):
        """
        """

        if not options["job_id"]:
            if options["organism"] is None and not options["all"]:
                logger.error("You must specify an organism or --all")
                sys.exit(1)

            if options["organism"] and (options.get("organism", "") != "ALL"):
                organisms = [
                    Organism.get_object_for_name(options["organism"].upper())
                ]
            else:
                organisms = Organism.objects.all()

            for organism in organisms:
                if not organism_can_have_qn_target(organism):
                    logger.error(
                        "Organism does not have any platform with enough samples to generate a qn target",
                        organism=organism,
                        min=options["min"],
                    )
                    continue

                samples = organism.sample_set.filter(has_raw=True,
                                                     technology="MICROARRAY",
                                                     is_processed=True)
                if samples.count() == 0:
                    logger.error(
                        "No processed samples for organism.",
                        organism=organism,
                        count=samples.count(),
                    )
                    continue

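                # Unless a platform was given, pick the platform with the
                # most processed samples for this organism.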
                if options["platform"] is None:
                    platform_counts = (
                        samples.values("platform_accession_code").annotate(
                            dcount=Count("platform_accession_code")).order_by(
                                "-dcount"))
                    biggest_platform = platform_counts[0][
                        "platform_accession_code"]
                else:
                    biggest_platform = options["platform"]

                sample_codes_results = Sample.processed_objects.filter(
                    platform_accession_code=biggest_platform,
                    has_raw=True,
                    technology="MICROARRAY",
                    organism=organism,
                    is_processed=True,
                ).values("accession_code")
                sample_codes = [
                    res["accession_code"] for res in sample_codes_results
                ]

                dataset = Dataset()
                dataset.data = {
                    organism.name + "_(" + biggest_platform + ")": sample_codes
                }
                dataset.aggregate_by = "ALL"
                dataset.scale_by = "NONE"
                dataset.quantile_normalize = False
                dataset.save()

                job = ProcessorJob()
                job.pipeline_applied = "QN_REFERENCE"
                job.save()

                pjda = ProcessorJobDatasetAssociation()
                pjda.processor_job = job
                pjda.dataset = dataset
                pjda.save()

                final_context = qn_reference.create_qn_reference(job.pk)

                if final_context["success"]:
                    print(":D")
                    self.stdout.write("Target file: " +
                                      final_context["target_file"])
                    self.stdout.write(
                        "Target S3: " +
                        str(final_context["computed_files"][0].get_s3_url()))
                else:
                    print(":(")
        else:
            qn_reference.create_qn_reference(options["job_id"])