def prepare_organism_indices():
    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # This is a lie, but this image doesn't have the dependencies for TX_IMPORT
    computational_result_short = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = c_elegans
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT/celgans_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.save()

    # This is a lie, but this image doesn't have the dependencies for TX_IMPORT
    computational_result_long = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_long.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_LONG"
    organism_index.organism = c_elegans
    organism_index.result = computational_result_long
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/LONG"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/LONG/celgans_long.tar.gz"
    comp_file.result = computational_result_long
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.save()
def handle(self, *args, **options):
    """For every (or a supplied) organism, fetch all of the experiments and
    compile a large but normally formatted Dataset. Send all of them to the
    Smasher. Smash them. Retrieve manually as desired.
    """
    dataset_ids = []

    if options["organism"] is None:
        all_organisms = Organism.objects.all()
    else:
        all_organisms = [Organism.get_object_for_name(options["organism"].upper())]

    for organism in all_organisms:
        data = {}
        experiments = Experiment.objects.filter(
            id__in=(ExperimentOrganismAssociation.objects
                    .filter(organism=organism)).values('experiment'))
        for experiment in experiments:
            data[experiment.accession_code] = list(
                experiment.samples.values_list('accession_code', flat=True))

        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        dset = Dataset()
        dset.data = data
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

    sys.exit(0)
def test_double_reads(self):
    """Test outputs when the sample has both left and right reads."""
    job_context = {
        'job_id': 123,
        'job': ProcessorJob(),
        'pipeline': Pipeline(name="Salmon"),
        'input_file_path': self.test_dir + 'double_input/reads_1.fastq',
        'input_file_path_2': self.test_dir + 'double_input/reads_2.fastq',
        'salmontools_directory': self.test_dir + 'double_salmontools/',
        'salmontools_archive': self.test_dir + 'salmontools-result.tar.gz',
        'output_directory': self.test_dir + 'double_output/',
        'computed_files': []
    }
    os.makedirs(job_context["salmontools_directory"], exist_ok=True)

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
    sample = Sample()
    sample.organism = homo_sapiens
    sample.save()
    job_context["sample"] = sample

    salmon._run_salmontools(job_context)

    # Confirm job status
    self.assertTrue(job_context["success"])

    # Unpack result for checking
    os.system('gunzip ' + job_context['salmontools_directory'] + "*.gz")

    # Check two output files
    output_file1 = job_context['salmontools_directory'] + 'unmapped_by_salmon_1.fa'
    expected_output_file1 = self.test_dir + 'expected_double_output/unmapped_by_salmon_1.fa'
    self.assertTrue(identical_checksum(output_file1, expected_output_file1))

    output_file2 = job_context['salmontools_directory'] + 'unmapped_by_salmon_2.fa'
    expected_output_file2 = self.test_dir + 'expected_double_output/unmapped_by_salmon_2.fa'
    self.assertTrue(identical_checksum(output_file2, expected_output_file2))
def prepare_job(length):
    pj = ProcessorJob()
    pj.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper()
    pj.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001)

    samp = Sample()
    samp.organism = homo_sapiens
    samp.accession_code = "derp" + length
    samp.save()

    [og_file, og_file2] = prepare_original_files(length)

    og_file_samp_assoc = OriginalFileSampleAssociation()
    og_file_samp_assoc.original_file = og_file
    og_file_samp_assoc.sample = samp
    og_file_samp_assoc.save()

    og_file_samp_assoc2 = OriginalFileSampleAssociation()
    og_file_samp_assoc2.original_file = og_file2
    og_file_samp_assoc2.sample = samp
    og_file_samp_assoc2.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    assoc2 = ProcessorJobOriginalFileAssociation()
    assoc2.original_file = og_file2
    assoc2.processor_job = pj
    assoc2.save()

    return pj
def _perform_imputation(job_context: Dict) -> Dict:
    """Take the inputs and perform the primary imputation.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283:
     - Combine all microarray samples with a full join to form a
       microarray_expression_matrix (this may end up being a DataFrame)
     - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join
       to form a rnaseq_expression_matrix
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of
       the rnaseq_expression_matrix (rnaseq_row_sums)
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th
       percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep
       track of where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and
       log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make
       these zero again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on
       the transposed_matrix; imputed_matrix
     - Untranspose imputed_matrix (genes are now rows, samples are now columns)
     - Quantile normalize imputed_matrix where genes are rows and samples are columns
    """
    job_context['time_start'] = timezone.now()

    # Combine all microarray samples with a full join to form a
    # microarray_expression_matrix (this may end up being a DataFrame).
    microarray_expression_matrix = job_context['microarray_inputs']

    # Combine all RNA-seq samples (lengthScaledTPM) with a full outer join
    # to form a rnaseq_expression_matrix.
    rnaseq_expression_matrix = job_context['rnaseq_inputs']

    # Calculate the sum of the lengthScaledTPM values for each row (gene)
    # of the rnaseq_expression_matrix (rnaseq_row_sums).
    rnaseq_row_sums = np.sum(rnaseq_expression_matrix, axis=1)

    # Calculate the 10th percentile of rnaseq_row_sums.
    rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10)

    # Drop all rows in rnaseq_expression_matrix with a row sum < 10th
    # percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix.
    # TODO: There is probably a better way to do this with `np.where`.
    rows_to_filter = []
    for (x, sum_val) in rnaseq_row_sums.items():
        if sum_val < rnaseq_tenth_percentile:
            rows_to_filter.append(x)

    filtered_rnaseq_matrix = rnaseq_expression_matrix.drop(rows_to_filter)

    # log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix.
    filtered_rnaseq_matrix_plus_one = filtered_rnaseq_matrix + 1
    log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix_plus_one)

    # Cache our RNA-Seq zero values.
    cached_zeroes = {}
    for column in log2_rnaseq_matrix.columns:
        cached_zeroes[column] = np.where(log2_rnaseq_matrix[column] == 0)

    # Set all zero values in log2_rnaseq_matrix to NA, but make sure to
    # keep track of where these zeroes are.
    log2_rnaseq_matrix[log2_rnaseq_matrix == 0] = np.nan

    # Perform a full outer join of microarray_expression_matrix and
    # log2_rnaseq_matrix; combined_matrix.
    combined_matrix = microarray_expression_matrix.merge(
        log2_rnaseq_matrix, how='outer', left_index=True, right_index=True)

    # Remove genes (rows) with <= 70% present values in combined_matrix.
    thresh = combined_matrix.shape[1] * .7  # (Rows, Columns)
    # Everything below `thresh` non-NA values is dropped.
    row_filtered_combined_matrix = combined_matrix.dropna(axis='index', thresh=thresh)

    # Remove samples (columns) with < 50% present values in combined_matrix.
    # XXX: Find better test data for this!
    col_thresh = row_filtered_combined_matrix.shape[0] * .5
    row_col_filtered_combined_matrix_samples = row_filtered_combined_matrix.dropna(
        axis='columns', thresh=col_thresh)

    # "Reset" zero values that were set to NA in RNA-seq samples
    # (i.e., make these zero again) in combined_matrix.
    for column in cached_zeroes.keys():
        zeroes = cached_zeroes[column]

        # Skip purged columns.
        if column not in row_col_filtered_combined_matrix_samples:
            continue

        # Place the zeroes back.
        try:
            np.put(row_col_filtered_combined_matrix_samples[column], zeroes, 0.0)
        except Exception:
            logger.exception("Error when replacing zero")
            continue

    # Label our new replaced data.
    combined_matrix_zero = row_col_filtered_combined_matrix_samples

    # Transpose combined_matrix; transposed_matrix.
    transposed_matrix = combined_matrix_zero.transpose()

    # Remove -inf and inf.
    # This should never happen, but make sure it doesn't!
    transposed_matrix = transposed_matrix.replace([np.inf, -np.inf], np.nan)

    # Perform imputation of missing values with IterativeSVD (rank=10) on
    # the transposed_matrix; imputed_matrix.
    imputed_matrix = IterativeSVD(rank=10).fit_transform(transposed_matrix)

    # Untranspose imputed_matrix (genes are now rows, samples are now columns).
    untransposed_imputed_matrix = imputed_matrix.transpose()

    # Convert back to Pandas.
    untransposed_imputed_matrix_df = pd.DataFrame.from_records(untransposed_imputed_matrix)
    untransposed_imputed_matrix_df.index = row_col_filtered_combined_matrix_samples.index
    untransposed_imputed_matrix_df.columns = row_col_filtered_combined_matrix_samples.columns

    # Quantile normalize imputed_matrix where genes are rows and samples are columns.
    # XXX: Refactor QN target acquisition and application before doing this.
    job_context['organism'] = Organism.get_object_for_name(
        list(job_context['input_files'].keys())[0])
    job_context['merged_no_qn'] = untransposed_imputed_matrix_df

    # Perform the Quantile Normalization.
    job_context = smasher._quantile_normalize(job_context, ks_check=False)

    job_context['time_end'] = timezone.now()
    job_context['formatted_command'] = "create_compendia.py"

    return job_context
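# To see the filtering and imputation steps above in isolation, here is a
# minimal, self-contained sketch on a toy matrix. It assumes numpy, pandas,
# and fancyimpute's IterativeSVD (the same solver used above); the toy data,
# the injected missingness, and rank=2 are illustrative only.
import numpy as np
import pandas as pd
from fancyimpute import IterativeSVD

rng = np.random.RandomState(0)

# Toy expression matrix: 100 genes (rows) x 6 samples (columns).
matrix = pd.DataFrame(
    rng.lognormal(size=(100, 6)),
    index=["gene_%d" % i for i in range(100)],
    columns=["sample_%d" % j for j in range(6)],
)

# Drop low-expression genes: row sums below the 10th percentile.
row_sums = matrix.sum(axis=1)
matrix = matrix[row_sums >= np.percentile(row_sums, 10)]

# log2(x + 1) transform, then mark ~10% of entries missing to stand in for
# the zero-masking and outer-join gaps of the real pipeline.
log2_matrix = np.log2(matrix + 1)
log2_matrix = log2_matrix.mask(rng.rand(*log2_matrix.shape) < 0.1)

# Drop genes with >30% missing, then samples with >50% missing.
log2_matrix = log2_matrix.dropna(axis='index', thresh=int(log2_matrix.shape[1] * 0.7))
log2_matrix = log2_matrix.dropna(axis='columns', thresh=int(log2_matrix.shape[0] * 0.5))

# Impute on the transposed (samples x genes) matrix, then transpose back.
# rank=2 only because the toy matrix has 6 samples; the pipeline uses rank=10.
imputed = IterativeSVD(rank=2).fit_transform(log2_matrix.transpose().to_numpy())
imputed_df = pd.DataFrame(imputed.transpose(),
                          index=log2_matrix.index, columns=log2_matrix.columns)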
def prepare_computed_files():
    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.num_processed_samples = 1
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    sample = Sample()
    sample.accession_code = "GSM1487313"
    sample.title = "GSM1487313"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.is_processed = True
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.s3_key = "GSM1487313_liver.PCL"
    computed_file.s3_bucket = TEST_DATA_BUCKET
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRP332914"
    experiment2.num_processed_samples = 1
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = "SRR332914"
    sample2.title = "SRR332914"
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.is_processed = True
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.s3_key = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.s3_bucket = TEST_DATA_BUCKET
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()
def _generate_experiment_and_samples(
        self, run_accession: str,
        study_accession: str = None) -> (Experiment, List[Sample]):
    """Generates Experiments and Samples for the provided run_accession."""
    metadata = SraSurveyor.gather_all_metadata(run_accession)

    if metadata == {}:
        if study_accession:
            logger.error(
                "Could not discover any metadata for run.",
                accession=run_accession,
                study_accession=study_accession,
            )
        else:
            logger.error("Could not discover any metadata for run.",
                         accession=run_accession)
        return (None, None)  # This will cascade properly

    if DOWNLOAD_SOURCE == "ENA":
        if metadata["library_layout"] == "PAIRED":
            files_urls = [
                _build_ena_file_url(run_accession, "_1"),
                _build_ena_file_url(run_accession, "_2"),
            ]
        else:
            files_urls = [_build_ena_file_url(run_accession)]
    else:
        files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

    # Figure out the Organism for this sample
    organism_name = metadata.pop("organism_name", None)
    if not organism_name:
        logger.error("Could not discover organism type for run.",
                     accession=run_accession)
        return (None, None)  # This will cascade properly

    organism_name = organism_name.upper()
    organism = Organism.get_object_for_name(organism_name)

    ##
    # Experiment
    ##

    experiment_accession_code = metadata.get("study_accession")
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment already exists, skipping object creation.",
            experiment_accession_code=experiment_accession_code,
            survey_job=self.survey_job.id,
        )
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        SraSurveyor._apply_metadata_to_experiment(experiment_object, metadata)
        experiment_object.save()

        ##
        # Experiment Metadata
        ##
        json_xa = ExperimentAnnotation()
        json_xa.experiment = experiment_object
        json_xa.data = metadata
        json_xa.is_ccdl = False
        json_xa.save()

    ##
    # Samples
    ##

    sample_accession_code = metadata.pop("run_accession")

    # Create the sample object
    try:
        sample_object = Sample.objects.get(accession_code=sample_accession_code)
        # If current experiment includes new protocol information,
        # merge it into the sample's existing protocol_info.
        protocol_info, is_updated = self.update_sample_protocol_info(
            sample_object.protocol_info,
            experiment_object.protocol_description,
            experiment_object.source_url,
        )
        if is_updated:
            sample_object.protocol_info = protocol_info
            sample_object.save()
        logger.debug(
            "Sample %s already exists, skipping object creation.",
            sample_accession_code,
            experiment_accession_code=experiment_object.accession_code,
            survey_job=self.survey_job.id,
        )
    except Sample.DoesNotExist:
        sample_object = Sample()
        sample_object.source_database = "SRA"
        sample_object.accession_code = sample_accession_code
        sample_object.organism = organism

        sample_object.platform_name = metadata.get("platform_instrument_model", "UNKNOWN")
        # The platform_name is human readable and contains spaces,
        # accession codes shouldn't have spaces though:
        sample_object.platform_accession_code = sample_object.platform_name.replace(" ", "")

        sample_object.technology = "RNA-SEQ"
        if ("ILLUMINA" in sample_object.platform_name.upper()
                or "NEXTSEQ" in sample_object.platform_name.upper()):
            sample_object.manufacturer = "ILLUMINA"
        elif "ION TORRENT" in sample_object.platform_name.upper():
            sample_object.manufacturer = "ION_TORRENT"
        else:
            sample_object.manufacturer = "UNKNOWN"

        SraSurveyor._apply_harmonized_metadata_to_sample(sample_object, metadata)

        protocol_info, is_updated = self.update_sample_protocol_info(
            existing_protocols=[],
            experiment_protocol=experiment_object.protocol_description,
            experiment_url=experiment_object.source_url,
        )
        # Do not check is_updated the first time because we must
        # save a list so we can append to it later.
        sample_object.protocol_info = protocol_info

        sample_object.save()

        for file_url in files_urls:
            original_file = OriginalFile.objects.get_or_create(
                source_url=file_url,
                source_filename=file_url.split("/")[-1],
                has_raw=True)[0]
            OriginalFileSampleAssociation.objects.get_or_create(
                original_file=original_file, sample=sample_object)

    # Create associations if they don't already exist
    ExperimentSampleAssociation.objects.get_or_create(
        experiment=experiment_object, sample=sample_object)
    ExperimentOrganismAssociation.objects.get_or_create(
        experiment=experiment_object, organism=organism)

    return experiment_object, [sample_object]
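# The ENA branch above builds per-read FASTQ URLs via _build_ena_file_url.
# For reference, here is a sketch of ENA's documented FASTQ FTP layout. This
# is an assumption about the general ENA convention, not a copy of
# refinebio's actual helper.
def build_ena_fastq_url_sketch(run_accession: str, read_suffix: str = "") -> str:
    """Sketch: vol1/fastq/<first 6 chars>[/<zero-padded tail>]/<run>/<run><suffix>.fastq.gz"""
    prefix = run_accession[:6]
    if len(run_accession) > 9:
        # Long accessions get an extra subdirectory: the digits after the
        # ninth character, zero-padded to three (e.g. "007").
        subdir = run_accession[9:].zfill(3) + "/"
    else:
        subdir = ""
    return ("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/%s/%s%s/%s%s.fastq.gz"
            % (prefix, subdir, run_accession, run_accession, read_suffix))

# e.g. build_ena_fastq_url_sketch("SRR1234567", "_1")
#   -> ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR123/007/SRR1234567/SRR1234567_1.fastq.gz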
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects
    """
    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)

    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment %s already exists, skipping object creation.",
            experiment_accession_code,
            survey_job=self.survey_job.id,
        )
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        GeoSurveyor._apply_metadata_to_experiment(experiment_object, gse)
        experiment_object.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Other times, there is a single file with references to every sample in it.
    created_samples = []
    for sample_accession_code, sample in gse.gsms.items():

        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id,
            )

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)
            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=sample_object.organism)
        except Sample.DoesNotExist:
            organism = Organism.get_object_for_name(
                sample.metadata["organism_ch1"][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get("data_processing", None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata["title"][0]

            self.set_platform_properties(sample_object, sample.metadata, gse)
            GeoSurveyor._apply_harmonized_metadata_to_sample(
                sample_object, harmonized_samples[sample_object.title])

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: " + str(sample_object))

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get("supplementary_file", [])
            for supplementary_file_url in sample_supplements:

                # Why do they give us this?
                if supplementary_file_url == "NONE":
                    break

                # We never want these!
                if "idat.gz" in supplementary_file_url.lower():
                    continue
                if "chp.gz" in supplementary_file_url.lower():
                    continue
                if "ndf.gz" in supplementary_file_url.lower():
                    continue
                if "pos.gz" in supplementary_file_url.lower():
                    continue
                if "pair.gz" in supplementary_file_url.lower():
                    continue
                if "gff.gz" in supplementary_file_url.lower():
                    continue

                # Sometimes, we are lied to about the data processing step.
                lower_file_url = supplementary_file_url.lower()
                if (".cel" in lower_file_url
                        or ("_non_normalized.txt" in lower_file_url)
                        or ("_non-normalized.txt" in lower_file_url)
                        or ("-non-normalized.txt" in lower_file_url)
                        or ("-non_normalized.txt" in lower_file_url)):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = FileUtils.get_filename(supplementary_file_url)
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=FileUtils.is_archive(filename),
                )[0]
                logger.debug("Created OriginalFile: " + str(original_file))

                original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = "MICROARRAY"
                    sample_object.manufacturer = "AFFYMETRIX"
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != "RNA-SEQ":
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology; this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    for experiment_supplement_url in gse.metadata.get("supplementary_file", []):

        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split("/")[-1]
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            has_raw=sample_object.has_raw,
            is_archive=True,
        )[0]
        logger.debug("Created OriginalFile: " + str(original_file))

        lower_supplement_url = experiment_supplement_url.lower()
        if (("_non_normalized.txt" in lower_supplement_url)
                or ("_non-normalized.txt" in lower_supplement_url)
                or ("-non-normalized.txt" in lower_supplement_url)
                or ("-non_normalized.txt" in lower_supplement_url)):
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if (OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0):
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families".
    family_url = self.get_miniml_url(experiment_accession_code)
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split("/")[-1],
        has_raw=sample_object.has_raw,
        is_archive=True,
    )[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if (OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0):
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal.
        pass

    return experiment_object, created_samples
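# For context on the GEOparse calls used above, here is a minimal sketch of
# fetching "brief" family metadata and walking the samples. The accession and
# destdir are illustrative; the calls (get_GEO with how="brief", gse.metadata,
# gse.gsms) are the same ones the surveyor relies on.
import GEOparse

gse = GEOparse.get_GEO("GSE44421", destdir="/tmp", how="brief", silent=True)

# Series-level metadata is a dict mapping SOFT fields to lists of strings.
print(gse.metadata.get("title", [""])[0])

# gse.gsms maps sample accessions (GSM*) to objects carrying their own metadata.
for accession, gsm in gse.gsms.items():
    organism = gsm.metadata["organism_ch1"][0]
    supplements = gsm.metadata.get("supplementary_file", [])
    print(accession, organism, len(supplements))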
def test_log2(self):
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    # Has non-log2 data:
    # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421
    # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz
    experiment = Experiment()
    experiment.accession_code = "GSE44421"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1084806'
    sample.title = 'GSM1084806'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1084806-tbl-1.txt"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1084807'
    sample.title = 'GSM1084807'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1084807-tbl-1.txt"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']}
    ds.aggregate_by = 'EXPERIMENT'
    ds.scale_by = 'MINMAX'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)
    self.assertTrue(final_context['success'])
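# The test above feeds GSE44421's non-log2 data through the smasher, which is
# responsible for log2-transforming it. The exact check lives in the smasher
# module; a common heuristic (an assumption for illustration, not necessarily
# refinebio's rule) is to look at the magnitude of the values:
import numpy as np

def probably_needs_log2(values) -> bool:
    """Raw microarray intensities typically reach into the thousands, while
    log2-scale data rarely exceeds ~30. Illustrative threshold only."""
    return float(np.nanmax(values)) > 100.0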
def test_no_smash_dupe(self):
    """Smashing the same computed file for two samples should not produce
    duplicated (_x suffixed) columns in the merged frame."""
    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237811'
    sample.title = 'GSM1237811'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']}
    ds.aggregate_by = 'ALL'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)

    dsid = ds.id
    ds = Dataset.objects.get(id=dsid)
    self.assertTrue(ds.success)
    for column in final_context['original_merged'].columns:
        self.assertTrue('_x' not in column)
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.DAT"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'EXPERIMENT'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'STANDARD'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    # ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    return pj
def test_create_compendia(self):
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    sample = Sample()
    sample.accession_code = "GSM1487313"
    sample.title = "GSM1487313"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # Missing sample that will be filtered
    sample = Sample()
    sample.accession_code = "GSM1487222"
    sample.title = "this sample will be filtered"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487222_empty.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/doesnt_exists.PCL"
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRS332914"
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = "SRS332914"
    sample2.title = "SRS332914"
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()

    dset = Dataset()
    dset.data = {
        "GSE1487313": ["GSM1487313", "GSM1487222"],
        "SRX332914": ["SRS332914"]
    }
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    self.assertFalse(job.success)

    # Check that the sample with no computed file was skipped.
    self.assertTrue("GSM1487222" in final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]["experiment_accession_code"],
        "GSE1487313",
    )
def _generate_experiment_and_samples(
        self, run_accession: str,
        study_accession: str = None) -> (Experiment, List[Sample]):
    """Generates Experiments and Samples for the provided run_accession."""
    metadata = SraSurveyor.gather_all_metadata(run_accession)

    if metadata == {}:
        if study_accession:
            logger.error("Could not discover any metadata for run.",
                         accession=run_accession,
                         study_accession=study_accession)
        else:
            logger.error("Could not discover any metadata for run.",
                         accession=run_accession)
        return (None, None)  # This will cascade properly

    if DOWNLOAD_SOURCE == "ENA":
        if metadata["library_layout"] == "PAIRED":
            files_urls = [
                SraSurveyor._build_ena_file_url(run_accession, "_1"),
                SraSurveyor._build_ena_file_url(run_accession, "_2")
            ]
        else:
            files_urls = [SraSurveyor._build_ena_file_url(run_accession)]
    else:
        files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

    # Figure out the Organism for this sample
    organism_name = metadata.pop("organism_name", None)
    if not organism_name:
        logger.error("Could not discover organism type for run.",
                     accession=run_accession)
        return (None, None)  # This will cascade properly

    organism_name = organism_name.upper()
    organism = Organism.get_object_for_name(organism_name)

    ##
    # Experiment
    ##

    experiment_accession_code = metadata.get('study_accession')
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug("Experiment already exists, skipping object creation.",
                     experiment_accession_code=experiment_accession_code,
                     survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = ENA_URL_TEMPLATE.format(experiment_accession_code)
        experiment_object.source_database = "SRA"
        experiment_object.technology = "RNA-SEQ"

        # We don't get this value from the API, unfortunately.
        # experiment_object.platform_accession_code = experiment["platform_accession_code"]

        if not experiment_object.description:
            experiment_object.description = "No description."

        if "study_title" in metadata:
            experiment_object.title = metadata["study_title"]
        if "study_abstract" in metadata:
            experiment_object.description = metadata["study_abstract"]
        if "lab_name" in metadata:
            experiment_object.submitter_institution = metadata["lab_name"]
        if "experiment_design_description" in metadata:
            experiment_object.protocol_description = metadata["experiment_design_description"]
        if "pubmed_id" in metadata:
            experiment_object.pubmed_id = metadata["pubmed_id"]
            experiment_object.has_publication = True
        if "study_ena_first_public" in metadata:
            experiment_object.source_first_published = parse_datetime(
                metadata["study_ena_first_public"])
        if "study_ena_last_update" in metadata:
            experiment_object.source_last_modified = parse_datetime(
                metadata["study_ena_last_update"])

        # Rare, but it happens.
        if not experiment_object.protocol_description:
            experiment_object.protocol_description = metadata.get(
                "library_construction_protocol", "Protocol was never provided.")

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

        ##
        # Experiment Metadata
        ##
        json_xa = ExperimentAnnotation()
        json_xa.experiment = experiment_object
        json_xa.data = metadata
        json_xa.is_ccdl = False
        json_xa.save()

    ##
    # Samples
    ##

    sample_accession_code = metadata.pop('run_accession')

    # Create the sample object
    try:
        sample_object = Sample.objects.get(accession_code=sample_accession_code)
        # If current experiment includes new protocol information,
        # merge it into the sample's existing protocol_info.
        protocol_info, is_updated = self.update_sample_protocol_info(
            sample_object.protocol_info,
            experiment_object.protocol_description,
            experiment_object.source_url)
        if is_updated:
            sample_object.protocol_info = protocol_info
            sample_object.save()
        logger.debug("Sample %s already exists, skipping object creation.",
                     sample_accession_code,
                     experiment_accession_code=experiment_object.accession_code,
                     survey_job=self.survey_job.id)
    except Sample.DoesNotExist:
        sample_object = Sample()
        sample_object.source_database = "SRA"
        sample_object.accession_code = sample_accession_code
        sample_object.organism = organism

        sample_object.platform_name = metadata.get("platform_instrument_model", "UNKNOWN")
        # The platform_name is human readable and contains spaces,
        # accession codes shouldn't have spaces though:
        sample_object.platform_accession_code = sample_object.platform_name.replace(" ", "")

        sample_object.technology = "RNA-SEQ"
        if "ILLUMINA" in sample_object.platform_name.upper() \
                or "NEXTSEQ" in sample_object.platform_name.upper():
            sample_object.manufacturer = "ILLUMINA"
        elif "ION TORRENT" in sample_object.platform_name.upper():
            sample_object.manufacturer = "ION_TORRENT"
        else:
            sample_object.manufacturer = "UNKNOWN"

        # Directly apply the harmonized values
        sample_object.title = harmony.extract_title(metadata)
        harmonized_sample = harmony.harmonize([metadata])
        for key, value in harmonized_sample.items():
            setattr(sample_object, key, value)

        protocol_info, is_updated = self.update_sample_protocol_info(
            existing_protocols=[],
            experiment_protocol=experiment_object.protocol_description,
            experiment_url=experiment_object.source_url)
        # Do not check is_updated the first time because we must
        # save a list so we can append to it later.
        sample_object.protocol_info = protocol_info

        sample_object.save()

        for file_url in files_urls:
            original_file = OriginalFile.objects.get_or_create(
                source_url=file_url,
                source_filename=file_url.split('/')[-1],
                has_raw=True)[0]
            original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                original_file=original_file, sample=sample_object)

    # Create associations if they don't already exist
    ExperimentSampleAssociation.objects.get_or_create(
        experiment=experiment_object, sample=sample_object)
    ExperimentOrganismAssociation.objects.get_or_create(
        experiment=experiment_object, organism=organism)

    return experiment_object, [sample_object]
def setup_experiments() -> None:
    """Creates three experiments for testing purposes.

    One experiment will not have a GSE* accession code. Both experiments
    with GSE* accession codes will have two samples. One has a sample with
    incorrect platform information, so that experiment will need to be
    re-surveyed.
    """
    organism = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)

    # Experiment that needs to be re-surveyed
    experiment = Experiment.objects.create(accession_code="GSE12417",
                                           technology="MICROARRAY",
                                           source_database="GEO")

    # Correct platform
    sample = Sample.objects.create(
        accession_code="GSM311750",
        source_database="GEO",
        technology="MICROARRAY",
        platform_accession_code="hgu133a",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

    # Incorrect platform
    sample = Sample.objects.create(
        accession_code="GSM316652",
        organism=organism,
        source_database="GEO",
        technology="MICROARRAY",
        platform_accession_code="hgu133a",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

    # Experiment that does not need to be re-surveyed
    experiment = Experiment.objects.create(accession_code="GSE9890",
                                           technology="MICROARRAY",
                                           source_database="GEO")

    sample = Sample.objects.create(
        accession_code="GSM249671",
        source_database="GEO",
        technology="MICROARRAY",
        platform_accession_code="hgu133plus2",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

    sample = Sample.objects.create(
        accession_code="GSM249672",
        organism=organism,
        source_database="GEO",
        technology="MICROARRAY",
        platform_accession_code="hgu133plus2",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

    # Experiment that isn't even the right source_database.
    experiment = Experiment.objects.create(accession_code="SRP12345",
                                           technology="RNA-SEQ",
                                           source_database="SRA")

    sample = Sample.objects.create(
        accession_code="SRR123145",
        organism=organism,
        source_database="SRA",
        technology="RNA-SEQ",
        platform_accession_code="IlluminaHiSeq1000",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

    # This second sample is just to make checking things easy because
    # all experiments start with 2 samples in these tests.
    sample = Sample.objects.create(
        accession_code="SRR123146",
        organism=organism,
        source_database="SRA",
        technology="RNA-SEQ",
        platform_accession_code="IlluminaHiSeq1000",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects
    """
    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)

    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug("Experiment %s already exists, skipping object creation.",
                     experiment_accession_code,
                     survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = (
            "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
            experiment_accession_code)
        experiment_object.source_database = "GEO"
        experiment_object.title = gse.metadata.get('title', [''])[0]
        experiment_object.description = gse.metadata.get('summary', [''])[0]

        # Source doesn't provide time information, assume midnight.
        submission_date = gse.metadata["submission_date"][0] + " 00:00:00 UTC"
        experiment_object.source_first_published = dateutil.parser.parse(submission_date)
        last_updated_date = gse.metadata["last_update_date"][0] + " 00:00:00 UTC"
        experiment_object.source_last_updated = dateutil.parser.parse(last_updated_date)

        unique_institutions = list(set(gse.metadata["contact_institute"]))
        experiment_object.submitter_institution = ", ".join(unique_institutions)
        experiment_object.pubmed_id = gse.metadata.get("pubmed_id", [""])[0]

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Other times, there is a single file with references to every sample in it.
    created_samples = []
    for sample_accession_code, sample in gse.gsms.items():

        try:
            sample_object = Sample.objects.get(accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id)

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)
            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=sample_object.organism)
        except Sample.DoesNotExist:
            organism = Organism.get_object_for_name(
                sample.metadata['organism_ch1'][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get('data_processing', None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata['title'][0]

            self.set_platform_properties(sample_object, sample.metadata, gse)

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[sample_object.title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: " + str(sample_object))

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get('supplementary_file', [])
            for supplementary_file_url in sample_supplements:

                # Why do they give us this?
                if supplementary_file_url == "NONE":
                    break

                # We never want these!
                if "idat.gz" in supplementary_file_url.lower():
                    continue
                if "chp.gz" in supplementary_file_url.lower():
                    continue
                if "ndf.gz" in supplementary_file_url.lower():
                    continue
                if "pos.gz" in supplementary_file_url.lower():
                    continue
                if "pair.gz" in supplementary_file_url.lower():
                    continue
                if "gff.gz" in supplementary_file_url.lower():
                    continue

                # Sometimes, we are lied to about the data processing step.
                lower_file_url = supplementary_file_url.lower()
                if '.cel' in lower_file_url \
                        or ('_non_normalized.txt' in lower_file_url) \
                        or ('_non-normalized.txt' in lower_file_url) \
                        or ('-non-normalized.txt' in lower_file_url) \
                        or ('-non_normalized.txt' in lower_file_url):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = supplementary_file_url.split('/')[-1]
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=True)[0]
                logger.debug("Created OriginalFile: " + str(original_file))

                original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = 'MICROARRAY'
                    sample_object.manufacturer = 'AFFYMETRIX'
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != 'RNA-SEQ':
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology; this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    for experiment_supplement_url in gse.metadata.get('supplementary_file', []):

        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split('/')[-1]
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            has_raw=sample_object.has_raw,
            is_archive=True)[0]
        logger.debug("Created OriginalFile: " + str(original_file))

        lower_supplement_url = experiment_supplement_url.lower()
        if ('_non_normalized.txt' in lower_supplement_url) \
                or ('_non-normalized.txt' in lower_supplement_url) \
                or ('-non-normalized.txt' in lower_supplement_url) \
                or ('-non_normalized.txt' in lower_supplement_url):
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0:
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families".
    family_url = self.get_miniml_url(experiment_accession_code)
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split('/')[-1],
        has_raw=sample_object.has_raw,
        is_archive=True)[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0:
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal.
        pass

    return experiment_object, created_samples
def test_create_compendia(self):
    job = ProcessorJob()
    job.pipeline_applied = "COMPENDIA"
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

    sample = Sample()
    sample.accession_code = 'GSM1487313'
    sample.title = 'GSM1487313'
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRS332914"
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = 'SRS332914'
    sample2.title = 'SRS332914'
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()

    dset = Dataset()
    dset.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
    dset.scale_by = 'NONE'
    dset.aggregate_by = 'SPECIES'
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)
def test_create_compendia_danio(self):
    job = ProcessorJob()
    job.pipeline_applied = "COMPENDIA"
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

    micros = []
    for file in os.listdir('/home/user/data_store/raw/TEST/MICROARRAY/'):

        if 'microarray.txt' in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    rnas = []
    for file in os.listdir('/home/user/data_store/raw/TEST/RNASEQ/'):

        if 'rnaseq.txt' in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "RNASEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        rnas.append(file)

    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = '/home/user/data_store/QN/danio_target.tsv'
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data['organism_id'] = danio_rerio.id
    cra.data['is_qn'] = True
    cra.result = result
    cra.save()

    dset = Dataset()
    dset.data = {'GSE1234': micros, 'GSE5678': rnas}
    dset.scale_by = 'NONE'
    dset.aggregate_by = 'SPECIES'
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    # Verify result
    self.assertEqual(len(final_context['computed_files']), 3)
    for file in final_context['computed_files']:
        self.assertTrue(os.path.exists(file.absolute_file_path))
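# The test above registers danio_target.tsv as a QN target via a
# ComputationalResultAnnotation with is_qn=True. For intuition, here is a
# sketch of quantile-normalizing a matrix against such a target, assuming
# the target is a sorted reference vector with one value per gene and the
# matrix has no missing values. Illustrative only, not the smasher's exact code.
import numpy as np
import pandas as pd

def quantile_normalize_to_target(matrix: pd.DataFrame, target: np.ndarray) -> pd.DataFrame:
    """Replace each column's values with the target distribution, matched by rank."""
    normalized = matrix.copy()
    for column in normalized.columns:
        # rank 1..n -> index 0..n-1 into the sorted target vector
        ranks = normalized[column].rank(method="first").astype(int) - 1
        normalized[column] = target[ranks.to_numpy()]
    return normalized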
def create_samples_from_api(self, experiment: Experiment, platform_dict: Dict) -> List[Sample]:
    """Generates a Sample item for each sample in an AE experiment.

    There are many possible data situations for a sample:
        - If the sample only has raw data available:
            - If it is on a platform that we support:
                Download this raw data and process it
            - If it is not on a platform we support:
                Don't download anything, don't process anything
        - If the sample has both raw and derived data:
            - If the raw data is on a platform we support:
                Download the raw data and process it, abandon the derived data
            - If the raw data is not on a platform we support:
                Download the derived data and no-op it, abandon the raw data
        - If the sample only has derived data:
            Download the derived data and no-op it.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples
    """
    created_samples = []

    samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
    r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
    samples = r.json()["experiment"]["sample"]

    # The SDRF is the complete metadata record on a sample/property basis.
    # We run this through our harmonizer and then attach the properties
    # to our created samples.
    SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
    sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
    sdrf_samples = harmony.parse_sdrf(sdrf_url)
    harmonized_samples = harmony.harmonize(sdrf_samples)

    # An experiment can have many samples
    for sample_data in samples:
        # For some reason, this sample has no files associated with it.
        if "file" not in sample_data or len(sample_data['file']) == 0:
            continue

        # Each sample is given an experimentally-unique title.
        flat_sample = utils.flatten(sample_data)
        title = harmony.extract_title(flat_sample)

        # A sample may actually have many sub files.
        # If there is raw data, take that.
        # If not, take the derived.
        has_raw = False
        for sub_file in sample_data['file']:
            # For ex: E-GEOD-15645
            if isinstance(sub_file['comment'], list):
                sub_file_mod = sub_file
                sub_file_mod['comment'] = sub_file['comment'][0]
            else:
                sub_file_mod = sub_file

            # Some have the 'data' field, but not the actual data
            # Ex: E-GEOD-9656
            if sub_file_mod['type'] == "data" and sub_file_mod['comment'].get('value', None) is not None:
                has_raw = True
            if 'raw' in sub_file_mod['comment'].get('value', ''):
                has_raw = True

        skip_sample = False
        for sub_file in sample_data['file']:
            # Don't get the raw data if it's only a 1-color sample.
            if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data):
                has_raw = False

            # Skip derived data if we have it raw.
            if has_raw and "derived data" in sub_file['type']:
                continue

            download_url = None
            filename = sub_file["name"]

            # sub_file["comment"] is only a list if there's
            # more than one comment...
            comments = sub_file["comment"]
            if isinstance(comments, list):
                # Could be: "Derived ArrayExpress Data Matrix FTP
                # file" or: "ArrayExpress FTP file". If there is
                # no comment with a name including "FTP file" then
                # we don't know where to download it, so we need to
                # mark this sample as skipped below.
                for comment in comments:
                    if "FTP file" in comment["name"]:
                        download_url = comment["value"]
                        break
            else:
                download_url = comments["value"]

            if not download_url:
                logger.error(
                    "Sample file %s did not specify a download url, skipping.",
                    filename,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

            if not filename:
                logger.error(
                    "A sample file did not specify a filename, skipping.",
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

        if skip_sample:
            continue

        # The accession code is not a simple matter to determine.
        sample_source_name = sample_data["source"].get("name", "")
        sample_assay_name = sample_data["assay"].get("name", "")
        sample_accession_code = self.determine_sample_accession(
            experiment.accession_code,
            sample_source_name,
            sample_assay_name,
            filename)

        # Figure out the Organism for this sample
        organism_name = UNKNOWN
        for characteristic in sample_data["characteristic"]:
            if characteristic["category"].upper() == "ORGANISM":
                organism_name = characteristic["value"].upper()

        if organism_name == UNKNOWN:
            logger.error(
                "Sample %s did not specify the organism name.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            organism = None
            continue
        else:
            organism = Organism.get_object_for_name(organism_name)

        # Create the sample object
        try:
            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            sample_object = Sample.objects.get(accession_code=sample_accession_code)

            # If input experiment includes new protocol information,
            # update sample's protocol_info.
            existing_protocols = sample_object.protocol_info
            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols,
                experiment.protocol_description,
                experiment.source_url + '/protocols')
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()

            # The basics
            sample_object.source_database = "ARRAY_EXPRESS"
            sample_object.title = title
            sample_object.accession_code = sample_accession_code
            sample_object.source_archive_url = samples_endpoint
            sample_object.organism = organism
            sample_object.platform_name = platform_dict["platform_accession_name"]
            sample_object.platform_accession_code = platform_dict["platform_accession_code"]
            sample_object.manufacturer = platform_dict["manufacturer"]
            sample_object.technology = "MICROARRAY"

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment.protocol_description,
                protocol_url=experiment.source_url + '/protocols')
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info
            sample_object.save()

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)
            sample_object.save()

            sample_annotation = SampleAnnotation()
            sample_annotation.data = sample_data
            sample_annotation.sample = sample_object
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            original_file = OriginalFile()
            original_file.filename = filename
            original_file.source_filename = filename
            original_file.source_url = download_url
            original_file.is_downloaded = False
            original_file.is_archive = True
            original_file.has_raw = has_raw
            original_file.save()

            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.original_file = original_file
            original_file_sample_association.sample = sample_object
            original_file_sample_association.save()

            created_samples.append(sample_object)

            logger.debug(
                "Created " + str(sample_object),
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id,
                sample=sample_object.id)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment, sample=sample_object)
        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment, organism=organism)

    return created_samples
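# The decision tree in create_samples_from_api's docstring reduces to a small
# pure function. A sketch for clarity only -- `platform_supported` stands in
# for the real platform check, which lives elsewhere in the surveyor:
def choose_download_kind(has_raw: bool, has_derived: bool, platform_supported: bool) -> str:
    """Return which kind of data to download for an ArrayExpress sample."""
    if has_raw and platform_supported:
        # Raw data on a supported platform: download and process it,
        # abandoning any derived data.
        return "raw"
    if has_derived:
        # Raw data is absent or unsupported: download the derived data and no-op it.
        return "derived"
    # Only raw data on an unsupported platform (or nothing at all): skip.
    return "none"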
def test_bad_overlap(self): pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sample_annotation = SampleAnnotation() sample_annotation.data = {'hi': 'friend'} sample_annotation.sample = sample sample_annotation.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "big.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1237812' sample.title = 'GSM1237812' sample.organism = homo_sapiens sample.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "small.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']} ds.aggregate_by = 'ALL' # [ALL or SPECIES or EXPERIMENT] ds.scale_by = 'NONE' # [NONE or MINMAX or STANDARD or ROBUST] ds.email_address = "*****@*****.**" #ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) ds = Dataset.objects.get(id=ds.id) pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() # Now, make sure the bad can't zero this out. 
sample = Sample() sample.accession_code = 'GSM999' sample.title = 'GSM999' sample.organism = homo_sapiens sample.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "bad.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']} ds.aggregate_by = 'ALL' # [ALL or SPECIES or EXPERIMENT] ds.scale_by = 'NONE' # [NONE or MINMAX or STANDARD or ROBUST] ds.email_address = "*****@*****.**" #ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) ds = Dataset.objects.get(id=ds.id) self.assertEqual(len(final_context['final_frame']), 4)
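# test_bad_overlap's final assertion (a 4-row frame even after adding the
# 'bad' sample) is about join semantics when smashing sample columns together.
# A toy pandas illustration of why the join strategy matters; the values are
# invented and this is not the smasher's actual code path:
import pandas as pd

big = pd.DataFrame({'GSM1237810': [1.0, 2.0, 3.0, 4.0]}, index=['A', 'B', 'C', 'D'])
bad = pd.DataFrame({'GSM999': [9.0]}, index=['A'])

# An inner join lets the low-overlap sample collapse the frame to 1 row,
# while an outer join preserves all 4 genes (padding the bad sample with NaN).
assert len(big.join(bad, how='inner')) == 1
assert len(big.join(bad, how='outer')) == 4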
def _perform_imputation(job_context: Dict) -> Dict:
    """
    Take the inputs and perform the primary imputation.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283:
     - Combine all microarray samples with a full join to form a microarray_expression_matrix (this may end up being a DataFrame).
     - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join to form a rnaseq_expression_matrix.
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of the rnaseq_expression_matrix (rnaseq_row_sums).
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on the transposed_matrix; imputed_matrix -- with specified svd algorithm or skip
     - Untranspose imputed_matrix (genes are now rows, samples are now columns)
     - Quantile normalize imputed_matrix where genes are rows and samples are columns
    """
    imputation_start = log_state("start perform imputation", job_context["job"].id)
    job_context["time_start"] = timezone.now()

    rnaseq_row_sums_start = log_state("start rnaseq row sums", job_context["job"].id)

    # We can potentially have a microarray-only compendium, but not an RNA-seq-only one.
    log2_rnaseq_matrix = None
    # Initialized here so the zero-reset loop below also works on microarray-only runs.
    cached_zeroes = {}
    if job_context["rnaseq_matrix"] is not None:
        # Drop any genes that are entirely NULL in the RNA-Seq matrix
        job_context["rnaseq_matrix"] = job_context["rnaseq_matrix"].dropna(
            axis="columns", how="all")

        # Calculate the sum of the lengthScaledTPM values for each row
        # (gene) of the rnaseq_matrix (rnaseq_row_sums)
        rnaseq_row_sums = np.sum(job_context["rnaseq_matrix"], axis=1)
        log_state("end rnaseq row sums", job_context["job"].id, rnaseq_row_sums_start)

        rnaseq_decile_start = log_state("start rnaseq decile", job_context["job"].id)
        # Calculate the 10th percentile of rnaseq_row_sums
        rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10)
        log_state("end rnaseq decile", job_context["job"].id, rnaseq_decile_start)

        drop_start = log_state("drop all rows", job_context["job"].id)
        # Drop all rows in rnaseq_matrix with a row sum < 10th
        # percentile of rnaseq_row_sums; this is now
        # filtered_rnaseq_matrix
        # TODO: There is probably a better way to do this with `np.where`
        rows_to_filter = []
        for (x, sum_val) in rnaseq_row_sums.items():
            if sum_val < rnaseq_tenth_percentile:
                rows_to_filter.append(x)
        del rnaseq_row_sums

        log_state("actually calling drop()", job_context["job"].id)
        filtered_rnaseq_matrix = job_context.pop("rnaseq_matrix").drop(rows_to_filter)
        del rows_to_filter
        log_state("end drop all rows", job_context["job"].id, drop_start)

        log2_start = log_state("start log2", job_context["job"].id)
        # log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
        filtered_rnaseq_matrix_plus_one = filtered_rnaseq_matrix + 1
        log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix_plus_one)
        del filtered_rnaseq_matrix_plus_one
        del filtered_rnaseq_matrix
        log_state("end log2", job_context["job"].id, log2_start)

        cache_start = log_state("start caching zeroes", job_context["job"].id)
        # Cache our RNA-Seq zero values
        for column in log2_rnaseq_matrix.columns:
            cached_zeroes[column] = log2_rnaseq_matrix.index[np.where(
                log2_rnaseq_matrix[column] == 0)]

        # Set all zero values in log2_rnaseq_matrix to NA, but make sure
        # to keep track of where these zeroes are
        log2_rnaseq_matrix[log2_rnaseq_matrix == 0] = np.nan
        log_state("end caching zeroes", job_context["job"].id, cache_start)

    outer_merge_start = log_state("start outer merge", job_context["job"].id)
    # Perform a full outer join of microarray_matrix and
    # log2_rnaseq_matrix; combined_matrix
    if log2_rnaseq_matrix is not None:
        combined_matrix = job_context.pop("microarray_matrix").merge(
            log2_rnaseq_matrix, how="outer", left_index=True, right_index=True)
    else:
        logger.info("Building compendia with only microarray data.",
                    job_id=job_context["job"].id)
        combined_matrix = job_context.pop("microarray_matrix")

    log_state("ran outer merge, now deleting log2_rnaseq_matrix", job_context["job"].id)
    del log2_rnaseq_matrix
    log_state("end outer merge", job_context["job"].id, outer_merge_start)

    drop_na_genes_start = log_state("start drop NA genes", job_context["job"].id)

    # # Visualize Prefiltered
    # output_path = job_context['output_dir'] + "pre_filtered_" + str(time.time()) + ".png"
    # visualized_prefilter = visualize.visualize(combined_matrix.copy(), output_path)

    # Remove genes (rows) with fewer than 70% present values in combined_matrix
    thresh = combined_matrix.shape[1] * 0.7  # (Rows, Columns)
    # Everything below `thresh` is dropped
    row_filtered_matrix = combined_matrix.dropna(axis="index", thresh=thresh)
    del combined_matrix
    del thresh
    log_state("end drop NA genes", job_context["job"].id, drop_na_genes_start)

    drop_na_samples_start = log_state("start drop NA samples", job_context["job"].id)

    # # Visualize Row Filtered
    # output_path = job_context['output_dir'] + "row_filtered_" + str(time.time()) + ".png"
    # visualized_rowfilter = visualize.visualize(row_filtered_matrix.copy(), output_path)

    # Remove samples (columns) with <50% present values in combined_matrix
    # XXX: Find better test data for this!
    col_thresh = row_filtered_matrix.shape[0] * 0.5
    row_col_filtered_matrix_samples = row_filtered_matrix.dropna(
        axis="columns", thresh=col_thresh)
    row_col_filtered_matrix_samples_index = row_col_filtered_matrix_samples.index
    row_col_filtered_matrix_samples_columns = row_col_filtered_matrix_samples.columns
    log_state("end drop NA samples", job_context["job"].id, drop_na_samples_start)

    replace_zeroes_start = log_state("start replace zeroes", job_context["job"].id)
    for sample_accession_code in row_filtered_matrix.columns:
        if sample_accession_code not in row_col_filtered_matrix_samples_columns:
            sample = Sample.objects.get(accession_code=sample_accession_code)
            sample_metadata = sample.to_metadata_dict()
            job_context["filtered_samples"][sample_accession_code] = {
                **sample_metadata,
                "reason": "Sample was dropped because it had less than 50% present values.",
                "experiment_accession_code": smashing_utils.get_experiment_accession(
                    sample.accession_code, job_context["dataset"].data),
            }
    del row_filtered_matrix

    # # Visualize Row and Column Filtered
    # output_path = job_context['output_dir'] + "row_col_filtered_" + str(time.time()) + ".png"
    # visualized_rowcolfilter = visualize.visualize(row_col_filtered_matrix_samples.copy(),
    #                                               output_path)

    # "Reset" zero values that were set to NA in RNA-seq samples
    # (i.e., make these zero again) in combined_matrix
    for column in cached_zeroes.keys():
        zeroes = cached_zeroes[column]

        # Skip purged columns
        if column not in row_col_filtered_matrix_samples:
            continue

        # Place the zero
        try:
            # This generates a warning, so use loc[] instead
            # row_col_filtered_matrix_samples[column].replace(zeroes, 0.0, inplace=True)
            zeroes_list = zeroes.tolist()
            new_index_list = row_col_filtered_matrix_samples_index.tolist()
            new_zeroes = list(set(new_index_list) & set(zeroes_list))
            row_col_filtered_matrix_samples[column].loc[new_zeroes] = 0.0
        except Exception:
            logger.warn("Error when replacing zero")
            continue
    log_state("end replace zeroes", job_context["job"].id, replace_zeroes_start)

    transposed_zeroes_start = log_state("start replacing transposed zeroes",
                                        job_context["job"].id)
    # Label our new replaced data
    combined_matrix_zero = row_col_filtered_matrix_samples
    del row_col_filtered_matrix_samples

    transposed_matrix_with_zeros = combined_matrix_zero.T
    del combined_matrix_zero

    # Remove -inf and inf
    # This should never happen, but make sure it doesn't!
    transposed_matrix = transposed_matrix_with_zeros.replace([np.inf, -np.inf], np.nan)
    del transposed_matrix_with_zeros
    log_state("end replacing transposed zeroes", job_context["job"].id,
              transposed_zeroes_start)

    # Store the absolute/percentages of imputed values
    matrix_sum = transposed_matrix.isnull().sum()
    percent = (matrix_sum / transposed_matrix.isnull().count()).sort_values(ascending=False)
    total_percent_imputed = sum(percent) / len(transposed_matrix.count())
    job_context["total_percent_imputed"] = total_percent_imputed
    logger.info("Total percentage of data to impute!",
                total_percent_imputed=total_percent_imputed)

    # Perform imputation of missing values with IterativeSVD (rank=10) on the
    # transposed_matrix; imputed_matrix
    svd_algorithm = job_context["dataset"].svd_algorithm
    if svd_algorithm != "NONE":
        svd_start = log_state("start SVD", job_context["job"].id)
        logger.info("IterativeSVD algorithm: %s" % svd_algorithm)
        svd_algorithm = str.lower(svd_algorithm)
        imputed_matrix = IterativeSVD(
            rank=10, svd_algorithm=svd_algorithm).fit_transform(transposed_matrix)
        log_state("end SVD", job_context["job"].id, svd_start)
    else:
        imputed_matrix = transposed_matrix
        logger.info("Skipping IterativeSVD")
    del transposed_matrix

    untranspose_start = log_state("start untranspose", job_context["job"].id)
    # Untranspose imputed_matrix (genes are now rows, samples are now columns)
    untransposed_imputed_matrix = imputed_matrix.T
    del imputed_matrix

    # Convert back to Pandas
    untransposed_imputed_matrix_df = pd.DataFrame.from_records(untransposed_imputed_matrix)
    untransposed_imputed_matrix_df.index = row_col_filtered_matrix_samples_index
    untransposed_imputed_matrix_df.columns = row_col_filtered_matrix_samples_columns
    del untransposed_imputed_matrix
    del row_col_filtered_matrix_samples_index
    del row_col_filtered_matrix_samples_columns

    # Quantile normalize imputed_matrix where genes are rows and samples are columns
    job_context["organism"] = Organism.get_object_for_name(job_context["organism_name"])
    job_context["merged_no_qn"] = untransposed_imputed_matrix_df

    # output_path = job_context['output_dir'] + "compendia_no_qn_" + str(time.time()) + ".png"
    # visualized_merged_no_qn = visualize.visualize(untransposed_imputed_matrix_df.copy(),
    #                                               output_path)
    log_state("end untranspose", job_context["job"].id, untranspose_start)

    quantile_start = log_state("start quantile normalize", job_context["job"].id)
    # Perform the Quantile Normalization
    job_context = smashing_utils.quantile_normalize(job_context, ks_check=False)
    log_state("end quantile normalize", job_context["job"].id, quantile_start)

    # Visualize Final Compendia
    # output_path = job_context['output_dir'] + "compendia_with_qn_" + str(time.time()) + ".png"
    # visualized_merged_qn = visualize.visualize(job_context['merged_qn'].copy(), output_path)

    job_context["time_end"] = timezone.now()
    job_context["formatted_command"] = ["create_compendia.py"]

    log_state("end perform imputation", job_context["job"].id, imputation_start)
    return job_context
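# A condensed, self-contained sketch of the recipe in _perform_imputation's
# docstring, minus the logging, memory bookkeeping, and quantile-normalization
# steps. It assumes fancyimpute's stock IterativeSVD; the svd_algorithm
# keyword used above is omitted here since it may come from a patched build.
import numpy as np
import pandas as pd
from fancyimpute import IterativeSVD

def impute_sketch(microarray: pd.DataFrame, rnaseq: pd.DataFrame) -> pd.DataFrame:
    """Both inputs are genes-by-samples; returns the imputed genes-by-samples frame."""
    # Drop RNA-seq genes whose total expression falls in the bottom decile.
    row_sums = rnaseq.sum(axis=1)
    filtered = rnaseq[row_sums >= np.percentile(row_sums, 10)]

    # log2(x + 1), remembering where the zeroes were.
    log2_rnaseq = np.log2(filtered + 1)
    zero_mask = log2_rnaseq == 0
    log2_rnaseq = log2_rnaseq.mask(zero_mask)  # zeroes become NaN for now

    # Full outer join on the gene index, then drop sparse genes and samples.
    combined = microarray.merge(log2_rnaseq, how="outer",
                                left_index=True, right_index=True)
    combined = combined.dropna(axis="index", thresh=combined.shape[1] * 0.7)
    combined = combined.dropna(axis="columns", thresh=combined.shape[0] * 0.5)

    # Restore the remembered RNA-seq zeroes that survived the filtering.
    survivors = zero_mask.reindex(index=combined.index,
                                  columns=combined.columns, fill_value=False)
    combined[survivors] = 0.0

    # Impute on the samples-by-genes orientation, then flip back.
    imputed = IterativeSVD(rank=10).fit_transform(combined.T.values)
    return pd.DataFrame(imputed.T, index=combined.index, columns=combined.columns)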
def test_no_smash_all_diff_species(self): """ Smashing together with 'ALL' with different species is a really weird behavior. This test isn't really testing a normal case, just make sure that it's marking the unsmashable files. """ job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1237810_T09-1084.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() experiment = Experiment() experiment.accession_code = "GSE51084" experiment.save() mus_mus = Organism.get_object_for_name("MUS_MUSCULUS") sample = Sample() sample.accession_code = 'GSM1238108' sample.title = 'GSM1238108' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1238108-tbl-1.txt" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']} ds.aggregate_by = 'ALL' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = smasher.smash(job.pk, upload=False) dsid = ds.id ds = Dataset.objects.get(id=dsid) print(ds.failure_reason) print(final_context['dataset'].failure_reason) self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
def _create_result_objects(job_context: Dict) -> Dict: """ Store and host the result as a ComputationalResult object. """ result_start = log_state("start create result object", job_context["job"].id) result = ComputationalResult() result.commands.append(" ".join(job_context["formatted_command"])) result.is_ccdl = True # Temporary until we re-enable the QN test step. result.is_public = False result.time_start = job_context["time_start"] result.time_end = job_context["time_end"] try: processor_key = "CREATE_COMPENDIA" result.processor = utils.find_processor(processor_key) except Exception as e: return utils.handle_processor_exception(job_context, processor_key, e) result.save() # Write the compendia dataframe to a file job_context["csv_outfile"] = job_context["output_dir"] + job_context[ "organism_name"] + ".tsv" job_context["merged_qn"].to_csv(job_context["csv_outfile"], sep="\t", encoding="utf-8") organism_key = list(job_context["samples"].keys())[0] annotation = ComputationalResultAnnotation() annotation.result = result annotation.data = { "organism_id": job_context["samples"][organism_key][0].organism_id, "organism_name": job_context["organism_name"], "is_qn": False, "is_compendia": True, "samples": [ sample.accession_code for sample in job_context["samples"][organism_key] ], "num_samples": len(job_context["samples"][organism_key]), "experiment_accessions": [e.accession_code for e in job_context["experiments"]], "total_percent_imputed": job_context["total_percent_imputed"], } annotation.save() # Create the resulting archive final_zip_base = SMASHING_DIR + str( job_context["dataset"].pk) + "_compendia" # Copy LICENSE.txt and correct README.md files. if job_context["dataset"].quant_sf_only: readme_file = "/home/user/README_QUANT.md" else: readme_file = "/home/user/README_NORMALIZED.md" shutil.copy(readme_file, job_context["output_dir"] + "/README.md") shutil.copy("/home/user/LICENSE_DATASET.txt", job_context["output_dir"] + "/LICENSE.TXT") archive_path = shutil.make_archive(final_zip_base, "zip", job_context["output_dir"]) archive_computed_file = ComputedFile() archive_computed_file.absolute_file_path = archive_path archive_computed_file.filename = archive_path.split("/")[-1] archive_computed_file.calculate_sha1() archive_computed_file.calculate_size() archive_computed_file.is_smashable = False archive_computed_file.is_qn_target = False archive_computed_file.result = result archive_computed_file.save() # Compendia Result Helpers primary_organism = Organism.get_object_for_name( job_context["primary_organism"]) organisms = [ Organism.get_object_for_name(organism) for organism in job_context["all_organisms"] ] compendium_version = (CompendiumResult.objects.filter( primary_organism=primary_organism, quant_sf_only=False).count() + 1) # Save Compendia Result compendium_result = CompendiumResult() compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm compendium_result.compendium_version = compendium_version compendium_result.result = result compendium_result.primary_organism = primary_organism compendium_result.save() # create relations to all organisms contained in the compendia compendium_result_organism_associations = [] for compendium_organism in organisms: compendium_result_organism_association = CompendiumResultOrganismAssociation( ) compendium_result_organism_association.compendium_result = compendium_result compendium_result_organism_association.organism = compendium_organism 
compendium_result_organism_associations.append( compendium_result_organism_association) CompendiumResultOrganismAssociation.objects.bulk_create( compendium_result_organism_associations) job_context["compendium_result"] = compendium_result logger.info("Compendium created!", archive_path=archive_path, organism_name=job_context["organism_name"]) # Upload the result to S3 timestamp = str(int(time.time())) key = job_context["organism_name"] + "_" + str( compendium_version) + "_" + timestamp + ".zip" uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key) if not uploaded_to_s3: raise utils.ProcessorJobError( "Failed to upload compendia to S3", success=False, computed_file_id=archive_computed_file.id, ) if settings.RUNNING_IN_CLOUD: archive_computed_file.delete_local_file() job_context["result"] = result job_context["success"] = True log_state("end create result object", job_context["job"].id, result_start) # TEMPORARY for iterating on compendia more quickly. # Reset this so the end_job does clean up the job's non-input-data stuff. job_context["work_dir"] = job_context["old_work_dir"] return job_context
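# One sharp edge in _create_result_objects: compendium_version is "count of
# existing rows + 1", computed outside any lock, so two concurrent compendia
# jobs for the same organism could pick the same version. A sketch of a
# slightly more defensive variant; note that even select_for_update cannot
# lock rows that don't exist yet, so this narrows the race window rather than
# eliminating it.
from django.db import transaction

def next_compendium_version(primary_organism) -> int:
    with transaction.atomic():
        latest = (CompendiumResult.objects.select_for_update()
                  .filter(primary_organism=primary_organism, quant_sf_only=False)
                  .order_by("-compendium_version")
                  .first())
        return (latest.compendium_version + 1) if latest else 1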
def test_no_smash_dupe_two(self): """ Tests the SRP051449 case, where the titles collide. Also uses a real QN target file.""" job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() experiment = Experiment() experiment.accession_code = "SRP051449" experiment.save() result = ComputationalResult() result.save() danio_rerio = Organism.get_object_for_name("DANIO_RERIO") sample = Sample() sample.accession_code = 'SRR1731761' sample.title = 'Danio rerio' sample.organism = danio_rerio sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'SRR1731762' sample.title = 'Danio rerio' sample.organism = danio_rerio sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']} ds.aggregate_by = 'SPECIES' ds.scale_by = 'NONE' ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() cr = ComputationalResult() cr.save() computed_file = ComputedFile() computed_file.filename = "danio_target.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = cr computed_file.size_in_bytes = 123 computed_file.is_smashable = False computed_file.save() cra = ComputationalResultAnnotation() cra.data = {'organism_id': danio_rerio.id, 'is_qn': True} cra.result = cr cra.save() final_context = smasher.smash(job.pk, upload=False) self.assertTrue(final_context['success'])
def test_salmon_quant_two_samples_single_read(self): """Test `salmon quant` outputs on two samples that have single read and that belong to same experiment. """ prepare_organism_indices() # Create one experiment and two related samples, based on: # https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623 # (For testing purpose, only two of the four samples' data are included.) experiment_accession = 'PRJNA242809' experiment = Experiment.objects.create(accession_code=experiment_accession) c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS") ## Sample 1 sample1_accession = 'SRR1206053' sample1 = Sample.objects.create(accession_code=sample1_accession, organism=c_elegans) ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1) experiment_dir = "/home/user/data_store/salmon_tests/PRJNA242809" og_file_1 = OriginalFile() og_file_1.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206053.fastq.gz") og_file_1.filename = "SRR1206053.fastq.gz" og_file_1.save() OriginalFileSampleAssociation.objects.create(original_file=og_file_1, sample=sample1).save() ## Sample 2 sample2_accession = 'SRR1206054' sample2 = Sample.objects.create(accession_code=sample2_accession, organism=c_elegans) ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2) og_file_2 = OriginalFile() og_file_2.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206054.fastq.gz") og_file_2.filename = "SRR1206054.fastq.gz" og_file_2.save() OriginalFileSampleAssociation.objects.create(original_file=og_file_2, sample=sample2).save() # Test `salmon quant` on sample1 (SRR1206053) sample1_dir = os.path.join(experiment_dir, sample1_accession) job1_context = salmon._prepare_files({"job_dir_prefix": "TEST", "job_id": "TEST", 'pipeline': Pipeline(name="Salmon"), 'computed_files': [], "original_files": [og_file_1]}) # Check quant.sf in `salmon quant` output dir of sample1 self.check_salmon_quant(job1_context, sample1_dir) # Confirm that this experiment is not ready for tximport yet. experiments_ready = salmon._get_tximport_inputs(job1_context) self.assertEqual(len(experiments_ready), 0) # This job should not have produced any tximport output # because the other sample isn't ready yet. self.assertFalse(os.path.exists(os.path.join(job1_context["work_dir"], 'txi_out.RDS'))) # Now run `salmon quant` on sample2 (SRR1206054) too sample2_dir = os.path.join(experiment_dir, sample2_accession) job2_context = salmon._prepare_files({"job_dir_prefix": "TEST2", "job_id": "TEST2", 'pipeline': Pipeline(name="Salmon"), 'computed_files': [], "original_files": [og_file_2]}) # Clean up tximport output: rds_filename = os.path.join(job2_context["work_dir"], 'txi_out.RDS') if (os.path.isfile(rds_filename)): os.remove(rds_filename) # Check quant.sf in `salmon quant` output dir of sample2 self.check_salmon_quant(job2_context, sample2_dir) # rds_filename should have been generated by tximport at this point. # Note: `tximport` step is launched by subprocess module in Python. # If input "quant.sf" files are too large, we may have to wait for # a few seconds before testing the existence of rds_filename. 
self.assertTrue(os.path.exists(rds_filename))

for computed_file in job2_context['computed_files']:
    if computed_file.filename[-4:] == '.RDS':
        rds_file_path = computed_file.absolute_file_path

        cmd_tokens = [
            "/usr/bin/Rscript", "--vanilla",
            "/home/user/data_refinery_workers/processors/test_tximport.R",
            "--txi_out", rds_file_path,
            "--gene2txmap", job2_context["genes_to_transcripts_path"]
        ]
        tximport_test_result = subprocess.run(
            cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if tximport_test_result.returncode != 0:
            # If the exit code is not 0 then tximport failed, so fail the test.
            self.fail("test_tximport.R exited with a non-zero return code")

# Check the individual files
self.assertEqual(len(job2_context['individual_files']), 2)
for file in job2_context['individual_files']:
    self.assertTrue(os.path.isfile(file.absolute_file_path))
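# The pattern above -- the first job produces no txi_out.RDS, the second one
# does -- follows from the readiness rule inside _get_tximport_inputs:
# tximport only runs once every sample in the experiment has salmon quant
# output. A standalone sketch of that rule (the real function may also apply
# early-tximport thresholds and salmon-version checks; models are assumed
# imported as elsewhere in these tests):
def experiment_ready_for_tximport(experiment) -> bool:
    for sample in experiment.samples.all():
        result_ids = SampleResultAssociation.objects.filter(
            sample=sample).values_list('result_id', flat=True)
        has_quant = ComputedFile.objects.filter(
            result_id__in=result_ids, filename='quant.sf').exists()
        if not has_quant:
            return False
    return True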
def test_dualtech_smash(self): """ """ pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() # MICROARRAY TECH experiment = Experiment() experiment.accession_code = "GSE1487313" experiment.save() result = ComputationalResult() result.save() gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS") sample = Sample() sample.accession_code = 'GSM1487313' sample.title = 'GSM1487313' sample.organism = gallus_gallus sample.technology = "MICROARRAY" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1487313_liver.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() # RNASEQ TECH experiment2 = Experiment() experiment2.accession_code = "SRS332914" experiment2.save() result2 = ComputationalResult() result2.save() sample2 = Sample() sample2.accession_code = 'SRS332914' sample2.title = 'SRS332914' sample2.organism = gallus_gallus sample2.technology = "RNA-SEQ" sample2.save() sra2 = SampleResultAssociation() sra2.sample = sample2 sra2.result = result2 sra2.save() esa2 = ExperimentSampleAssociation() esa2.experiment = experiment2 esa2.sample = sample2 esa2.save() computed_file2 = ComputedFile() computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv" computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename computed_file2.result = result2 computed_file2.size_in_bytes = 234 computed_file2.is_smashable = True computed_file2.save() assoc2 = SampleComputedFileAssociation() assoc2.sample = sample2 assoc2.computed_file = computed_file2 assoc2.save() # CROSS-SMASH BY SPECIES ds = Dataset() ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']} ds.aggregate_by = 'SPECIES' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() self.assertTrue(ds.is_cross_technology()) final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) os.remove(final_context['output_file']) self.assertEqual(len(final_context['final_frame'].columns), 2) # THEN BY EXPERIMENT ds.aggregate_by = 'EXPERIMENT' ds.save() dsid = ds.id ds = Dataset.objects.get(id=dsid) pj.start_time = None pj.end_time = None pj.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) os.remove(final_context['output_file']) self.assertEqual(len(final_context['final_frame'].columns), 1) # THEN BY ALL ds.aggregate_by = 'ALL' ds.save() dsid = ds.id ds = Dataset.objects.get(id=dsid) pj.start_time = None pj.end_time = None pj.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) self.assertEqual(len(final_context['final_frame'].columns), 2)
def test_qn_reference(self): job = ProcessorJob() job.pipeline_applied = "QN_REFERENCE" job.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") experiment = Experiment() experiment.accession_code = "12345" experiment.save() for code in ['1', '2', '3', '4', '5', '6']: sample = Sample() sample.accession_code = code sample.title = code sample.platform_accession_code = 'A-MEXP-1171' sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS" sample.organism = homo_sapiens sample.technology = "MICROARRAY" sample.is_processed = True sample.save() cr = ComputationalResult() cr.save() file = ComputedFile() file.filename = code + ".tsv" file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv" file.size_in_bytes = int(code) file.result = cr file.is_smashable = True file.save() scfa = SampleComputedFileAssociation() scfa.sample = sample scfa.computed_file = file scfa.save() exsa = ExperimentSampleAssociation() exsa.experiment = experiment exsa.sample = sample exsa.save() dataset = Dataset() dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]} dataset.aggregate_by = "ALL" dataset.scale_by = "NONE" dataset.quantile_normalize = False # We don't QN because we're creating the target now dataset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dataset pjda.save() final_context = qn_reference.create_qn_reference(job.pk) self.assertTrue(final_context['success']) self.assertTrue(os.path.exists(final_context['target_file'])) self.assertEqual(os.path.getsize(final_context['target_file']), 556) target = utils.get_most_recent_qn_target_for_organism(homo_sapiens) self.assertEqual(target.sha1, '636d72d5cbf4b9785b0bd271a1430b615feaa7ea') ### # Smasher with QN ### pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() ds = Dataset() ds.data = {"12345": ["1", "2", "3", "4", "5"]} ds.aggregate_by = 'SPECIES' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(final_context['success']) self.assertEqual(final_context['merged_qn']['1'][0], -0.4379488528812934) self.assertEqual(final_context['original_merged']['1'][0], -0.576210936113982) ## # Test via management command ## from django.core.management import call_command from django.test import TestCase from django.utils.six import StringIO out = StringIO() try: call_command('create_qn_target', organism='homo_sapiens', min=1, stdout=out) except SystemExit as e: # this is okay! pass stdout = out.getvalue() self.assertTrue('Target file' in stdout) path = stdout.split('\n')[0].split(':')[1].strip() self.assertTrue(os.path.exists(path)) self.assertEqual(path, utils.get_most_recent_qn_target_for_organism(homo_sapiens).absolute_file_path)
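# The QN target that test_qn_reference builds (and that test_no_smash_dupe_two
# consumes as danio_target.tsv) is, at its core, a simple statistic: sort each
# sample's expression vector, then average across samples at each rank. A toy
# numpy sketch of that construction -- the production target comes from the
# qn_reference processor, not from this function:
import numpy as np

def build_qn_target(expression: np.ndarray) -> np.ndarray:
    """expression is genes x samples; returns one target value per rank."""
    sorted_columns = np.sort(expression, axis=0)  # sort each sample independently
    return sorted_columns.mean(axis=1)            # mean across samples at each rank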
def setup_experiment(new_version_accessions: List[str], old_version_accessions: List[str]) -> Dict: """ Create an experiment where some samples were processed with the newest version of salmon and other with an older one. """ # Create the experiment experiment_accession = "SRP095529" data_dir = "/home/user/data_store/" experiment_dir = data_dir + experiment_accession experiment = Experiment.objects.create(accession_code=experiment_accession, technology="RNA-SEQ") zebrafish = Organism.get_object_for_name("DANIO_RERIO") # Create the transcriptome processor and result: transcriptome_processor = Processor() transcriptome_processor.name = "Transcriptome" transcriptome_processor.version = "salmon 0.9.1" transcriptome_processor.docker_image = "dr_transcriptome" transcriptome_processor.environment = '{"some": "environment"}' transcriptome_processor.save() computational_result_short = ComputationalResult( processor=transcriptome_processor) computational_result_short.save() organism_index = OrganismIndex() organism_index.index_type = "TRANSCRIPTOME_SHORT" organism_index.organism = zebrafish organism_index.result = computational_result_short organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT" organism_index.salmon_version = "salmon 0.9.1" organism_index.save() comp_file = ComputedFile() # This path will not be used because we already have the files extracted. comp_file.absolute_file_path = ( "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz") comp_file.result = computational_result_short comp_file.size_in_bytes = 1337 comp_file.sha1 = "ABC" comp_file.s3_key = "key" comp_file.s3_bucket = "bucket" comp_file.save() quant_processor = Processor() quant_processor.name = "Salmon Quant" quant_processor.version = "salmon 0.9.1" quant_processor.docker_image = "dr_salmon" quant_processor.environment = '{"some": "environment"}' quant_processor.save() for accession_code in old_version_accessions: sample = Sample.objects.create( accession_code=accession_code, organism=zebrafish, source_database="SRA", technology="RNA-SEQ", platform_accession_code="IlluminaHiSeq1000", ) ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample) original_file = OriginalFile() original_file.filename = accession_code + ".SRA" original_file.source_filename = accession_code + ".SRA" original_file.save() OriginalFileSampleAssociation.objects.get_or_create( original_file=original_file, sample=sample) # Create and associate quant result and files. quant_result = ComputationalResult() quant_result.is_ccdl = True quant_result.processor = quant_processor quant_result.organism_index = organism_index # associate with OLD organism index quant_result.save() kv = ComputationalResultAnnotation() kv.data = {"index_length": "short"} kv.result = quant_result kv.is_public = True kv.save() # In prod the filename pattern will involve the timestamp # but here we're using the accession code so we can find # the archive file for the current sample. 
archive_filename = "result-" + accession_code + ".tar.gz" archive_file = ComputedFile() archive_file.filename = archive_filename archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename) archive_file.is_public = False archive_file.is_smashable = False archive_file.is_qc = False archive_file.result = quant_result archive_file.size_in_bytes = 12345 archive_file.save() quant_file = ComputedFile() quant_file.filename = "quant.sf" quant_file.absolute_file_path = (experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf") quant_file.is_public = False quant_file.is_smashable = False quant_file.is_qc = False quant_file.result = quant_result quant_file.size_in_bytes = 12345 quant_file.s3_bucket = "bucket" quant_file.s3_key = "key" quant_file.save() SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result) # Create another OrganismIndex with a newer version of transcriptome_processor = Processor() transcriptome_processor.name = "Transcriptome" transcriptome_processor.version = "salmon 0.13.1" transcriptome_processor.docker_image = "dr_transcriptome" transcriptome_processor.environment = '{"some": "environment"}' transcriptome_processor.save() computational_result_short = ComputationalResult( processor=transcriptome_processor) computational_result_short.save() organism_index = OrganismIndex() organism_index.index_type = "TRANSCRIPTOME_SHORT" organism_index.organism = zebrafish organism_index.result = computational_result_short organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT" organism_index.salmon_version = "salmon 0.13.1" # DIFFERENT SALMON VERSION organism_index.save() comp_file = ComputedFile() # This path will not be used because we already have the files extracted. comp_file.absolute_file_path = ( "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz") comp_file.result = computational_result_short comp_file.size_in_bytes = 1337 comp_file.sha1 = "ABC" comp_file.s3_key = "key" comp_file.s3_bucket = "bucket" comp_file.save() for accession_code in new_version_accessions: sample = Sample.objects.create( accession_code=accession_code, organism=zebrafish, source_database="SRA", technology="RNA-SEQ", platform_accession_code="IlluminaHiSeq1000", ) ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample) original_file = OriginalFile() original_file.filename = accession_code + ".SRA" original_file.source_filename = accession_code + ".SRA" original_file.save() OriginalFileSampleAssociation.objects.get_or_create( original_file=original_file, sample=sample) # Create and associate quant result and files. quant_result = ComputationalResult() quant_result.is_ccdl = True quant_result.processor = quant_processor quant_result.organism_index = organism_index # NEWER VERSION quant_result.save() kv = ComputationalResultAnnotation() kv.data = {"index_length": "short"} kv.result = quant_result kv.is_public = True kv.save() # In prod the filename pattern will involve the timestamp # but here we're using the accession code so we can find # the archive file for the current sample. 
archive_filename = "result-" + accession_code + ".tar.gz" archive_file = ComputedFile() archive_file.filename = archive_filename archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename) archive_file.is_public = False archive_file.is_smashable = False archive_file.is_qc = False archive_file.result = quant_result archive_file.size_in_bytes = 12345 archive_file.save() quant_file = ComputedFile() quant_file.filename = "quant.sf" quant_file.absolute_file_path = (experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf") quant_file.is_public = False quant_file.is_smashable = False quant_file.is_qc = False quant_file.result = quant_result quant_file.size_in_bytes = 12345 quant_file.s3_bucket = "bucket" quant_file.s3_key = "key" quant_file.save() SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result) return experiment
def test_make_experiment_result_associations(self):
    """Tests that the correct associations are made.

    The situation we're setting up is basically this:
      * tximport has been run for an experiment.
      * It made associations between the samples in the experiment and
        the ComputationalResult.
      * It didn't make associations between the experiment itself and
        the ComputationalResult.
      * There is a second experiment that hasn't had tximport run but
        shares a sample with the other experiment.
      * This second experiment has a sample which has not yet had
        tximport run on it.

    And what we're going to test for is:
      * An association is created between the tximport result and the
        first experiment.
      * An association is NOT created between the tximport result and
        the second experiment.
    """
    # Get an organism to set on samples:
    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)

    # Create the tximport processor and result:
    processor = Processor()
    processor.name = "Tximport"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    result = ComputationalResult()
    result.commands.append("tximport invocation")
    result.is_ccdl = True
    result.processor = processor
    result.save()

    # Create the first experiment and its samples:
    processed_experiment = Experiment()
    processed_experiment.accession_code = "SRP12345"
    processed_experiment.save()

    processed_sample_one = Sample()
    processed_sample_one.accession_code = "SRX12345"
    processed_sample_one.title = "SRX12345"
    processed_sample_one.organism = homo_sapiens
    processed_sample_one.save()

    sra = SampleResultAssociation()
    sra.sample = processed_sample_one
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = processed_experiment
    esa.sample = processed_sample_one
    esa.save()

    processed_sample_two = Sample()
    processed_sample_two.accession_code = "SRX12346"
    processed_sample_two.title = "SRX12346"
    processed_sample_two.organism = homo_sapiens
    processed_sample_two.save()

    sra = SampleResultAssociation()
    sra.sample = processed_sample_two
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = processed_experiment
    esa.sample = processed_sample_two
    esa.save()

    # Create the second experiment and its additional sample.
    unprocessed_experiment = Experiment()
    unprocessed_experiment.accession_code = "SRP6789"
    unprocessed_experiment.save()

    unprocessed_sample = Sample()
    unprocessed_sample.accession_code = "SRX6789"
    unprocessed_sample.title = "SRX6789"
    unprocessed_sample.organism = homo_sapiens
    unprocessed_sample.save()

    sra = SampleResultAssociation()
    sra.sample = unprocessed_sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = unprocessed_experiment
    esa.sample = unprocessed_sample
    esa.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = unprocessed_experiment
    esa.sample = processed_sample_two
    esa.save()

    # Run the function we're testing:
    make_experiment_result_associations()

    # Test that only one association was created and that it was
    # to the processed experiment:
    eras = ExperimentResultAssociation.objects.all()

    self.assertEqual(len(eras), 1)
    self.assertEqual(eras.first().experiment, processed_experiment)
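# make_experiment_result_associations back-fills ExperimentResultAssociation
# rows like the one asserted on above. When creating such associations by
# hand, get_or_create keeps the operation idempotent, so re-running the
# command cannot duplicate rows. A small illustrative pattern, not the
# command's actual implementation (the `result` field name is an assumption
# based on the model's naming convention):
def associate_result_with_experiment(experiment, result) -> bool:
    _, created = ExperimentResultAssociation.objects.get_or_create(
        experiment=experiment, result=result)
    return created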
def _populate_index_object(job_context: Dict) -> Dict: """ """ result = ComputationalResult() result.commands.append(job_context["salmon_formatted_command"]) try: processor_key = "TX_INDEX" result.processor = utils.find_processor(processor_key) except Exception as e: return utils.handle_processor_exception(job_context, processor_key, e) result.is_ccdl = True result.time_start = job_context["time_start"] result.time_end = job_context["time_end"] result.save() job_context['pipeline'].steps.append(result.id) computed_file = ComputedFile() computed_file.absolute_file_path = job_context["computed_archive"] computed_file.filename = os.path.split(job_context["computed_archive"])[-1] computed_file.calculate_sha1() computed_file.calculate_size() computed_file.result = result computed_file.is_smashable = False computed_file.is_qc = False computed_file.save() organism_object = Organism.get_object_for_name(job_context['organism_name']) index_object = OrganismIndex() index_object.organism = organism_object index_object.source_version = job_context["assembly_version"] index_object.assembly_name = job_context["assembly_name"] index_object.salmon_version = job_context["salmon_version"] index_object.index_type = "TRANSCRIPTOME_" + job_context['length'].upper() # This is where the index will be extracted to. index_object.absolute_directory_path = LOCAL_ROOT_DIR + "/TRANSCRIPTOME_INDEX/" \ + organism_object.name + "/" + job_context['length'] index_object.result = result if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME: logger.info("Uploading %s %s to s3", job_context['organism_name'], job_context['length'], processor_job=job_context["job_id"]) timestamp = str(timezone.now().timestamp()).split('.')[0] s3_key = organism_object.name + '_' + index_object.index_type + "_" + timestamp + '.tar.gz' sync_result = computed_file.sync_to_s3(S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key) if sync_result: computed_file.delete_local_file() else: logger.warn("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.", job_context['organism_name'], job_context['length'], processor_job=job_context["job_id"]) index_object.save() # We uploaded the file ourselves since we wanted it to go to a # different bucket than end_job would put it in, therefore empty # this list so end_job doesn't try to upload it again. job_context['computed_files'] = [] job_context['result'] = result job_context['computed_file'] = computed_file job_context['index'] = index_object # If there's not a long and a short index for this organism yet, # don't delete the input. # XXX: This will break once we introduce additional versions of these. short_indices = OrganismIndex.objects.filter(organism=organism_object, index_type="TRANSCRIPTOME_SHORT", source_version=job_context["assembly_version"]) long_indices = OrganismIndex.objects.filter(organism=organism_object, index_type="TRANSCRIPTOME_LONG", source_version=job_context["assembly_version"]) if short_indices.count() < 1 or long_indices.count() < 1: # utils.end_job deletes these, so remove them so it doesn't. job_context["original_files"] = [] return job_context
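# _populate_index_object stores one SHORT and one LONG index per organism and
# assembly version. Downstream salmon jobs then pick an index based on read
# length. A hedged sketch of that selection -- the 75bp cutoff here is an
# assumption for illustration, not taken from this file:
def pick_organism_index(organism_object, mean_read_length: float, source_version: str):
    index_type = "TRANSCRIPTOME_SHORT" if mean_read_length <= 75 else "TRANSCRIPTOME_LONG"
    return (OrganismIndex.objects
            .filter(organism=organism_object,
                    index_type=index_type,
                    source_version=source_version)
            .order_by("-id")
            .first())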
def handle(self, *args, **options): """ """ if not options["job_id"]: if options["organism"] is None and not options["all"]: logger.error("You must specify an organism or --all") sys.exit(1) if options["organism"] and (options.get("organism", "") != "ALL"): organisms = [ Organism.get_object_for_name(options["organism"].upper()) ] else: organisms = Organism.objects.all() for organism in organisms: if not organism_can_have_qn_target(organism): logger.error( "Organism does not have any platform with enough samples to generate a qn target", organism=organism, min=options["min"], ) continue samples = organism.sample_set.filter(has_raw=True, technology="MICROARRAY", is_processed=True) if samples.count() == 0: logger.error( "No processed samples for organism.", organism=organism, count=samples.count(), ) continue if options["platform"] is None: platform_counts = ( samples.values("platform_accession_code").annotate( dcount=Count("platform_accession_code")).order_by( "-dcount")) biggest_platform = platform_counts[0][ "platform_accession_code"] else: biggest_platform = options["platform"] sample_codes_results = Sample.processed_objects.filter( platform_accession_code=biggest_platform, has_raw=True, technology="MICROARRAY", organism=organism, is_processed=True, ).values("accession_code") sample_codes = [ res["accession_code"] for res in sample_codes_results ] dataset = Dataset() dataset.data = { organism.name + "_(" + biggest_platform + ")": sample_codes } dataset.aggregate_by = "ALL" dataset.scale_by = "NONE" dataset.quantile_normalize = False dataset.save() job = ProcessorJob() job.pipeline_applied = "QN_REFERENCE" job.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dataset pjda.save() final_context = qn_reference.create_qn_reference(job.pk) if final_context["success"]: print(":D") self.stdout.write("Target file: " + final_context["target_file"]) self.stdout.write( "Target S3: " + str(final_context["computed_files"][0].get_s3_url())) else: print(":(") else: qn_reference.create_qn_reference(options["job_id"])
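# For reference, this command can be exercised the same way test_qn_reference
# does it, via Django's call_command; the organism/min options mirror what
# handle() reads above (io.StringIO replaces the deprecated django.utils.six
# import used in that test):
from io import StringIO
from django.core.management import call_command

out = StringIO()
try:
    call_command('create_qn_target', organism='homo_sapiens', min=1, stdout=out)
except SystemExit:
    pass  # the command may exit explicitly on error paths
print(out.getvalue())  # e.g. "Target file: ..."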