def __init__(self, dmp_file):
    self.files = FileRepository.all()
    self.dmp_file = dmp_file
    self.bam_path = dmp_file.file.path
    self.metadata = dmp_file.metadata
    self.mutations_extended = self._set_data_muts_txt()
    self.dmp_sample_name = self._set_dmp_sample_name()
def __init__(
    self,
    model,
    job_group_id=None,
    job_group_notifier_id=None,
    request_id=None,
    run_ids=None,
    pipeline=None,
    pairing=None,
    output_directory_prefix=None,
):
    if not isinstance(model, OperatorModel):
        raise Exception("Must pass an instance of beagle_etl.models.Operator")
    self.model = model
    self.request_id = request_id
    self.job_group_id = job_group_id
    self.job_group_notifier_id = job_group_notifier_id
    # avoid a shared mutable default argument: fall back to a fresh list per instance
    self.run_ids = run_ids if run_ids is not None else []
    self.files = FileRepository.all()
    # expected shape: {"pairs": [{"tumor": "tumorSampleName", "normal": "normalSampleName"}]}
    self.pairing = pairing
    self.output_directory_prefix = output_directory_prefix
    self._jobs = []
    self._pipeline = pipeline
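# Hedged sketch (not part of the original module): the shape the `pairing`
# argument is expected to take, per the inline comment above. The sample names
# are made-up placeholders.
example_pairing = {
    "pairs": [
        {"tumor": "s_C_000001_T001_d", "normal": "s_C_000001_N001_d"},
    ]
}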
def __init__(self, sample_id):
    self.files = FileRepository.all()
    self.sample_id = sample_id
    self.patient_id, self.cmo_sample_name = self._get_sample_metadata()
    self.dmp_patient_id = self._get_dmp_patient_id()
    self.dmp_bams_tumor = self._find_dmp_bams("T")
    self.dmp_bams_normal = self._find_dmp_bams("N")
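# Hedged usage sketch (assumes a configured Django environment and a populated
# FileRepository; the IGO sample id below is a made-up placeholder):
def _example_sample_data():
    sample = SampleData("12345_A_1")
    # DMP BAMs are split into tumor and normal lists at construction time
    return sample.dmp_bams_tumor, sample.dmp_bams_normal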
def _get_request_id(self):
    files = FileRepository.all()
    request_ids = set()
    for run_id in self.run_ids:
        run = Run.objects.filter(id=run_id)[0]
        sample_name = run.tags['sampleNameTumor']
        sample_files = FileRepository.filter(queryset=files, metadata={'cmoSampleName': sample_name})
        for f in sample_files:
            metadata = f.metadata
            if 'requestId' in metadata:
                request_ids.add(metadata['requestId'])
    request_id = "_".join(list(request_ids))
    return request_id
def _get_samples_data(self):
    files = FileRepository.all()
    f = FileRepository.filter(
        queryset=files,
        metadata={"cmoSampleName": self.tumor_sample_name, "igocomplete": True},
        filter_redact=True,
    )
    sample = None
    if f:
        # retrieve metadata from first record (should only be one)
        meta = f[0].metadata
        sample_id = meta["sampleId"]
        sample = SampleData(sample_id)
    return sample
def get_dmp_bam(patient_id, bait_set, tumor_type):
    """
    From a patient id, bait set, and tumor type, get the matching DMP BAM
    """
    file_objs = FileRepository.all()
    dmp_query = build_dmp_query(patient_id, bait_set)
    dmp_bam = FileRepository.filter(queryset=file_objs, q=dmp_query).order_by('file__file_name').first()
    if dmp_bam:
        sample = build_dmp_sample(dmp_bam, patient_id, bait_set, tumor_type)
        built_sample = build_sample([sample], ignore_sample_formatting=True)
        return built_sample
    return None
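# Hedged usage sketch (not from the original source): fetching a DMP tumor BAM
# for a patient. The patient id, bait set, and tumor type values are
# hypothetical placeholders.
def _example_get_dmp_tumor_bam():
    built = get_dmp_bam("P-0000001", "IMPACT468_BAITS", "T")
    if built is None:
        # no DMP BAM registered for this patient/bait set combination
        return None
    return built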
def get_pooled_normal_files(run_ids, preservation_types, bait_set):
    pooled_normals = FileRepository.all()
    query = Q(file__file_group=settings.POOLED_NORMAL_FILE_GROUP)
    run_id_query = build_run_id_query(run_ids)
    preservation_query = build_preservation_query(preservation_types)
    q = query & run_id_query & preservation_query
    pooled_normals = FileRepository.filter(queryset=pooled_normals, q=q)
    pooled_normals, descriptor, sample_name = get_descriptor(bait_set, pooled_normals, preservation_types, run_ids)
    return pooled_normals, descriptor, sample_name
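# Hedged usage sketch: the function returns the filtered pooled-normal file
# queryset together with the descriptor (bait set / recipe) and the generated
# sample name chosen by get_descriptor. All argument values below are
# placeholders.
def _example_pooled_normal_files():
    files, descriptor, sample_name = get_pooled_normal_files(["JAX_0001"], ["Frozen"], "IMPACT468_BAITS")
    return files, descriptor, sample_name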
def get_dmp_normal(patient_id, bait_set):
    """
    From a patient id and bait set, get matching dmp bam normal
    """
    file_objs = FileRepository.all()
    dmp_query = build_dmp_query(patient_id, bait_set)
    dmp_bam = FileRepository.filter(queryset=file_objs, q=dmp_query).order_by("file__file_name").first()
    if dmp_bam:
        dmp_metadata = dmp_bam.metadata
        specimen_type = "DMP Normal"
        sample_name = dmp_metadata["external_id"]
        sequencingCenter = "MSKCC"
        platform = "Illumina"
        sample = dict()
        sample["id"] = dmp_bam.file.id
        sample["path"] = dmp_bam.file.path
        sample["file_name"] = dmp_bam.file.file_name
        sample["file_type"] = dmp_bam.file.file_type
        metadata = init_metadata()
        metadata["sampleId"] = sample_name
        metadata["sampleName"] = format_sample_name(sample_name, specimen_type)
        metadata["requestId"] = sample_name
        metadata["sequencingCenter"] = sequencingCenter
        metadata["platform"] = platform
        metadata["baitSet"] = bait_set
        metadata["recipe"] = bait_set
        metadata["run_id"] = ""
        metadata["preservation"] = ""
        metadata["libraryId"] = sample_name + "_1"
        metadata["R"] = "Not applicable"
        # because rgid depends on flowCellId and barcodeIndex, we will
        # spoof barcodeIndex so that pairing can work properly; see
        # build_sample in runner.operator.argos_operator.bin
        metadata["barcodeIndex"] = "DMP_BARCODEIDX"
        metadata["flowCellId"] = "DMP_FCID"
        metadata["tumorOrNormal"] = "Normal"
        metadata["patientId"] = patient_id
        metadata["specimenType"] = specimen_type
        sample["metadata"] = metadata
        built_sample = build_sample([sample], ignore_sample_formatting=True)
        return built_sample
    return None
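# Hedged usage sketch: prefer a patient's DMP normal BAM when one exists,
# otherwise signal that a pooled normal should be used instead. Identifiers
# passed in are expected to be real patient ids and bait sets; nothing here is
# specific to the original module.
def _example_normal_for_pairing(patient_id, bait_set):
    dmp_normal = get_dmp_normal(patient_id, bait_set)
    if dmp_normal:
        return dmp_normal
    # caller falls back to pooled normals elsewhere in the pairing logic
    return None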
def get_oncotree_codes(request_id):
    oncotree_dh = OncotreeDataHandler()
    files = FileRepository.all()
    oncotree_codes_tmp = set(
        FileRepository.filter(queryset=files, metadata={"requestId": request_id}).values_list(
            "metadata__oncoTreeCode", flat=True
        )
    )
    oncotree_codes = list()
    for val in oncotree_codes_tmp:
        if val:
            oncotree_codes.append(val)
    if not oncotree_codes:
        # hack; if there are no oncotree codes, just say it's mixed
        return "mixed"
    shared_nodes = oncotree_dh.find_shared_nodes_by_code_list(oncotree_codes)
    common_anc = oncotree_dh.get_highest_level_shared_node(shared_nodes)
    if common_anc.code.lower() == "tissue":
        common_anc.code = "mixed"
    return common_anc.code.lower()
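# Hedged usage sketch: resolve a single oncotree label for a request. Requests
# with no codes, or whose only shared ancestor is the root "tissue" node, come
# back as "mixed". The request id is a placeholder.
def _example_oncotree_label():
    return get_oncotree_codes("12345_A")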
def get_request_pi(run_id_list):
    request_pis = set()
    files = FileRepository.all()
    all_request_ids = set()  # reducing number of queries
    for run_id in run_id_list:
        argos_run = Run.objects.get(id=run_id)
        run_request_id = argos_run.tags["requestId"]
        all_request_ids.add(run_request_id)
    for request_id in all_request_ids:
        investigator_emails = FileRepository.filter(queryset=files, metadata={"requestId": request_id}).values_list(
            "metadata__investigatorEmail", flat=True
        )
        request_pis = request_pis.union(set(investigator_emails))
    request_pis_final = list()
    for request_pi in request_pis:
        if request_pi:
            request_pis_final.append(format_msk_id(request_pi))
    return ",".join(request_pis_final)
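# Hedged usage sketch: produce the comma-separated string of formatted
# investigator ids for the requests behind a set of ARGOS runs. The run id is
# a placeholder UUID string.
def _example_request_pi_field():
    return get_request_pi(["00000000-0000-0000-0000-000000000000"])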
def get_samples_from_patient_id(patient_id):
    """
    Retrieves samples from the database based on the patient_id

    Only retrieve patients from LIMS file group
    """
    all_files = FileRepository.all()
    q_pid = Q(metadata__patientId=patient_id)
    q_fg = build_argos_file_groups_query()
    q = q_pid & q_fg
    files = FileRepository.filter(queryset=all_files, q=q, filter_redact=True)
    data = list()
    for current_file in files:
        sample = dict()
        sample["id"] = current_file.file.id
        sample["path"] = current_file.file.path
        sample["file_name"] = current_file.file.file_name
        sample["metadata"] = current_file.metadata
        data.append(sample)
    samples = list()
    # group by igoId
    igo_id_group = dict()
    for sample in data:
        igo_id = sample["metadata"]["sampleId"]
        if igo_id not in igo_id_group:
            igo_id_group[igo_id] = list()
        igo_id_group[igo_id].append(sample)
    for igo_id in igo_id_group:
        samples.append(build_sample(igo_id_group[igo_id]))
    samples, bad_samples = remove_with_caveats(samples)
    number_of_bad_samples = len(bad_samples)
    if number_of_bad_samples > 0:
        LOGGER.warning("Patient query %s returned %i invalid samples, which were removed", patient_id, number_of_bad_samples)
    return samples
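# Hedged usage sketch: samples come back grouped by IGO sample id and built via
# build_sample, with invalid entries already dropped by remove_with_caveats.
# The patient id is a made-up placeholder.
def _example_samples_for_patient():
    samples = get_samples_from_patient_id("C-000001")
    return samples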
def get_jobs(self, pairing_override=None):
    logger.info("Operator JobGroupNotifier ID %s", self.job_group_notifier_id)
    tmpdir = os.path.join(settings.BEAGLE_SHARED_TMPDIR, str(uuid.uuid4()))
    self.OUTPUT_DIR = tmpdir
    Path(self.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    recipe_query = self.build_recipe_query()
    assay_query = self.build_assay_query()
    igocomplete_query = Q(metadata__igocomplete=True)
    missing_fields_query = self.filter_out_missing_fields_query()
    q = recipe_query & assay_query & igocomplete_query & missing_fields_query
    files = FileRepository.all()
    tempo_files = FileRepository.filter(queryset=files, q=q)
    self.send_message("""
        Querying database for the following recipes:
        {recipes}

        Querying database for the following assays/bait sets:
        {assays}
        """.format(recipes="\t\n".join(self.get_recipes()), assays="\t\n".join(self.get_assays())))
    exclude_query = self.get_exclusions()
    if exclude_query:
        tempo_files = tempo_files.exclude(exclude_query)

    # replace with run operator logic, most recent pairing
    pre_pairing = self.load_pairing_file(PAIRING_FILE_LOCATION)  # pairing.tsv is not in repo
    if pairing_override:
        normal_samples = pairing_override['normal_samples']
        tumor_samples = pairing_override['tumor_samples']
        num_ns = len(normal_samples)
        num_ts = len(tumor_samples)
        if num_ns != num_ts:
            print("Number of tumors and normals not the same; can't pair")
        else:
            for i in range(0, num_ns):
                tumor_id = tumor_samples[i]
                normal_id = normal_samples[i]
                pre_pairing[tumor_id] = normal_id

    patient_ids = set()
    patient_files = dict()
    no_patient_samples = list()
    for entry in tempo_files:
        patient_id = entry.metadata['patientId']
        if patient_id:
            patient_ids.add(patient_id)
            if patient_id not in patient_files:
                patient_files[patient_id] = list()
            patient_files[patient_id].append(entry)
        else:
            no_patient_samples.append(entry)

    self.patients = dict()
    self.non_cmo_patients = dict()
    for patient_id in patient_files:
        if "C-" in patient_id[:2]:
            self.patients[patient_id] = patient_obj.Patient(patient_id, patient_files[patient_id], pre_pairing)
        else:
            self.non_cmo_patients[patient_id] = patient_obj.Patient(patient_id, patient_files[patient_id])

    input_json = dict()
    # output these strings to file
    input_json['conflict_data'] = self.create_conflict_samples_txt_file()
    input_json['unpaired_data'] = self.create_unpaired_txt_file()
    input_json['mapping_data'] = self.create_mapping_file()
    input_json['pairing_data'] = self.create_pairing_file()
    input_json['tracker_data'] = self.create_tracker_file()

    pickle_file = os.path.join(self.OUTPUT_DIR, "patients_data_pickle")
    with open(pickle_file, 'wb') as fh:
        pickle.dump(self.patients, fh)
    os.chmod(pickle_file, 0o777)
    self.register_tmp_file(pickle_file)
    input_json['pickle_data'] = {'class': 'File', 'location': "juno://" + pickle_file}

    beagle_version = __version__
    run_date = datetime.now().strftime("%Y%m%d_%H:%M:%f")
    tags = {"beagle_version": beagle_version, "run_date": run_date}
    app = self.get_pipeline_id()
    pipeline = Pipeline.objects.get(id=app)
    pipeline_version = pipeline.version
    output_directory = pipeline.output_directory
    self.debug_json = input_json
    tempo_mpgen_outputs_job_data = {
        'app': app,
        'inputs': input_json,
        'name': "Tempo mpgen %s" % run_date,
        'tags': tags,
        'output_directory': output_directory,
    }
    tempo_mpgen_outputs_job = [(APIRunCreateSerializer(data=tempo_mpgen_outputs_job_data), input_json)]
    return tempo_mpgen_outputs_job
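# Hedged sketch (not from the original source): the shape get_jobs expects for
# pairing_override, based on the keys it reads above. The sample names are
# made up; the tumor and normal lists must be the same length or the override
# is skipped.
example_pairing_override = {
    "tumor_samples": ["s_C_000001_T001_d", "s_C_000002_T001_d"],
    "normal_samples": ["s_C_000001_N001_d", "s_C_000002_N001_d"],
}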
def get_pooled_normals(run_ids, preservation_types, bait_set):
    """
    From a list of run_ids, preservation types, and bait sets, get all potential pooled normals
    """
    pooled_normals = FileRepository.all()
    query = Q(file__file_group=settings.POOLED_NORMAL_FILE_GROUP)
    run_id_query = build_run_id_query(run_ids)
    preservation_query = build_preservation_query(preservation_types)
    q = query & run_id_query & preservation_query
    pooled_normals = FileRepository.filter(queryset=pooled_normals, q=q)
    descriptor = get_descriptor(bait_set, pooled_normals)

    if descriptor:
        # From returned pooled normals, we found the bait set/recipe we're looking for
        pooled_normals = FileRepository.filter(queryset=pooled_normals, metadata={'recipe': descriptor})
        # sample_name is FROZENPOOLEDNORMAL unless FFPE is in any of the preservation types
        # in preservation_types
        preservations_lower_case = set([x.lower() for x in preservation_types])
        run_ids_suffix_list = [i for i in run_ids if i]  # remove empty or false string values
        run_ids_suffix = "_".join(run_ids_suffix_list)
        sample_name = "FROZENPOOLEDNORMAL_" + run_ids_suffix
        if "ffpe" in preservations_lower_case:
            sample_name = "FFPEPOOLEDNORMAL_" + run_ids_suffix
    elif "impact505" in bait_set.lower():
        # We didn't find a pooled normal for IMPACT505; return "static" FROZEN or FFPE pool normal
        preservations_lower_case = set([x.lower() for x in preservation_types])
        sample_name = "FROZENPOOLEDNORMAL_IMPACT505_V1"
        if "ffpe" in preservations_lower_case:
            sample_name = "FFPEPOOLEDNORMAL_IMPACT505_V1"
        q = query & Q(metadata__sampleName=sample_name)
        pooled_normals = FileRepository.filter(queryset=pooled_normals, q=q)
        if not pooled_normals:
            LOGGER.error("Could not find IMPACT505 pooled normal to pair %s", sample_name)
            return None
    else:
        return None

    specimen_type = 'Pooled Normal'
    sample_files = list()
    if len(pooled_normals) > 0:
        for pooled_normal in pooled_normals:
            sample = dict()
            sample['id'] = pooled_normal.file.id
            sample['path'] = pooled_normal.file.path
            sample['file_name'] = pooled_normal.file.file_name
            metadata = init_metadata()
            metadata['sampleId'] = sample_name
            metadata['sampleName'] = sample_name
            metadata['requestId'] = sample_name
            metadata['sequencingCenter'] = "MSKCC"
            metadata['platform'] = "Illumina"
            metadata['baitSet'] = descriptor
            metadata['recipe'] = descriptor
            metadata['run_id'] = run_ids
            metadata['preservation'] = preservation_types
            metadata['libraryId'] = sample_name + "_1"
            # because rgid depends on flowCellId and barcodeIndex, we will
            # spoof barcodeIndex so that pairing can work properly; see
            # build_sample in runner.operator.argos_operator.bin
            metadata['R'] = get_r_orientation(pooled_normal.file.file_name)
            metadata['barcodeIndex'] = spoof_barcode(sample['file_name'], metadata['R'])
            metadata['flowCellId'] = 'PN_FCID'
            metadata['tumorOrNormal'] = 'Normal'
            metadata['patientId'] = 'PN_PATIENT_ID'
            metadata['specimenType'] = specimen_type
            sample['metadata'] = metadata
            sample_files.append(sample)
        pooled_normal = build_sample(sample_files, ignore_sample_formatting=True)
        return pooled_normal
    return None
def get_pooled_normals(run_ids, preservation_types, bait_set):
    """
    From a list of run_ids, preservation types, and bait sets, get all potential pooled normals
    """
    pooled_normals = FileRepository.all()
    query = Q(file__file_group=settings.POOLED_NORMAL_FILE_GROUP)
    run_id_query = build_run_id_query(run_ids)
    preservation_query = build_preservation_query(preservation_types)
    q = query & run_id_query & preservation_query
    pooled_normals = FileRepository.filter(queryset=pooled_normals, q=q)

    # 'descriptor' should be the same as bait set, but it's labeled
    # descriptor because in pooled normals it's called 'recipe'
    # TODO: change pooled normal field value 'recipe' -> bait_set/baitSet
    descriptor = get_descriptor(bait_set, pooled_normals)
    if not descriptor:  # i.e., no pooled normal
        return None

    pooled_normals = FileRepository.filter(queryset=pooled_normals, metadata={"recipe": descriptor})
    sample_files = list()
    # sample_name is FROZENPOOLEDNORMAL unless FFPE is in any of the preservation types
    # in preservation_types
    preservations_lower_case = set([x.lower() for x in preservation_types])
    run_ids_suffix_list = [i for i in run_ids if i]  # remove empty or false string values
    run_ids_suffix = "_".join(run_ids_suffix_list)
    sample_name = "FROZENPOOLEDNORMAL_" + run_ids_suffix
    if "ffpe" in preservations_lower_case:
        sample_name = "FFPEPOOLEDNORMAL_" + run_ids_suffix
    specimen_type = "Pooled Normal"

    num_of_pooled_normals = len(pooled_normals)
    if num_of_pooled_normals > 0:
        for pooled_normal in pooled_normals:
            sample = dict()
            sample["id"] = pooled_normal.file.id
            sample["path"] = pooled_normal.file.path
            sample["file_name"] = pooled_normal.file.file_name
            metadata = init_metadata()
            metadata["sampleId"] = sample_name
            metadata["sampleName"] = sample_name
            metadata["requestId"] = sample_name
            metadata["sequencingCenter"] = "MSKCC"
            metadata["platform"] = "Illumina"
            metadata["baitSet"] = descriptor
            metadata["recipe"] = descriptor
            metadata["run_id"] = run_ids
            metadata["preservation"] = preservation_types
            metadata["libraryId"] = sample_name + "_1"
            # because rgid depends on flowCellId and barcodeIndex, we will
            # spoof barcodeIndex so that pairing can work properly; see
            # build_sample in runner.operator.argos_operator.bin
            metadata["R"] = get_r_orientation(pooled_normal.file.file_name)
            metadata["barcodeIndex"] = spoof_barcode(sample["file_name"], metadata["R"])
            metadata["flowCellId"] = "PN_FCID"
            metadata["tumorOrNormal"] = "Normal"
            metadata["patientId"] = "PN_PATIENT_ID"
            metadata["specimenType"] = specimen_type
            sample["metadata"] = metadata
            sample_files.append(sample)
        pooled_normal = build_sample(sample_files, ignore_sample_formatting=True)
        return pooled_normal
    return None
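# Hedged usage sketch: retrieving pooled normals for a set of runs. The
# resulting sample name is derived from preservation type and run ids
# (FROZENPOOLEDNORMAL_<run ids> vs FFPEPOOLEDNORMAL_<run ids>). All argument
# values below are placeholders.
def _example_pooled_normal():
    return get_pooled_normals(["JAX_0001"], ["Frozen"], "IMPACT468_BAITS")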
def get_jobs(self, pairing_override=None):
    logger.info("Operator JobGroupNotifier ID %s", self.job_group_notifier_id)
    app = self.get_pipeline_id()
    pipeline = Pipeline.objects.get(id=app)
    pipeline_version = pipeline.version
    output_directory = pipeline.output_directory
    self.OUTPUT_DIR = output_directory
    recipe_query = self.build_recipe_query()
    assay_query = self.build_assay_query()
    igocomplete_query = Q(metadata__igocomplete=True)
    missing_fields_query = self.filter_out_missing_fields_query()
    q = recipe_query & assay_query & igocomplete_query & missing_fields_query
    files = FileRepository.all()
    files = FileRepository.filter(queryset=files, filter_redact=True)
    tempo_files = FileRepository.filter(queryset=files, q=q)
    tempo_files = FileRepository.filter(queryset=tempo_files, filter_redact=True)
    self.send_message("""
        Querying database for the following recipes:
        {recipes}

        Querying database for the following assays/bait sets:
        {assays}
        """.format(recipes="\t\n".join(self.get_recipes()), assays="\t\n".join(self.get_assays())))
    exclude_query = self.get_exclusions()
    if exclude_query:
        tempo_files = tempo_files.exclude(exclude_query)

    # replace with run operator logic, most recent pairing
    pre_pairing = self.load_pairing_file(PAIRING_FILE_LOCATION)  # pairing.tsv is not in repo
    if pairing_override:
        normal_samples = pairing_override["normal_samples"]
        tumor_samples = pairing_override["tumor_samples"]
        num_ns = len(normal_samples)
        num_ts = len(tumor_samples)
        if num_ns != num_ts:
            print("Number of tumors and normals not the same; can't pair")
        else:
            for i in range(0, num_ns):
                tumor_id = tumor_samples[i]
                normal_id = normal_samples[i]
                pre_pairing[tumor_id] = normal_id

    patient_ids = set()
    patient_files = dict()
    no_patient_samples = list()
    for entry in tempo_files:
        patient_id = entry.metadata["patientId"]
        if patient_id:
            patient_ids.add(patient_id)
            if patient_id not in patient_files:
                patient_files[patient_id] = list()
            patient_files[patient_id].append(entry)
        else:
            no_patient_samples.append(entry)

    self.patients = dict()
    self.non_cmo_patients = dict()
    for patient_id in patient_files:
        if "C-" in patient_id[:2]:
            self.patients[patient_id] = patient_obj.Patient(patient_id, patient_files[patient_id], pre_pairing)
        else:
            self.non_cmo_patients[patient_id] = patient_obj.Patient(patient_id, patient_files[patient_id])

    input_json = dict()
    # output these strings to file
    input_json["conflict_data"] = self.create_conflict_samples_txt_file()
    input_json["unpaired_data"] = self.create_unpaired_txt_file()
    input_json["mapping_data"] = self.create_mapping_file()
    input_json["pairing_data"] = self.create_pairing_file()
    input_json["tracker_data"] = self.create_tracker_file()

    pickle_file = os.path.join(self.OUTPUT_DIR, "patients_data_pickle")
    with open(pickle_file, "wb") as fh:
        pickle.dump(self.patients, fh)
    os.chmod(pickle_file, 0o777)
    self.register_tmp_file(pickle_file)
    input_json["pickle_data"] = {"class": "File", "location": "juno://" + pickle_file}

    beagle_version = __version__
    run_date = datetime.now().strftime("%Y%m%d_%H:%M:%f")
    tags = {"beagle_version": beagle_version, "run_date": run_date}
    self.send_message("""
        Writing files to {file_path}.

        Run Date: {run_date}
        Beagle Version: {beagle_version}
        """.format(file_path=self.OUTPUT_DIR, run_date=run_date, beagle_version=beagle_version))
    return []
def get_file(fpath):
    files = FileRepository.all()
    data = FileRepository.filter(queryset=files, path=fpath)
    if data:
        return data[0]
    return None
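# Hedged usage sketch: look up a registered file by path; returns the first
# matching repository entry or None. The path below is a made-up placeholder.
def _example_lookup_bam():
    entry = get_file("/path/to/example_sample.bam")
    if entry:
        return entry.file.file_name, entry.metadata
    return None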