def generate_sample_pairing_and_mapping_files(run_ids):
    sample_pairing = ""
    sample_mapping = ""
    data_clinical = ""  # stays empty when run_ids matches no runs
    runs = Run.objects.filter(id__in=run_ids)
    request_id_set = set()
    files = list()
    if runs:
        pipeline = runs[0].app
        for r in runs:
            request_id_set.add(r.tags["requestId"])
            inp_port = Port.objects.filter(run_id=r.id, name="pair").first()
            # The tumor half of the pair carries FASTQ keys only; the normal
            # half may also carry pooled-normal BAMs.
            tumor_sample_name = inp_port.db_value[0]["ID"]
            for key in ("R1", "R2", "zR1", "zR2"):
                for p in inp_port.db_value[0][key]:
                    filepath = FileProcessor.get_file_path(p["location"])
                    sample_mapping += "\t".join([tumor_sample_name, filepath]) + "\n"
                    files.append(filepath)
            normal_sample_name = inp_port.db_value[1]["ID"]
            for key in ("R1", "R2", "zR1", "zR2", "bam"):
                for p in inp_port.db_value[1][key]:
                    filepath = FileProcessor.get_file_path(p["location"])
                    sample_mapping += "\t".join([normal_sample_name, filepath]) + "\n"
                    files.append(filepath)
            sample_pairing += "\t".join([normal_sample_name, tumor_sample_name]) + "\n"
        data_clinical = generate_sample_data_content(
            files,
            pipeline_name=pipeline.name,
            pipeline_github=pipeline.github,
            pipeline_version=pipeline.version,
        )
    return sample_mapping, sample_pairing, data_clinical
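# The per-key loops above repeat the same accumulate-row pattern for every
# FASTQ/BAM key. A hypothetical helper (name and signature are illustrative,
# not part of this module) could express the pattern once and be shared with
# the operators below:
def _mapping_rows_for_sample(sample, keys, resolve_path):
    """Yield (sample_id, filepath) rows for every file listed under `keys`.

    `sample` is one half of a pair dict (with "ID" and per-key file lists);
    `resolve_path` is a URI-to-path callable such as
    FileProcessor.get_file_path or FileProcessor.parse_path_from_uri.
    """
    for key in keys:
        for p in sample.get(key, []):
            yield sample["ID"], resolve_path(p["location"])
# Usage sketch:
#   for name, path in _mapping_rows_for_sample(inp_port.db_value[0],
#                                              ("R1", "R2", "zR1", "zR2"),
#                                              FileProcessor.get_file_path):
#       sample_mapping += "%s\t%s\n" % (name, path)
#       files.append(path)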
def create_data_clinical_file(run_id_list):
    files = list()
    pipeline_names = set()
    pipeline_githubs = set()
    pipeline_versions = set()
    for run_id in run_id_list:
        argos_run = Run.objects.get(id=run_id)
        pipeline = argos_run.app
        pipeline_names.add(pipeline.name)
        pipeline_githubs.add(pipeline.github)
        pipeline_versions.add(pipeline.version)
        files = files + get_files_from_run(argos_run)
    data_clinical_content = generate_sample_data_content(
        files,
        pipeline_name=",".join(pipeline_names),
        pipeline_github=",".join(pipeline_githubs),
        pipeline_version=",".join(pipeline_versions),
    )
    data_clinical_content = data_clinical_content.strip()
    return {"class": "File", "basename": "sample_data_clinical.txt", "contents": data_clinical_content}
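# create_data_clinical_file returns a CWL-style File literal (class, basename,
# contents), so it can be embedded directly in a run's input payload instead
# of being staged on disk. A minimal sketch, assuming a pipeline input named
# "data_clinical" (the input name is an assumption, not from this module):
def attach_data_clinical(inputs, run_id_list):
    """Embed the generated data-clinical File literal into an input dict."""
    inputs["data_clinical"] = create_data_clinical_file(run_id_list)
    return inputs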
def get_jobs(self):
    files = FileRepository.filter(
        queryset=self.files,
        metadata={"requestId": self.request_id, "igocomplete": True},
    )
    argos_jobs = list()
    cnt_tumors = FileRepository.filter(
        queryset=self.files,
        metadata={"requestId": self.request_id, "tumorOrNormal": "Tumor", "igocomplete": True},
    ).count()
    if cnt_tumors == 0:
        cant_do = CantDoEvent(self.job_group_notifier_id).to_dict()
        send_notification.delay(cant_do)
        all_normals_event = SetLabelEvent(self.job_group_notifier_id, "all_normals").to_dict()
        send_notification.delay(all_normals_event)
        return argos_jobs

    data = list()
    for f in files:
        sample = dict()
        sample["id"] = f.file.id
        sample["path"] = f.file.path
        sample["file_name"] = f.file.file_name
        sample["metadata"] = f.metadata
        data.append(sample)

    files = list()
    samples = list()
    # group by igoId
    igo_id_group = dict()
    for sample in data:
        igo_id = sample["metadata"]["sampleId"]
        if igo_id not in igo_id_group:
            igo_id_group[igo_id] = list()
        igo_id_group[igo_id].append(sample)
    for igo_id in igo_id_group:
        samples.append(build_sample(igo_id_group[igo_id]))

    argos_inputs, error_samples = construct_argos_jobs(samples)
    number_of_inputs = len(argos_inputs)
    sample_pairing = ""
    sample_mapping = ""
    pipeline = self.get_pipeline_id()
    try:
        pipeline_obj = Pipeline.objects.get(id=pipeline)
    except Pipeline.DoesNotExist:
        # Leaving pipeline_obj unbound here would raise NameError when it is
        # read below, so fall back to None and guard its attribute access.
        pipeline_obj = None

    for i, job in enumerate(argos_inputs):
        tumor_sample_name = job["pair"][0]["ID"]
        # The tumor half of the pair carries FASTQ keys only; the normal half
        # may also carry pooled-normal BAMs.
        for key in ("R1", "R2", "zR1", "zR2"):
            for p in job["pair"][0][key]:
                filepath = FileProcessor.parse_path_from_uri(p["location"])
                if filepath not in files:
                    sample_mapping += "\t".join([tumor_sample_name, filepath]) + "\n"
                    files.append(filepath)
        normal_sample_name = job["pair"][1]["ID"]
        for key in ("R1", "R2", "zR1", "zR2", "bam"):
            for p in job["pair"][1][key]:
                filepath = FileProcessor.parse_path_from_uri(p["location"])
                if filepath not in files:
                    sample_mapping += "\t".join([normal_sample_name, filepath]) + "\n"
                    files.append(filepath)
        name = "ARGOS %s, %i of %i" % (self.request_id, i + 1, number_of_inputs)
        assay = job["assay"]
        pi = job["pi"]
        pi_email = job["pi_email"]
        sample_pairing += "\t".join([normal_sample_name, tumor_sample_name]) + "\n"
        tags = {
            "requestId": self.request_id,
            "sampleNameTumor": tumor_sample_name,
            "sampleNameNormal": normal_sample_name,
            "labHeadName": pi,
            "labHeadEmail": pi_email,
        }
        argos_jobs.append(RunCreator(app=pipeline, inputs=job, name=name, tags=tags))

    operator_run_summary = UploadAttachmentEvent(
        self.job_group_notifier_id, "sample_pairing.txt", sample_pairing
    ).to_dict()
    send_notification.delay(operator_run_summary)
    mapping_file_event = UploadAttachmentEvent(
        self.job_group_notifier_id, "sample_mapping.txt", sample_mapping
    ).to_dict()
    send_notification.delay(mapping_file_event)
    data_clinical = generate_sample_data_content(
        files,
        pipeline_name=pipeline_obj.name if pipeline_obj else "",
        pipeline_github=pipeline_obj.github if pipeline_obj else "",
        pipeline_version=pipeline_obj.version if pipeline_obj else "",
    )
    sample_data_clinical_event = UploadAttachmentEvent(
        self.job_group_notifier_id, "sample_data_clinical.txt", data_clinical
    ).to_dict()
    send_notification.delay(sample_data_clinical_event)
    self.evaluate_sample_errors(error_samples)
    self.summarize_pairing_info(argos_inputs)
    return argos_jobs
def get_jobs(self):
    argos_jobs = list()
    if self.request_id:
        files = FileRepository.filter(
            queryset=self.files,
            metadata={'requestId': self.request_id, 'igocomplete': True},
            filter_redact=True,
        )
        cnt_tumors = FileRepository.filter(
            queryset=self.files,
            metadata={'requestId': self.request_id, 'tumorOrNormal': 'Tumor', 'igocomplete': True},
            filter_redact=True,
        ).count()
    elif self.pairing:
        files, cnt_tumors = self.get_files_for_pairs()
    else:
        # Neither a request ID nor an explicit pairing was provided, so there
        # is nothing to build; bail out before files/cnt_tumors are read.
        return argos_jobs
    if cnt_tumors == 0:
        cant_do = CantDoEvent(self.job_group_notifier_id).to_dict()
        send_notification.delay(cant_do)
        all_normals_event = SetLabelEvent(self.job_group_notifier_id, 'all_normals').to_dict()
        send_notification.delay(all_normals_event)
        return argos_jobs

    data = list()
    for f in files:
        sample = dict()
        sample['id'] = f.file.id
        sample['path'] = f.file.path
        sample['file_name'] = f.file.file_name
        sample['metadata'] = f.metadata
        data.append(sample)

    files = list()
    samples = list()
    # group by igoId
    igo_id_group = dict()
    for sample in data:
        igo_id = sample['metadata']['sampleId']
        if igo_id not in igo_id_group:
            igo_id_group[igo_id] = list()
        igo_id_group[igo_id].append(sample)
    for igo_id in igo_id_group:
        samples.append(build_sample(igo_id_group[igo_id]))

    argos_inputs, error_samples = construct_argos_jobs(samples, self.pairing)
    number_of_inputs = len(argos_inputs)
    sample_pairing = ""
    sample_mapping = ""
    pipeline = self.get_pipeline_id()
    try:
        pipeline_obj = Pipeline.objects.get(id=pipeline)
    except Pipeline.DoesNotExist:
        # Leaving pipeline_obj unbound would raise NameError when it is read
        # below, so fall back to None and guard its attribute access.
        pipeline_obj = None

    check_for_duplicates = list()
    for i, job in enumerate(argos_inputs):
        tumor_sample_name = job['pair'][0]['ID']
        # The tumor half of the pair carries FASTQ keys only; the normal half
        # may also carry pooled-normal BAMs.
        for key in ('R1', 'R2', 'zR1', 'zR2'):
            for p in job['pair'][0][key]:
                filepath = FileProcessor.parse_path_from_uri(p['location'])
                file_str = "\t".join([tumor_sample_name, filepath]) + "\n"
                if file_str not in check_for_duplicates:
                    check_for_duplicates.append(file_str)
                    sample_mapping += file_str
                if filepath not in files:
                    files.append(filepath)
        normal_sample_name = job['pair'][1]['ID']
        for key in ('R1', 'R2', 'zR1', 'zR2', 'bam'):
            for p in job['pair'][1][key]:
                filepath = FileProcessor.parse_path_from_uri(p['location'])
                file_str = "\t".join([normal_sample_name, filepath]) + "\n"
                if file_str not in check_for_duplicates:
                    check_for_duplicates.append(file_str)
                    sample_mapping += file_str
                if filepath not in files:
                    files.append(filepath)
        name = "ARGOS %s, %i of %i" % (self.request_id, i + 1, number_of_inputs)
        assay = job['assay']
        pi = job['pi']
        pi_email = job['pi_email']
        sample_pairing += "\t".join([normal_sample_name, tumor_sample_name]) + "\n"
        argos_jobs.append((APIRunCreateSerializer(
            data={
                'app': pipeline,
                'inputs': job,
                'name': name,
                'tags': {
                    'requestId': self.request_id,
                    'sampleNameTumor': tumor_sample_name,
                    'sampleNameNormal': normal_sample_name,
                    'labHeadName': pi,
                    'labHeadEmail': pi_email,
                },
            }), job))

    operator_run_summary = UploadAttachmentEvent(
        self.job_group_notifier_id, 'sample_pairing.txt', sample_pairing
    ).to_dict()
    send_notification.delay(operator_run_summary)
    mapping_file_event = UploadAttachmentEvent(
        self.job_group_notifier_id, 'sample_mapping.txt', sample_mapping
    ).to_dict()
    send_notification.delay(mapping_file_event)
    data_clinical = generate_sample_data_content(
        files,
        pipeline_name=pipeline_obj.name if pipeline_obj else '',
        pipeline_github=pipeline_obj.github if pipeline_obj else '',
        pipeline_version=pipeline_obj.version if pipeline_obj else '',
    )
    sample_data_clinical_event = UploadAttachmentEvent(
        self.job_group_notifier_id, 'sample_data_clinical.txt', data_clinical
    ).to_dict()
    send_notification.delay(sample_data_clinical_event)
    self.evaluate_sample_errors(error_samples)
    self.summarize_pairing_info(argos_inputs)
    return argos_jobs
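# check_for_duplicates above is a list, so every membership test scans all
# rows accumulated so far. A set preserves the same dedup semantics with O(1)
# lookups; a small sketch (the helper name is illustrative, not from this
# module):
def append_unique_row(sample_mapping, seen_rows, files, sample_name, filepath):
    """Append one mapping row, skipping rows and paths already recorded."""
    row = "%s\t%s\n" % (sample_name, filepath)
    if row not in seen_rows:
        seen_rows.add(row)
        sample_mapping += row
    if filepath not in files:
        files.append(filepath)
    return sample_mapping  # strings are immutable, so hand back the new value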