def parse_manifest_into_cohort(filename):
    """Build a Cohort from a tab-separated manifest file.

    If *filename* starts with ``sb://`` the remainder is split into a
    project id and a file name; that file is copied into the current
    Context's project and downloaded to ``<tempdir>/manifest.txt``, which
    is then parsed instead.

    The first line of the manifest is treated as a header and skipped.
    Blank lines and lines whose first non-blank character is ``#`` are
    skipped as well. Every remaining line must hold exactly five
    tab-separated columns: patient id, sample id, read group, fq1, fq2.

    Args:
        filename: Local path of the manifest, or ``sb://<project>/<file>``.

    Returns:
        The Cohort populated with the patients, samples and lanes found
        in the manifest.

    Raises:
        ValueError: If a data line does not have exactly five columns.
    """
    logging.info("Reading manifest file: '%s'", filename)

    remote_prefix = "sb://"
    if filename.startswith(remote_prefix):
        project_id, file_name = os.path.split(filename[len(remote_prefix):])
        sbfile = FindOrCopyFilesByName(
            "CopyManifest",
            names=[file_name],
            from_project=SBApi().projects.get(project_id),
            to_project=Context().project,
        ).copied_files[0]
        # From here on, parse the downloaded local copy.
        filename = os.path.join(tempfile.gettempdir(), "manifest.txt")
        sbfile.download(path=filename)

    cohort = Cohort(manifest_file=filename)
    num_entries = 0
    with open(str(filename), "r") as f:
        for line_no, line in enumerate(f):
            if line_no == 0:
                # skip header
                continue

            record = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) and
            # comment lines carry no data.
            if not record or record.startswith("#"):
                continue

            fields = record.split("\t")
            if len(fields) != 5:
                raise ValueError(
                    "%s, line %d: expected 5 tab-separated columns, found %d"
                    % (filename, line_no + 1, len(fields))
                )
            patient_id, sample_id, read_group, fq1, fq2 = fields

            patient = cohort.get_patient_by_id(patient_id)
            if not patient:
                patient = Patient(patient_id)
                cohort.add_patient(patient)

            sample = patient.get_sample_by_id(sample_id)
            if not sample:
                sample = Sample(sample_id)
                patient.add_sample(sample)

            sample.add_lane(Lane(read_group=read_group, fq1=fq1, fq2=fq2))
            num_entries += 1

    logging.info(" %d manifest entries read.", num_entries)
    return cohort
def stage_reference_file(self, ref_name, file_path):
    """Copy a single reference file into this object's project.

    ``file_path`` is split with ``os.path.split``: everything before the
    last separator is used as the source project id, the last component
    as the file name to look up in that project.

    Args:
        ref_name: Label used to name the copy step (``CopyRef-<ref_name>``).
        file_path: ``<project id>/<file name>`` of the reference file.

    Returns:
        The first entry of the copy step's ``copied_files``.
    """
    source_project_id, basename = os.path.split(file_path)
    source_project = SBApi().projects.get(id=source_project_id)
    copy_step = FindOrCopyFilesByName(
        f"CopyRef-{ref_name}",
        names=[basename],
        from_project=source_project,
        to_project=self.project,
    )
    return copy_step.copied_files[0]
def stage_reference_files(self, ref_set="set1"):
    """Stage a set of configured reference files into ``self.project``.

    Each configured path is read as
    ``<project part 1>/<project part 2>/[<folders>/...]<file name>``:
    the first two ``/``-separated components form the source project id,
    the last component is the file name, and anything in between is the
    folder path inside the source project.

    Files are grouped by (project, folder) so that every source location
    is copied with a single FindOrCopyFilesByName call. Each copied file
    is stored in ``self.refs`` under its reference name.

    Args:
        ref_set: Key into ``self.config.reference_files`` selecting the
            mapping of reference name -> file path to stage.

    Raises:
        KeyError: If a copied file's name does not match any file name
            requested for its source location.
    """
    ref_files = self.config.reference_files[ref_set]

    # (project, folder) -> [(ref_name, file name), ...]
    # Tuple keys avoid joining and re-splitting on a separator character
    # that could itself appear in a folder name.
    sources = {}
    for ref_name, file_path in ref_files.items():
        parts = file_path.split("/")
        project = "/".join(parts[0:2])
        path = "/".join(parts[2:-1])
        sources.setdefault((project, path), []).append((ref_name, parts[-1]))

    for (project, path), items in sources.items():
        # The name -> ref-name lookup is scoped to one source location, so
        # equally named files from different locations cannot overwrite
        # each other. A list is kept per name so that several reference
        # names pointing at the same file are all populated.
        ref_names_by_file = {}
        for ref_name, name in items:
            ref_names_by_file.setdefault(name, []).append(ref_name)

        copied_files = FindOrCopyFilesByName(
            name_="CopyFiles-" + project + "|" + path,
            names=list(ref_names_by_file),
            from_project=SBApi().projects.get(id=project),
            from_path=path if path else None,
            to_project=self.project,
            to_path="reference_files",
        ).copied_files

        for staged in copied_files:
            for ref_name in ref_names_by_file[staged.name]:
                logging.info(
                    "Reference staged: %s -> %s", ref_name, staged.name
                )
                self.refs[ref_name] = staged
def stage_input_files_in_bulk(cohort):
    """Copy all input files to the execution project in one bulk request.

    Collects the ``fq1``/``fq2`` values of every lane of every sample in
    *cohort*, copies them from the project named by the Context's
    ``config.fastq_project`` into the Context's project with a single
    FindOrCopyFilesByName call (to save API calls), and then replaces each
    lane's ``fq1``/``fq2`` with the copied file whose ``name`` matches the
    previous value.

    Args:
        cohort: Object exposing ``samples``; each sample exposes ``lanes``
            whose items carry ``fq1`` and ``fq2``.

    Raises:
        KeyError: If a lane's ``fq1``/``fq2`` is not among the names of the
            copied files.
    """
    context = Context()
    source_project = SBApi().projects.get(id=context.config.fastq_project)

    fastq_names = []
    for sample in cohort.samples:
        for lane in sample.lanes:
            fastq_names.extend((lane.fq1, lane.fq2))

    copy_step = FindOrCopyFilesByName(
        names=fastq_names,
        from_project=source_project,
        to_project=context.project,
    )
    staged_by_name = {staged.name: staged for staged in copy_step.copied_files}

    # Swap the plain file names on each lane for the staged file objects.
    for sample in cohort.samples:
        for lane in sample.lanes:
            lane.fq1 = staged_by_name[lane.fq1]
            lane.fq2 = staged_by_name[lane.fq2]