Example #1
0
    def parse_manifest_into_cohort(filename):
        """Build a Cohort from a tab-separated manifest file.

        ``filename`` is either a local path or a reference of the form
        ``sb://<project id>/<file name>``. In the latter case the file is
        located (or copied) into the current context's project through
        ``FindOrCopyFilesByName`` and downloaded to ``manifest.txt`` in the
        system temp directory before it is parsed.

        The first line of the manifest is always treated as a header and
        skipped. Blank lines and lines whose first non-blank character is
        ``#`` are skipped as well. Every remaining line must contain exactly
        five tab-separated fields: patient id, sample id, read group, fq1
        and fq2. Patients and samples are created on first sight; each line
        adds one Lane to its sample.

        Returns:
            The populated Cohort.

        Raises:
            ValueError: if a data line does not have exactly five fields.
        """

        logging.info(f"Reading manifest file: '{filename}'")

        if filename.startswith("sb://"):
            # Everything after the scheme is "<project id>/<file name>".
            project_id, file_name = os.path.split(filename[len("sb://"):])
            sbfile = FindOrCopyFilesByName(
                "CopyManifest",
                names=[file_name],
                from_project=SBApi().projects.get(project_id),
                to_project=Context().project,
            ).copied_files[0]
            # From here on, parse the downloaded local copy.
            filename = os.path.join(tempfile.gettempdir(), "manifest.txt")
            sbfile.download(path=filename)

        cohort = Cohort(manifest_file=filename)

        num_entries = 0
        with open(str(filename), "r") as f:
            # Iterate the file lazily instead of materialising every line.
            for line_no, line in enumerate(f, start=1):

                if line_no == 1:  # skip header
                    continue

                stripped = line.strip()

                # Blank lines (e.g. a trailing newline at end of file) and
                # comment lines carry no manifest entry.
                if not stripped or stripped.startswith("#"):
                    continue

                fields = stripped.split("\t")
                if len(fields) != 5:
                    raise ValueError(
                        f"{filename}:{line_no}: expected 5 tab-separated "
                        f"fields, got {len(fields)}"
                    )
                patient_id, sample_id, read_group, fq1, fq2 = fields

                patient = cohort.get_patient_by_id(patient_id)
                if not patient:
                    patient = Patient(patient_id)
                    cohort.add_patient(patient)

                sample = patient.get_sample_by_id(sample_id)
                if not sample:
                    sample = Sample(sample_id)
                    patient.add_sample(sample)

                sample.add_lane(Lane(read_group=read_group, fq1=fq1, fq2=fq2))

                num_entries += 1

        logging.info("  %d manifest entries read.", num_entries)

        return cohort
Example #2
0
    def stage_reference_file(self, ref_name, file_path):
        """Stage a single reference file into this object's project.

        ``file_path`` is split at its last path separator: the leading part
        is used as the id of the source project, the trailing part as the
        name of the file to find or copy. ``ref_name`` only labels the copy
        step (``CopyRef-<ref_name>``).

        Returns the first entry of the step's ``copied_files``.
        """
        source_project_id, basename = os.path.split(file_path)
        source_project = SBApi().projects.get(id=source_project_id)

        copy_step = FindOrCopyFilesByName(
            f"CopyRef-{ref_name}",
            names=[basename],
            from_project=source_project,
            to_project=self.project,
        )
        return copy_step.copied_files[0]
Example #3
0
    def stage_reference_files(self):
        """Stage every reference file of set ``"set1"`` into ``self.project``.

        Reads ``self.config.reference_files["set1"]``, a mapping of reference
        name to a ``/``-separated path. The first two path components are
        taken as the source project id, the last component as the file name
        and anything in between as the folder path inside that project.

        Files are grouped by (project, folder) so that each source location
        needs only one ``FindOrCopyFilesByName`` step. Every copied file is
        stored in ``self.refs`` under its reference name(s); copied files are
        matched back to reference names by file name within their own
        location, so equal file names in different locations do not clash.

        Raises:
            KeyError: if a copied file has a name that was not requested for
                its source location.
        """
        ref_files = self.config.reference_files["set1"]

        # (project, folder) -> {file name -> [reference names]}.
        # Keeping the name lookup per location avoids one location's file
        # being registered under another location's reference name.
        sources = {}
        for ref_name, file_path in ref_files.items():
            parts = file_path.split("/")
            project = "/".join(parts[0:2])
            path = "/".join(parts[2:-1])
            name = parts[-1]

            by_name = sources.setdefault((project, path), {})
            by_name.setdefault(name, []).append(ref_name)

        for (project, path), by_name in sources.items():
            copied_files = FindOrCopyFilesByName(
                name_=f"CopyFiles-{project}|{path}",
                names=list(by_name),
                from_project=SBApi().projects.get(id=project),
                from_path=path if path else None,
                to_project=self.project,
                to_path="reference_files",
            ).copied_files

            for copied in copied_files:
                # Several reference names may point at the same file.
                for ref_name in by_name[copied.name]:
                    logging.info(
                        "Reference staged: %s -> %s", ref_name, copied.name
                    )
                    self.refs[ref_name] = copied
Example #4
0
    def stage_input_files_in_bulk(cohort):
        """Copy all input files to execution project in bulk to save API calls.

        Collects ``fq1`` and ``fq2`` of every lane of every sample in
        ``cohort``, stages them with a single ``FindOrCopyFilesByName`` call
        from the project named by the context's ``config.fastq_project`` into
        the context's project, and then replaces each lane's ``fq1``/``fq2``
        with the staged file whose ``name`` equals the previous value.
        """
        context = Context()
        source_project = SBApi().projects.get(id=context.config.fastq_project)

        # Flatten to [fq1, fq2, fq1, fq2, ...] in sample/lane order.
        fastq_names = []
        for sample in cohort.samples:
            for lane in sample.lanes:
                fastq_names.append(lane.fq1)
                fastq_names.append(lane.fq2)

        copy_step = FindOrCopyFilesByName(
            names=fastq_names,
            from_project=source_project,
            to_project=context.project,
        )

        # Index staged files by name for the substitution pass below.
        staged_by_name = {}
        for staged in copy_step.copied_files:
            staged_by_name[staged.name] = staged

        for sample in cohort.samples:
            for lane in sample.lanes:
                lane.fq1 = staged_by_name[lane.fq1]
                lane.fq2 = staged_by_name[lane.fq2]