コード例 #1
0
    def prepare_reference_genome(self):
        """
        Queue the jobs that stage the reference genome and its companion files,
        and record the resulting paths in self.reference_data.

        Jobs are added in this order: Gunzip (self.input_reference_sequence into
        "<outdir>/genome/"), a Copy of the unzipped genome into "<outdir>/bwa/",
        BwaIndex on that copy, PicardCreateSequenceDictionary, SamtoolsFaidx,
        GenerateChrSizes (fed with the ".fai" path) and a Copy of
        self.qdnaseq_background into "<outdir>/genome/".

        Keys written to self.reference_data: 'reference_genome',
        'reference_dict', 'chrsizes', 'bwaIndex' and 'qdnaseq_background'.
        """
        genome_unzipped = stripsuffix(
            os.path.basename(self.input_reference_sequence), ".gz")

        gunzip_ref = Gunzip()
        gunzip_ref.input = self.input_reference_sequence
        gunzip_ref.output = "{}/genome/{}".format(self.outdir, genome_unzipped)
        self.add(gunzip_ref)

        # Prefix shared by the .dict and .chrsizes.txt outputs: drop ".fasta"
        # only when it is the trailing extension. str.replace(".fasta", "")
        # would also delete the substring from directory names or from the
        # middle of the file name, pointing those outputs at a different path.
        fasta_extension = ".fasta"
        genome_prefix = gunzip_ref.output
        if genome_prefix.endswith(fasta_extension):
            genome_prefix = genome_prefix[:-len(fasta_extension)]

        copy_ref_to_bwa = Copy(input_file=gunzip_ref.output,
                               output_file="{}/bwa/{}".format(
                                   self.outdir,
                                   os.path.basename(gunzip_ref.output)))
        self.add(copy_ref_to_bwa)

        bwa_index = BwaIndex()
        bwa_index.input_fasta = copy_ref_to_bwa.output
        bwa_index.output = copy_ref_to_bwa.output + ".bwt"
        bwa_index.algorithm = "bwtsw"
        self.add(bwa_index)

        create_dict = PicardCreateSequenceDictionary()
        create_dict.input = gunzip_ref.output
        create_dict.output_dict = genome_prefix + ".dict"
        self.add(create_dict)

        samtools_faidx = SamtoolsFaidx()
        samtools_faidx.input_fasta = gunzip_ref.output
        samtools_faidx.output = gunzip_ref.output + ".fai"
        self.add(samtools_faidx)

        create_chrsizes = GenerateChrSizes()
        create_chrsizes.input_fai = samtools_faidx.output
        create_chrsizes.output = genome_prefix + ".chrsizes.txt"
        self.add(create_chrsizes)

        copy_qdnaseq_bg = Copy(input_file=self.qdnaseq_background,
                               output_file="{}/genome/{}".format(
                                   self.outdir,
                                   os.path.basename(self.qdnaseq_background)))
        self.add(copy_qdnaseq_bg)

        self.reference_data['reference_genome'] = gunzip_ref.output
        self.reference_data['reference_dict'] = create_dict.output_dict
        self.reference_data['chrsizes'] = create_chrsizes.output
        # Note: the fasta path under bwa/ is registered here, not the ".bwt"
        # path assigned to bwa_index.output.
        self.reference_data['bwaIndex'] = bwa_index.input_fasta
        self.reference_data['qdnaseq_background'] = copy_qdnaseq_bg.output
コード例 #2
0
    def prepare_sveffect_regions(self):
        """
        Queue a Copy job for each of the ar/ts/fusion region BED files.

        Each "<genome_resources>/<name>.bed" is copied to
        "<outdir>/intervals/<name>.bed", and the Copy job's output attribute is
        stored in self.reference_data under <name>.
        """
        region_keys = ("ar_regions", "ts_regions", "fusion_regions")
        for region_key in region_keys:
            source_bed = "{}/{}.bed".format(self.genome_resources, region_key)
            destination_bed = "{}/intervals/{}".format(
                self.outdir, os.path.basename(source_bed))

            copy_job = Copy(input_file=source_bed,
                            output_file=destination_bed)

            # Register the destination before queueing the job.
            self.reference_data[region_key] = copy_job.output
            self.add(copy_job)
コード例 #3
0
    def prepare_msings(self, filename_base, capture_name):
        """
        Register the msings parameter files that exist for one capture.

        For each of the extensions "baseline", "bed" and "msi_intervals", the file
        "<filename_base>.msings.<extension>" is looked up on disk. When it exists, a
        Copy job into "<outdir>/intervals/targets/" is added and its output is stored
        under the key "msings-<extension>"; otherwise None is stored under that key.

        :param filename_base: String path prefix shared by the msings parameter files.
        :param capture_name: String capture name, used as key into
            self.reference_data['targets']; that entry must already exist. Note: this
            information is also contained in filename_base; should be refactored.
        """

        for extension in ("baseline", "bed", "msi_intervals"):
            source_path = filename_base + ".msings." + extension

            # Stays None when the optional parameter file is absent.
            registered_path = None
            if os.path.exists(source_path):
                destination_path = "{}/intervals/targets/{}".format(
                    self.outdir, os.path.basename(source_path))
                copy_job = Copy(input_file=source_path,
                                output_file=destination_path)
                self.add(copy_job)
                registered_path = copy_job.output

            self.reference_data['targets'][capture_name][
                'msings-' + extension] = registered_path
コード例 #4
0
    def prepare_cnvkit(self, cnv_kit_ref_filename):
        """
        Queue a copy of one cnvkit reference file and register it in
        self.reference_data.

        The first three dot-separated fields of the file name are used as capture
        name, library kit name and sample type. The Copy job's output is stored at
        self.reference_data['targets'][capture][ref_type][library_kit][sample_type],
        where ref_type is "cnvkit-ref" for names ending in "cnn" and "cnvkit-fix"
        otherwise. The entry for the capture must already exist.

        :param cnv_kit_ref_filename: String file name (not a full path) of a cnvkit
            reference file located in "<genome_resources>/target_intervals/".
        """

        source_path = "{}/target_intervals/{}".format(
            self.genome_resources, cnv_kit_ref_filename)

        # capture, library kit and sample type are the leading name fields.
        name_fields = cnv_kit_ref_filename.split(".")[:3]

        destination_path = "{}/intervals/targets/{}".format(
            self.outdir, os.path.basename(source_path))
        copy_job = Copy(input_file=source_path, output_file=destination_path)
        self.add(copy_job)

        capture_name = name_fields[0]
        library_kit_name = name_fields[1]
        sample_type = name_fields[2]

        if cnv_kit_ref_filename.endswith("cnn"):
            ref_type = "cnvkit-ref"
        else:
            ref_type = "cnvkit-fix"

        # Create the intermediate ref_type / library-kit levels on first use,
        # then store the copied file under its sample type.
        capture_entry = self.reference_data['targets'][capture_name]
        per_library_kit = capture_entry.setdefault(ref_type, {})
        per_sample_type = per_library_kit.setdefault(library_kit_name, {})
        per_sample_type[sample_type] = copy_job.output
コード例 #5
0
    def prepare_intervals(self):
        """
        Queue the per-capture interval jobs and fill self.reference_data['targets'].

        One MsiSensorScan job is added for the reference genome. Then, for every
        "*.interval_list" file in "<genome_resources>/target_intervals/", the
        capture name is the file name without ".interval_list" and these jobs are
        added: a Copy into "<outdir>/intervals/targets/", SlopIntervalList,
        IntervalListToBed and IntersectMsiSites. Optional msings, blacklist
        (".blacklist.bed") and PureCN (".purecn.txt") files next to the interval
        list are copied when present and registered as None otherwise.

        Keys written per capture: 'msings-*' (via prepare_msings), 'blacklist-bed',
        'purecn_targets', 'targets-interval_list',
        'targets-interval_list-slopped20', 'targets-bed-slopped20' and 'msisites'.

        Finally every file in the same directory ending in ".cnn" or containing
        "cnvkit-fix" is passed to prepare_cnvkit.
        """
        def targets_destination(source_path):
            # Destination inside the targets directory, keeping the file name.
            return "{}/intervals/targets/{}".format(
                self.outdir, os.path.basename(source_path))

        self.reference_data['targets'] = {}
        target_intervals_dir = "{}/target_intervals/".format(
            self.genome_resources)
        interval_list_names = [
            name for name in os.listdir(target_intervals_dir)
            if name.endswith(".interval_list")
        ]

        msi_scan = MsiSensorScan()
        msi_scan.input_fasta = self.reference_data['reference_genome']
        msi_scan.homopolymers_only = True
        msi_scan.output = "{}/intervals/msisensor-microsatellites.tsv".format(
            self.outdir)
        self.add(msi_scan)

        for interval_list_name in interval_list_names:
            source_path = "{}/target_intervals/{}".format(
                self.genome_resources, interval_list_name)
            logging.debug("Parsing intervals file {}".format(source_path))
            capture_name = stripsuffix(interval_list_name, ".interval_list")
            self.reference_data['targets'][capture_name] = {}
            capture_entry = self.reference_data['targets'][capture_name]

            copy_job = Copy(input_file=source_path,
                            output_file=targets_destination(source_path))
            self.add(copy_job)

            slop_job = SlopIntervalList()
            slop_job.input = copy_job.output
            slop_job.output = stripsuffix(
                copy_job.output, ".interval_list") + ".slopped20.interval_list"
            self.add(slop_job)

            to_bed_job = IntervalListToBed()
            to_bed_job.input = slop_job.output
            to_bed_job.output = stripsuffix(
                slop_job.output, ".interval_list") + ".bed"
            self.add(to_bed_job)

            intersect_job = IntersectMsiSites()
            intersect_job.input_msi_sites = msi_scan.output
            intersect_job.target_bed = to_bed_job.output
            intersect_job.output_msi_sites = stripsuffix(
                to_bed_job.output, ".bed") + ".msisites.tsv"
            self.add(intersect_job)

            # Path prefix shared by the optional companion files.
            source_prefix = stripsuffix(source_path, ".interval_list")

            self.prepare_msings(source_prefix, capture_name)

            capture_entry['blacklist-bed'] = None
            blacklist_source = source_prefix + ".blacklist.bed"
            if os.path.exists(blacklist_source):
                blacklist_job = Copy(
                    input_file=blacklist_source,
                    output_file=targets_destination(blacklist_source))
                self.add(blacklist_job)
                capture_entry['blacklist-bed'] = blacklist_job.output

            purecn_source = source_prefix + ".purecn.txt"
            purecn_output = None
            if os.path.exists(purecn_source):
                purecn_job = Copy(
                    input_file=purecn_source,
                    output_file=targets_destination(purecn_source))
                self.add(purecn_job)
                purecn_output = purecn_job.output
            capture_entry['purecn_targets'] = purecn_output

            # Assigned one by one so the key order matches the job order.
            for key, value in (
                    ('targets-interval_list', copy_job.output),
                    ('targets-interval_list-slopped20', slop_job.output),
                    ('targets-bed-slopped20', to_bed_job.output),
                    ('msisites', intersect_job.output_msi_sites),
            ):
                capture_entry[key] = value

        # Find all .cnn / cnvkit-fix files and copy + register them for cnvkit:
        cnvkit_names = [
            name for name in os.listdir(target_intervals_dir)
            if name.endswith(".cnn") or "cnvkit-fix" in name
        ]
        for cnvkit_name in cnvkit_names:
            self.prepare_cnvkit(cnvkit_name)
コード例 #6
0
    def prepare_variants(self):
        """
        Queue the jobs that place the variant resources in "<outdir>/variants/"
        and record their outputs in self.reference_data.

        CurlSplitAndLeftAlign jobs are added for dbSNP, COSMIC, ClinVar, ExAC,
        ICGC and the SweGene common variants (COSMIC and SweGene are local files
        addressed with a "file://" prefix). The dbSNP download is flagged
        is_intermediate and fed into a VcfFilter job whose output is what gets
        registered. Plain Copy jobs are added for the 1000 Genomes VCF, the
        Mills and 1000G gold standard, BRCA Exchange and OncoKB files.

        Keys written to self.reference_data: 'dbSNP', 'cosmic', 'exac',
        'clinvar', 'icgc', 'swegene_common', '1KG',
        'Mills_and_1KG_gold_standard', 'brca_exchange' and 'oncokb'.
        """
        def variants_path(file_name):
            # Location of a file inside the variants output directory.
            return "{}/variants/{}".format(self.outdir, file_name)

        def add_curl_job(remote, output_name, intermediate=False):
            # Configure and queue one CurlSplitAndLeftAlign job.
            job = CurlSplitAndLeftAlign()
            job.input_reference_sequence = self.reference_data[
                'reference_genome']
            job.input_reference_sequence_fai = self.reference_data[
                'reference_genome'] + ".fai"
            job.remote = remote
            job.output = variants_path(output_name)
            if intermediate:
                job.is_intermediate = True
            self.add(job)
            return job

        def add_copy_job(source):
            # Queue a Copy of source into the variants directory, same name.
            job = Copy(input_file=source,
                       output_file=variants_path(os.path.basename(source)))
            self.add(job)
            return job

        dbsnp_download = add_curl_job(self.dbsnp_remote,
                                      os.path.basename(self.dbsnp_remote),
                                      intermediate=True)

        dbsnp_filter = VcfFilter()
        dbsnp_filter.input = dbsnp_download.output
        dbsnp_filter.filter = "\"! ( SAO = 3 | SAO = 2 )\""
        dbsnp_filter.output = "{}/variants/dbsnp142-germline-only.vcf.gz".format(
            self.outdir)
        self.add(dbsnp_filter)

        cosmic = add_curl_job("file://" + self.cosmic_vcf,
                              os.path.basename(self.cosmic_vcf))
        clinvar = add_curl_job(self.clinvar_remote,
                               os.path.basename(self.clinvar_remote))
        exac = add_curl_job(self.exac_remote,
                            os.path.basename(self.exac_remote))
        icgc = add_curl_job(
            self.icgc_somatic_remote,
            "icgc_release_20_simple_somatic_mutation.aggregated.vcf.gz")
        swegene = add_curl_job("file://" + self.swegene_common_vcf,
                               os.path.basename(self.swegene_common_vcf))

        thousand_genome = add_copy_job(self.thousand_genome_vcf)
        mills_and_1000g = add_copy_job(self.mills_and_1000g_gold_standard)
        brca_exchange = add_copy_job(self.brca_exchange)
        oncokb = add_copy_job(self.oncokb)

        self.reference_data['dbSNP'] = dbsnp_filter.output
        self.reference_data['cosmic'] = cosmic.output
        self.reference_data['exac'] = exac.output
        self.reference_data['clinvar'] = clinvar.output
        self.reference_data['icgc'] = icgc.output
        self.reference_data['swegene_common'] = swegene.output
        self.reference_data['1KG'] = thousand_genome.output
        self.reference_data[
            'Mills_and_1KG_gold_standard'] = mills_and_1000g.output
        self.reference_data['brca_exchange'] = brca_exchange.output
        self.reference_data['oncokb'] = oncokb.output