Ejemplo n.º 1
0
    def prepare_genes(self):
        """Queue the jobs that fetch the Ensembl GTF and derive gene files.

        Jobs are added in this order: download of ``self.ensembl_gtf_remote``
        into ``<outdir>/genes/``, gunzip of the download, a
        FilterGTFChromosomes pass over the unpacked GTF, and then two jobs
        that both read the filtered GTF: GTF2GenePred and FilterGTFGenes.
        The Ensembl version and the three resulting paths are recorded in
        ``self.reference_data``.
        """
        # Download step; flagged as an intermediate product.
        download = Curl()
        download.remote = self.ensembl_gtf_remote
        remote_basename = os.path.basename(self.ensembl_gtf_remote)
        download.output = "{}/genes/{}".format(self.outdir, remote_basename)
        download.jobname = "curl-ensembl-gtf"
        download.is_intermediate = True
        self.add(download)

        # Unpack step; output is the download path without ".gz", also
        # flagged as intermediate.
        unpack = Gunzip()
        unpack.input = download.output
        unpack.output = stripsuffix(download.output, ".gz")
        unpack.is_intermediate = True
        self.add(unpack)

        # Chromosome filter: <name>.gtf -> <name>.filtered.gtf
        chrom_filter = FilterGTFChromosomes()
        chrom_filter.input = unpack.output
        unpacked_stem = stripsuffix(unpack.output, ".gtf")
        chrom_filter.output = unpacked_stem + ".filtered.gtf"
        self.add(chrom_filter)

        # genePred conversion of the filtered GTF.
        to_genepred = GTF2GenePred()
        to_genepred.input = chrom_filter.output
        genepred_stem = stripsuffix(chrom_filter.output, ".gtf")
        to_genepred.output = genepred_stem + ".genepred"
        self.add(to_genepred)

        # Genes-only GTF, also derived from the filtered GTF.
        genes_only = FilterGTFGenes()
        genes_only.input = chrom_filter.output
        genes_only_stem = stripsuffix(chrom_filter.output, ".gtf")
        genes_only.output = genes_only_stem + ".genes-only.gtf"
        self.add(genes_only)

        # Register the results under the keys the original code uses.
        for key, value in (
                ('ensemblVersion', self.ensembl_version),
                ('genesGtf', chrom_filter.output),
                ('genesGenePred', to_genepred.output),
                ('genesGtfGenesOnly', genes_only.output),
        ):
            self.reference_data[key] = value
Ejemplo n.º 2
0
    def prepare_reference_genome(self):
        """Queue the jobs that unpack and index the reference genome.

        Jobs are added in this order: gunzip of
        ``self.input_reference_sequence`` into ``<outdir>/genome/``, a copy
        of the unpacked FASTA into ``<outdir>/bwa/``, a BWA index on that
        copy, a Picard sequence dictionary, a samtools faidx index, a
        chromosome-sizes file generated from the ``.fai``, and a copy of
        ``self.qdnaseq_background`` into ``<outdir>/genome/``. The resulting
        paths are recorded in ``self.reference_data``.
        """
        genome_unzipped = stripsuffix(
            os.path.basename(self.input_reference_sequence), ".gz")

        gunzip_ref = Gunzip()
        gunzip_ref.input = self.input_reference_sequence
        gunzip_ref.output = "{}/genome/{}".format(self.outdir, genome_unzipped)
        self.add(gunzip_ref)

        # BWA gets its own copy of the FASTA under <outdir>/bwa/.
        copy_ref_to_bwa = Copy(input_file=gunzip_ref.output,
                               output_file="{}/bwa/{}".format(
                                   self.outdir,
                                   os.path.basename(gunzip_ref.output)))
        self.add(copy_ref_to_bwa)

        bwa_index = BwaIndex()
        bwa_index.input_fasta = copy_ref_to_bwa.output
        bwa_index.output = copy_ref_to_bwa.output + ".bwt"
        bwa_index.algorithm = "bwtsw"
        self.add(bwa_index)

        # Drop only a trailing ".fasta". The previous str.replace(".fasta", "")
        # removed every occurrence anywhere in the path, so an outdir or file
        # name containing ".fasta" mid-path produced a mangled location.
        # NOTE(review): relies on stripsuffix returning the string unchanged
        # when the suffix is absent -- confirm against its definition.
        genome_stem = stripsuffix(gunzip_ref.output, ".fasta")

        create_dict = PicardCreateSequenceDictionary()
        create_dict.input = gunzip_ref.output
        create_dict.output_dict = genome_stem + ".dict"
        self.add(create_dict)

        samtools_faidx = SamtoolsFaidx()
        samtools_faidx.input_fasta = gunzip_ref.output
        samtools_faidx.output = gunzip_ref.output + ".fai"
        self.add(samtools_faidx)

        # Chromosome sizes are generated from the .fai index.
        create_chrsizes = GenerateChrSizes()
        create_chrsizes.input_fai = samtools_faidx.output
        create_chrsizes.output = genome_stem + ".chrsizes.txt"
        self.add(create_chrsizes)

        copy_qdnaseq_bg = Copy(input_file=self.qdnaseq_background,
                               output_file="{}/genome/{}".format(
                                   self.outdir,
                                   os.path.basename(self.qdnaseq_background)))
        self.add(copy_qdnaseq_bg)

        self.reference_data['reference_genome'] = gunzip_ref.output
        self.reference_data['reference_dict'] = create_dict.output_dict
        self.reference_data['chrsizes'] = create_chrsizes.output
        # 'bwaIndex' records the FASTA copy under <outdir>/bwa/, not the
        # ".bwt" output path of the index job.
        self.reference_data['bwaIndex'] = bwa_index.input_fasta
        self.reference_data['qdnaseq_background'] = copy_qdnaseq_bg.output
Ejemplo n.º 3
0
    def prepare_intervals(self):
        """Queue target-interval jobs per capture and register their outputs.

        Lists ``<genome_resources>/target_intervals/`` and, for every
        ``*.interval_list`` file found there, adds jobs that copy it into
        ``<outdir>/intervals/targets/``, produce a ``.slopped20`` interval
        list, convert that to BED and intersect it with the microsatellite
        sites from a single MsiSensorScan job over the reference genome.
        Optional ``.blacklist.bed`` and ``.purecn.txt`` companions are copied
        when they exist on disk. Paths are stored under
        ``self.reference_data['targets'][<capture name>]``.

        Finally, every directory entry that ends with ``.cnn`` or contains
        ``cnvkit-fix`` is handed to ``self.prepare_cnvkit``.
        """

        def staged_path(source):
            # Destination inside <outdir>/intervals/targets/ for a source file.
            return "{}/intervals/targets/{}".format(
                self.outdir, os.path.basename(source))

        self.reference_data['targets'] = {}
        target_intervals_dir = "{}/target_intervals/".format(
            self.genome_resources)

        # Collect the interval lists up front, before any job is queued.
        interval_list_names = []
        for entry in os.listdir(target_intervals_dir):
            if entry.endswith(".interval_list"):
                interval_list_names.append(entry)

        # One microsatellite scan of the reference genome, reused by every
        # capture below.
        msi_scan = MsiSensorScan()
        msi_scan.input_fasta = self.reference_data['reference_genome']
        msi_scan.homopolymers_only = True
        msi_scan.output = "{}/intervals/msisensor-microsatellites.tsv".format(
            self.outdir)
        self.add(msi_scan)

        for list_name in interval_list_names:
            source_path = "{}/target_intervals/{}".format(
                self.genome_resources, list_name)
            logging.debug("Parsing intervals file {}".format(source_path))
            capture_name = stripsuffix(list_name, ".interval_list")
            self.reference_data['targets'][capture_name] = {}

            # Stage the interval list inside the output tree.
            stage_intervals = Copy(input_file=source_path,
                                   output_file=staged_path(source_path))
            self.add(stage_intervals)

            # <name>.interval_list -> <name>.slopped20.interval_list
            slop = SlopIntervalList()
            slop.input = stage_intervals.output
            staged_stem = stripsuffix(stage_intervals.output, ".interval_list")
            slop.output = staged_stem + ".slopped20.interval_list"
            self.add(slop)

            # Slopped interval list -> BED
            to_bed = IntervalListToBed()
            to_bed.input = slop.output
            to_bed.output = stripsuffix(slop.output, ".interval_list") + ".bed"
            self.add(to_bed)

            # Intersect the scanned microsatellite sites with the slopped BED.
            msi_intersect = IntersectMsiSites()
            msi_intersect.input_msi_sites = msi_scan.output
            msi_intersect.target_bed = to_bed.output
            msi_intersect.output_msi_sites = stripsuffix(
                to_bed.output, ".bed") + ".msisites.tsv"
            self.add(msi_intersect)

            # Path of the source interval list without its extension; the
            # optional companion files below share this prefix.
            source_stem = stripsuffix(source_path, ".interval_list")
            self.prepare_msings(source_stem, capture_name)

            # Look the entry up only after prepare_msings has run, so that
            # whatever it stored for this capture is what gets extended.
            capture_entry = self.reference_data['targets'][capture_name]

            # Optional blacklist BED next to the source interval list.
            capture_entry['blacklist-bed'] = None
            blacklist_source = source_stem + ".blacklist.bed"
            if os.path.exists(blacklist_source):
                stage_blacklist = Copy(
                    input_file=blacklist_source,
                    output_file=staged_path(blacklist_source))
                self.add(stage_blacklist)
                capture_entry['blacklist-bed'] = stage_blacklist.output

            # Optional PureCN targets file next to the source interval list.
            purecn_source = source_stem + ".purecn.txt"
            if not os.path.exists(purecn_source):
                capture_entry['purecn_targets'] = None
            else:
                stage_purecn = Copy(input_file=purecn_source,
                                    output_file=staged_path(purecn_source))
                self.add(stage_purecn)
                capture_entry['purecn_targets'] = stage_purecn.output

            capture_entry['targets-interval_list'] = stage_intervals.output
            capture_entry['targets-interval_list-slopped20'] = slop.output
            capture_entry['targets-bed-slopped20'] = to_bed.output
            capture_entry['msisites'] = msi_intersect.output_msi_sites

        # Find all .cnn files and copy + register them for use in cnv kit:
        for entry in os.listdir(target_intervals_dir):
            if entry.endswith(".cnn") or "cnvkit-fix" in entry:
                self.prepare_cnvkit(entry)