Ejemplos de log_command en Python, ejemplos de utils.log_command.log_command en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

 def mutect_select_variant_other(self, mutect_output):
     """
     Select the variants that are neither SNPs nor INDELs from a Mutect VCF.

     A GATK4 ``SelectVariants`` command excluding both the INDEL and the SNP
     types is built, printed and passed to ``log_command``.

     Parameters
     ----------
     mutect_output: str
         Path of the VCF file to filter. Only its last "/"-separated
         component is used to name the result.

     """
     # The result name carries no directory part, only the input's base name.
     other_vcf = "OTHER_" + mutect_output.split("/")[-1]
     command = " ".join([
         self.get_paths.gatk4_path,
         "SelectVariants -R",
         self.ref_dir,
         "-V",
         mutect_output,
         "--select-type-to-exclude INDEL --select-type-to-exclude SNP -O",
         other_vcf,
     ])
     print(command)
     log_command(command, "Mutect2", self.threads, "Select OTHER Variants")
     print(other_vcf)

Ejemplo n.º 2

0

Mostrar archivo

    def qc_trim(self):
        """
        Trim every paired-end fastq group with fastp.

        For each lane / read-number combination found in ``self.info_dict`` the
        matching R1 and R2 entries of ``self.fastq_list`` are looked up, a
        fastp command is built and passed to ``log_command``, and the names of
        the report files (html, json) and trimmed fastq files are appended to
        ``self.file_list``.

        The step stays best effort: an ordinary error (missing key, no
        matching fastq file, ...) stops the loop and is reported on stdout
        instead of being raised. KeyboardInterrupt and SystemExit are no
        longer swallowed.

        """
        try:
            for lane in self.info_dict["Lanes"]:
                for seq_no in self.info_dict["Number_of_seq"]:
                    r1_pattern = re.compile(".*" + lane + "_R1_" + seq_no)
                    read1 = [
                        name + ".fastq.gz" for name in self.fastq_list
                        if r1_pattern.match(name)
                    ]

                    r2_pattern = re.compile(".*" + lane + "_R2_" + seq_no)
                    read2 = [
                        name + ".fastq.gz" for name in self.fastq_list
                        if r2_pattern.match(name)
                    ]

                    # Base name shared by the html and json reports
                    gene_origin = self.info_dict["Sample_ID"][0] + "_" + \
                        self.info_dict["Index"][0] + "_" + lane + "_" + seq_no

                    # Only the first match of each pattern is used; an empty
                    # match list raises IndexError, handled below.
                    command = self.paths.fastp + " -w " + self.thread + " --in1 " + read1[0] + " --in2 " + \
                              read2[0] + " --out1 trim_" + read1[0] + " --out2 trim_" + read2[0] + \
                              " --html " + gene_origin + ".html --json " + gene_origin + ".json"

                    log_command(command, "Fastp Trim", self.thread,
                                "Quality Control")
                    self.file_list.append(gene_origin + ".html")
                    self.file_list.append(gene_origin + ".json")
                    self.file_list.append("trim_" + str(read1[0]))
                    self.file_list.append("trim_" + str(read2[0]))
                    print(
                        "---------------------------------------------------")
                    print(self.file_list)
        except Exception as err:
            # Previously a bare "except: pass" hid every failure, including
            # Ctrl-C. Keep going on ordinary errors, but make them visible.
            print("qc_trim stopped early: " + repr(err))

Ejemplo n.º 3

0

Mostrar archivo

Archivo: variant_annotation.py Proyecto: MBaysanLab/GenomicsPipeline

    def annovar_for_strelka(self, input_fs):
        """
        Prepare Strelka VCF files for ANNOVAR and build the hg38 annotation command.

        NOTE(review): this definition looks truncated in the visible source. The
        annotation ``command`` is only printed (never passed to ``log_command``)
        and ``output_fs`` is assigned but not used. Confirm against the full file.

        Parameters
        ----------
        input_fs: list
            File names relative to ``self.working_directory``. Any value whose
            type is not exactly ``list`` skips all of the work shown here.

        """
        print(input_fs)
        if type(input_fs) == list:
            for input_f in input_fs:
                input_file = self.working_directory + "/" + input_f
                # Derived names: "Strelka" -> "Strelka2", and ".vcf" -> ".txt" for the header-free copy
                header_f = input_f.replace("Strelka", "Strelka2")
                header_f1 = header_f.replace(".vcf", ".txt")
                header_output_file = self.working_directory + "/" + header_f1
                # Drop the "##" lines and print columns 1, 2, 2, 4..11 (column 2 twice, column 3 skipped),
                # tab separated, into header_output_file. ".format" applies only to the last literal.
                header_remove_comand = 'grep -v "##" ' + input_file + " | awk '" + '{print $1"\\t"$2"\\t"$2"\\t"$4"\\t"$5"\\t"$6"\\t"$7"\\t"$8"\\t"$9"\\t"$10"\\t"$11}' + "' > {}".format(
                    header_output_file)
                print(header_remove_comand)
                log_command(header_remove_comand, "Annovar", self.threads,
                            "Variant Annotation Preprocess")
                # Output prefix: "Annovar_" plus the renamed file name with its last
                # extension removed and the remaining dots replaced by underscores
                output_f = "Annovar_" + "_".join(header_f.split(".")[:-1])
                output_file = self.working_directory + "/" + output_f
                # NOTE(review): the command is given input_file (the original VCF), not the
                # header-free header_output_file written above - confirm this is intended.
                command = self.annovar_dir + " " + input_file + " " + self.humandb + \
                          " -buildver hg38 -out " + output_file + " -remove -protocol refGene,ensGene,knownGene," \
                                                                  "cytoBand" \
                                                                  ",exac03,avsnp150,dbnsfp35c,gme,gnomad_exome," \
                                                                  "clinvar_20180603,cosmic -operation " \
                                                                  "gx,gx,gx,r,f,f,f,f,f,f,f -nastring . -polish " \
                                                                  "-xreffile " + self.xref
                print(command)

                # The pattern is relative, so it is resolved against the current working directory
                output_fs = glob.glob("*" + output_f + "*")

Ejemplo n.º 4

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

 def mutect_select_variant_snp(self, mutect_output):
     """
     Select only the SNP variants from a Mutect VCF.

     A GATK4 ``SelectVariants`` command restricted to the SNP type is built,
     printed and passed to ``log_command``.

     Parameters
     ----------
     mutect_output: str
         Path of the VCF file to filter. Only its last "/"-separated
         component is used to name the result.

     """
     snp_vcf = "SNP_" + mutect_output.rsplit("/", 1)[-1]
     pieces = [
         self.get_paths.gatk4_path,
         "SelectVariants -R",
         self.ref_dir,
         "-V",
         mutect_output,
         "--select-type-to-include SNP -O",
         snp_vcf,
     ]
     command = " ".join(pieces)
     print(command)
     log_command(command, "Mutect2", self.threads, "Select SNP Variants")
     print(snp_vcf)

Ejemplo n.º 5

0

Mostrar archivo

Archivo: pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

    def merge_bams(self, info_dict, all_bam_files):
        """
        Merge several bam files into one with Picard MergeSamFiles.

        Parameters
        ----------
        info_dict: dict
            Only ``info_dict["Sample_ID"][0]`` is read; it becomes part of the
            merged file's name.
        all_bam_files: list
            Names of the bam files to merge.

        Returns
        -------
        str
            Name of the merged bam file given to Picard as ``O=``.

        """
        print("preprocess merge bams ")
        print(all_bam_files)

        split_before = self.split_chr == "Before"
        # One "I=<file> " argument per input; every entry keeps its trailing space
        picard_inputs = "".join("I=" + bam + " " for bam in all_bam_files)

        if split_before:
            # Reuse everything from "_Chr_" onwards of the first input as the suffix.
            # When the marker is absent, find() returns -1 and only the last
            # character of the name is kept.
            first_bam = all_bam_files[0]
            suffix = first_bam[first_bam.find("_Chr_"):]
            step_label = "Merge Bams(Split Before)"
        else:
            suffix = ".bam"
            step_label = "Merge Bams"

        merged_name = self.map_type + "_" + info_dict["Sample_ID"][0] + "_MergedBAM" + suffix
        merge_command = "".join([
            "java -XX:ParallelGCThreads=",
            self.threads,
            " -jar ",
            self.get_paths.picard_path,
            " MergeSamFiles ",
            picard_inputs,
            " O=",
            merged_name,
            " USE_THREADING=true",
        ])

        log_command(merge_command, step_label, self.threads, "PreProcessing")
        return merged_name

Ejemplo n.º 6

0

Mostrar archivo

Archivo: mapping.py Proyecto: MBaysanLab/GenomicsPipeline

    def convert_sort(self, sort_gene_origin):
        """
        Create a sorted and indexed bam file from the given bam file.

        With ``self.map_type == "Novoalign"`` a single novosort command is
        used and the ".bai" name is recorded directly; otherwise samtools
        view/sort is used and the index comes from ``helpers.create_index``.
        Every produced file name is appended to ``self.file_list``.

        Parameters
        ----------
        sort_gene_origin: str
            Name of the bam file created by the mapping algorithm

        """
        sorted_bam = "SortedBAM_" + sort_gene_origin
        uses_novosort = self.map_type == "Novoalign"

        if uses_novosort:
            sort_command = "".join([
                self.get_paths.novoalign,
                "novosort -m 16g -t . -c ",
                self.threads,
                " --removeduplicates --keeptags ",
                sort_gene_origin,
                " -i  -o ",
                sorted_bam,
            ])
        else:
            sort_command = "".join([
                "samtools view -@",
                self.threads,
                " -bS ",
                sort_gene_origin,
                " | samtools sort -@",
                self.threads,
                " -o ",
                sorted_bam,
            ])

        log_command(sort_command, "Convert Sort", self.threads, "Mapping")
        self.file_list.append(sorted_bam)

        if uses_novosort:
            # The novosort call already carries "-i", so only the name is recorded
            self.file_list.append(sorted_bam + ".bai")
        else:
            index_name = helpers.create_index(sorted_bam, "Create Index",
                                              self.threads, "Mapping")
            self.file_list.append(index_name)
Ejemplo n.º 7

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: MBaysanLab/GenomicsPipeline

    def somaticsniper_caller(self):
        """
        Build the SomaticSniper command for the tumor/germline bam pair and
        pass it to ``log_command``.

        The VCF is written to ``<working_directory>/<output_name>.vcf``.

        """
        vcf_path = self.working_directory + "/" + self.output_name + ".vcf"
        # The doubled spaces between some arguments are kept exactly as they were
        command = "".join([
            self.get_paths.somaticsniper,
            " -q 1 -L -G -Q 15 -s 0.01 -T 0.85 -N 2 -r 0.001 -n NORMAL -t TUMOR -F vcf -f  ",
            self.ref_dir,
            "  ",
            self.tumor_bam,
            "  ",
            self.germline_bam,
            " ",
            vcf_path,
        ])

        log_command(command, "Somatic Sniper", self.threads, "Variant Calling")

Ejemplo n.º 8

0

Mostrar archivo

Archivo: variant_annotation.py Proyecto: MBaysanLab/GenomicsPipeline

 def annovar_for_g37(self, input_fs):
     """
     Annotate each given VCF file with ANNOVAR using the hg19 build.

     For every file an ANNOVAR command is built, printed and passed to
     ``log_command``; files in the current directory whose name contains the
     output prefix are appended to ``self.file_list``. Afterwards
     ``helpers.create_folder`` is called for the "Annovar" step.

     Parameters
     ----------
     input_fs: list
         File names relative to ``self.working_directory``.

     Returns
     -------
     bool or None
         ``False`` when ``input_fs`` is not a list, otherwise ``None``.

     """
     print(input_fs)
     # isinstance instead of an exact type comparison, so list subclasses are accepted too
     if not isinstance(input_fs, list):
         return False

     # Constant tail of the command: 8 protocols matched by 8 operations
     annotation_args = " -remove -protocol refGene," \
                       "cytoBand" \
                       ",exac03,gnomad211_exome,avsnp150,dbnsfp35a," \
                       "clinvar_20190305,intervar_20180118 -operation " \
                       "gx,r,f,f,f,f,f,f -nastring . -polish " \
                       "-xreffile "

     for input_f in input_fs:
         input_file = self.working_directory + "/" + input_f
         # Output prefix: the file name without its last extension, remaining dots -> "_"
         output_f = "Annovar_" + "_".join(input_f.split(".")[:-1])
         output_file = self.working_directory + "/" + output_f
         command = self.annovar_dir + " --vcfinput " + input_file + " " + self.humandb + \
                   " -buildver hg19 -out " + output_file + annotation_args + self.xref
         print(command)
         log_command(command, "Annovar", self.threads,
                     "Variant Annotation")
         # Relative pattern: resolved against the current working directory
         self.file_list.extend(glob.glob("*" + output_f + "*"))

     helpers.create_folder(self.working_directory,
                           self.file_list,
                           step="Annovar",
                           folder_directory=self.working_directory)

Ejemplo n.º 9

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

 def strelka_caller(self):
     """
     Configure a Strelka workflow for the tumor/germline bam pair, then
     start it.

     Two commands are passed to ``log_command`` one after the other: the
     Strelka configuration call (run directory = ``self.working_directory``)
     and ``python runWorkflow.py`` in local mode with ``self.threads`` jobs.

     """
     configure_command = " ".join([
         self.get_paths.strelka,
         "--normalBam",
         self.germline_bam,
         "--tumorBam",
         self.tumor_bam,
         "--referenceFasta",
         self.ref_dir,
         "--runDir",
         self.working_directory,
         "--exome --disableEVS",
     ])
     log_command(configure_command, "Strelka Create Workflow", self.threads,
                 "Variant Calling")

     # NOTE(review): runWorkflow.py is addressed by a relative path, and this
     # second call reuses the "Strelka Create Workflow" label - confirm both.
     launch_command = "python runWorkflow.py -m local -j " + self.threads
     log_command(launch_command, "Strelka Create Workflow", self.threads,
                 "Variant Calling")

Ejemplo n.º 10

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

 def varscan_caller_step2(self, intermediate_varscan_somatic):
     """
     Run VarScan ``processSomatic`` on each intermediate somatic file.

     Parameters
     ----------
     intermediate_varscan_somatic: list
         Names of the files to process, one command per entry.

     Returns
     -------
     list
         Every name in the current directory matching ``*vcf*``.

     """
     print(intermediate_varscan_somatic)
     thresholds = " --min-tumor-freq 0.10 --max-normal-freq 0.05 --p-value 0.07"
     for somatic_file in intermediate_varscan_somatic:
         process_command = "".join([
             "java -jar ",
             self.get_paths.varscan_path,
             " processSomatic ",
             somatic_file,
             thresholds,
         ])
         log_command(process_command, "Varscan Step Process Somatic",
                     self.threads, "Variant Calling")
     return glob.glob("*vcf*")

Ejemplo n.º 11

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

 def gatk_haplotype(self):
     """
     Build the GATK HaplotypeCaller command for the germline bam, print it
     and pass it to ``log_command``.

     """
     vcf_base = self.working_directory + "/" + self.output_name + ".vcf"
     # The "-o" target is vcf_base (already ending in ".vcf") plus a second suffix
     command = " ".join([
         "java -jar",
         self.get_paths.gatk_path,
         "-R",
         self.ref_dir,
         "-T HaplotypeCaller -I",
         self.germline_bam,
         "--dbsnp",
         self.get_paths.dbsnp,
         "-o",
         vcf_base + ".raw.snps.indels.vcf",
     ])
     print(command)
     log_command(command, "Haplotype", self.threads,
                 "Haplotype Variant Calling")

Ejemplo n.º 12

0

Mostrar archivo

 def fastqc(self):
     """
     Run FastQC on every ``*fastq.gz`` file found in the current directory.

     Each file is addressed as ``<working_directory>/<name>``. Afterwards
     every name matching ``*fastqc*`` in the current directory is added to
     ``self.file_list``.

     """
     for fastq_name in glob.glob("*fastq.gz"):
         fastq_path = self.working_directory + "/" + fastq_name
         command = " ".join([self.paths.fastqc, fastq_path])
         log_command(command, "FastQC Quality Control", self.thread,
                     "Quality Control")
     produced_reports = glob.glob("*fastqc*")
     self.file_list.extend(produced_reports)

Ejemplo n.º 13

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

 def mutect_caller_gatk3(self):
     """
     Build the GATK3 MuTect2 command for the tumor/germline bam pair, print
     it and pass it to ``log_command``.

     The output path is ``<working_directory>/<output_name>`` with no
     extension added.

     """
     output_path = self.working_directory + "/" + self.output_name
     thread_option = " -nct " + self.threads
     # thread_option starts with a space, hence the doubled space after "MuTect2"
     command = "".join([
         "java -jar ",
         self.get_paths.gatk_path,
         " -T MuTect2 ",
         thread_option,
         " -R ",
         self.ref_dir,
         " -I:tumor ",
         self.tumor_bam,
         " -I:normal ",
         self.germline_bam,
         " -o ",
         output_path,
     ])
     print(command)
     log_command(command, "Mutect2", self.threads, "Variant Calling")

Ejemplo n.º 14

0

Mostrar archivo

Archivo: gatk_pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

 def gatk3_base_recalibrator(self, lastbam):
     """
     Run GATK3 BaseRecalibrator on a bam file.

     Parameters
     ----------
     lastbam: str
         Name of the bam file to recalibrate.

     Returns
     -------
     str
         Name of the recalibration report: the part of ``lastbam`` before
         its first dot plus "_bqsr.grp". It is also appended to
         ``self.file_list``.

     """
     report_name = str(lastbam).partition(".")[0] + "_bqsr.grp"
     thread_option = " -nct " + str(self.threads)
     bundle = self.bundle_dir
     recal_command = "".join([
         "java -jar ",
         self.get_paths.gatk_path,
         thread_option,
         " -T BaseRecalibrator -R ",
         bundle,
         "/ucsc.hg19.fasta -I ",
         lastbam,
         " -knownSites ",
         bundle,
         "/Mills_and_1000G_gold_standard.indels.hg19.vcf",
         " -o ",
         report_name,
     ])
     log_command(recal_command, "Base Recalibrator", self.threads,
                 "GatkPreProcessing")
     self.file_list.append(report_name)
     return report_name

Ejemplo n.º 15

0

Mostrar archivo

def split_bam_by_chr(file):
    """
    Split a bam file into one bam file per chromosome (chr1..chr22, chrX, chrY).

    A shell loop is built around ``samtools view``, printed and passed to
    ``log_command``. Each piece is named after the part of the input name
    before its first dot, followed by ``_Chr_<chrom>.bam``.

    Parameters
    ----------
    file: str
        Name (or shell pattern) of the bam file(s) to split.

    Returns
    -------
    list
        Every name in the current directory matching ``*_Chr_*.bam``.

    """
    fragments = [
        "for file in ",
        file,
        "; ",
        'do filename=`echo $file | cut -d "." -f 1`; ',
        "for chrom in `seq 1 22` X Y; do ",
        "samtools view -bh $file chr${chrom} > ${filename}_Chr_${chrom}.bam; done; done",
    ]
    split_command = "".join(fragments)
    print(split_command)

    log_command(split_command, "split by chrommose", "0", "PreProcessing")
    return glob.glob("*_Chr_*.bam")

Ejemplo n.º 16

0

Mostrar archivo

Archivo: gatk_pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

 def gatk4_applybsqr(self, lastbam, recaltable):
     """
     Apply a base quality recalibration table to a bam file with GATK4
     ApplyBQSR, then index the result.

     The recalibrated bam ("GATK4_" + ``lastbam``) and the index name
     returned by ``helpers.create_index`` are appended to ``self.file_list``.

     Parameters
     ----------
     lastbam: str
         Name of the bam file to recalibrate.
     recaltable: str
         Name of the recalibration table passed as ``--bqsr-recal-file``.

     """
     recalibrated_bam = "GATK4_" + lastbam
     # bundle_dir is joined to the fasta name without a separator
     apply_command = "".join([
         self.get_paths.gatk4_path,
         " ApplyBQSR -R ",
         self.bundle_dir,
         "Homo_sapiens_assembly38.fasta -I ",
         lastbam,
         " --bqsr-recal-file ",
         recaltable,
         " -O ",
         recalibrated_bam,
     ])
     log_command(apply_command, "ApplyBQSR", self.threads,
                 "Gatk4PreProcessing")
     self.file_list.append(recalibrated_bam)
     index_name = helpers.create_index(recalibrated_bam,
                                       "Create Index by GATK_ApplyBSQR",
                                       self.threads, "GatkPreProcess")
     self.file_list.append(index_name)

Ejemplo n.º 17

0

Mostrar archivo

Archivo: gatk_pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

    def gatk4_base_recalibrator(self, lastbam):
        """
        Run GATK4 BaseRecalibrator on a bam file with three known-sites
        resources (Mills indels, dbSNP, 1000 Genomes).

        Parameters
        ----------
        lastbam: str
            Name of the bam file to recalibrate.

        Returns
        -------
        str
            Name of the recalibration table: the part of ``lastbam`` before
            its first dot plus "_RECAL.table". It is also appended to
            ``self.file_list``.

        """
        table_name = str(lastbam).partition(".")[0] + "_RECAL.table"

        recal_command = "".join([
            self.get_paths.gatk4_path,
            " BaseRecalibrator -R ",
            self.bundle_dir,
            "Homo_sapiens_assembly38.fasta -I ",
            lastbam,
            " --known-sites ",
            self.get_paths.mills_indel,
            " --known-sites ",
            self.get_paths.dbsnp,
            " --known-sites ",
            self.get_paths.one_thousand_g,
            " -O ",
            table_name,
        ])
        log_command(recal_command, "Base Recalibrator", self.threads,
                    "Gatk4PreProcessing")
        self.file_list.append(table_name)
        return table_name

Ejemplo n.º 18

0

Mostrar archivo

Archivo: gatk_pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

    def gatk3_indel_realigner(self, lastbam, realign_target):
        """
        Run GATK3 IndelRealigner on a bam file.

        Parameters
        ----------
        lastbam: str
            Name of the bam file to realign.
        realign_target: str
            Name of the intervals file passed as ``-targetIntervals``.

        Returns
        -------
        str
            Name of the realigned bam ("IR_" + ``lastbam``). It is also
            appended to ``self.file_list``.

        """
        realigned_bam = "IR_" + lastbam
        bundle = self.bundle_dir
        realign_command = "".join([
            "java -jar ",
            self.get_paths.gatk_path,
            " -T IndelRealigner -R ",
            bundle,
            "/ucsc.hg19.fasta -known ",
            bundle,
            "/Mills_and_1000G_gold_standard.indels.hg19.vcf",
            " -targetIntervals ",
            realign_target,
            " --noOriginalAlignmentTags -I ",
            lastbam,
            " -o ",
            realigned_bam,
        ])

        log_command(realign_command, "Indel Realigner", self.threads, "GatkPreProcessing")
        self.file_list.append(realigned_bam)
        return realigned_bam

Ejemplo n.º 19

0

Mostrar archivo

Archivo: gatk_pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

    def gatk3_print_reads(self, lastbam, bqsr):
        """
        Write a recalibrated bam with GATK3 PrintReads, then index it.

        The new bam ("GATK_PR" + ``lastbam``, no separator) and the index
        name returned by ``helpers.create_index`` are appended to
        ``self.file_list``.

        Parameters
        ----------
        lastbam: str
            Name of the bam file to rewrite.
        bqsr: str
            Name of the recalibration report passed as ``--BQSR``.

        """
        thread_option = " -nct " + str(self.threads)
        recalibrated_bam = "GATK_PR" + lastbam

        print_command = "".join([
            "java -jar ",
            self.get_paths.gatk_path,
            thread_option,
            " -T PrintReads -R ",
            self.bundle_dir,
            "/ucsc.hg19.fasta -I ",
            lastbam,
            " --BQSR ",
            bqsr,
            " -o ",
            recalibrated_bam,
        ])
        log_command(print_command, "Print Reads", self.threads, "GatkPreProcessing")
        self.file_list.append(recalibrated_bam)
        index_name = helpers.create_index(recalibrated_bam,
                                          "Create Index by GATK_PrintReads",
                                          self.threads, "GatkPreProcess")
        self.file_list.append(index_name)

Ejemplo n.º 20

0

Mostrar archivo

Archivo: gatk_pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

 def gatk3_realign_target_creator(self, lastbam):
     """
     Run GATK3 RealignerTargetCreator on a bam file.

     The command is printed and passed to ``log_command``.

     Parameters
     ----------
     lastbam: str
         Name of the bam file to analyse.

     Returns
     -------
     str
         Name of the intervals file: the part of ``lastbam`` before its
         first dot plus "_realign_target.intervals". It is also appended to
         ``self.file_list``.

     """
     realign_target = str(lastbam).split(".")[0] + "_realign_target.intervals"
     # str() lets the thread count be an int as well as a string, as the
     # other GATK3 steps already do for their "-nct" option.
     thread_count = str(self.threads)
     bcal = "java -jar " + self.get_paths.gatk_path + " -T RealignerTargetCreator -nt " + \
            thread_count + " -R " + self.bundle_dir + "/ucsc.hg19.fasta -known " + \
            self.bundle_dir + "/Mills_and_1000G_gold_standard.indels.hg19.vcf -I " + lastbam + \
            " -o " + realign_target
     print(bcal)
     log_command(bcal, "Realign Target Creator", self.threads,
                 "GatkPreProcessing")
     self.file_list.append(realign_target)
     return realign_target

Ejemplo n.º 21

0

Mostrar archivo

Archivo: pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

    def novoalign_sort_markduplicate(self, info_dict, all_bam_files):
        """
        Merge the given bam files into one indexed bam with a single
        novosort call.

        Parameters
        ----------
        info_dict: dict
            Only ``info_dict["Sample_ID"][0]`` is read; it becomes part of
            the output name.
        all_bam_files: list
            Names of the bam files handed to novosort.

        Returns
        -------
        str
            Name of the merged bam. It and its ".bai" name are also
            appended to ``self.file_list``.

        """
        merged_name = "MDUP_" + self.map_type + "_" + info_dict["Sample_ID"][0] + "_MergedBAM.bam"
        # Every input is preceded by one space
        novosort_inputs = "".join(" " + bam for bam in all_bam_files)

        sort_command = "".join([
            self.get_paths.novoalign,
            "novosort  -m 16g -t . -c ",
            self.threads,
            " ",
            novosort_inputs,
            " -i -o ",
            merged_name,
        ])
        log_command(sort_command, "Merge&Mark Duplicate", self.threads, "PreProcessing")
        self.file_list.extend([merged_name, merged_name + ".bai"])

        return merged_name

Ejemplo n.º 22

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

 def mutect_tumor_only(self):
     """
     Call variants on the tumor bam alone with GATK4 Mutect2.

     The command is printed and passed to ``log_command``; the resulting
     VCF path is then handed to ``self.mutect_select_variant``.

     """
     vcf_path = "".join([
         self.working_directory, "/", "TumorOnly_", self.output_name, ".vcf",
     ])
     # Sample name taken from the tumor bam via helpers.get_sample_name
     tumor_sample = helpers.get_sample_name(self.tumor_bam)
     command = " ".join([
         self.get_paths.gatk4_path,
         "Mutect2 -R",
         self.ref_dir,
         "-I",
         self.tumor_bam,
         "-tumor",
         tumor_sample,
         "-O",
         vcf_path,
     ])
     print(command)
     log_command(command, "Mutect2", self.threads,
                 "Variant Calling Tumor Only")
     # Follow-up selection step on the freshly written VCF
     self.mutect_select_variant(vcf_path)

Ejemplo n.º 23

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

    def mutect_caller(self):
        """
        Call somatic variants on the tumor/germline bam pair with GATK4
        Mutect2.

        The command is printed and passed to ``log_command``; the resulting
        VCF path is then handed to ``self.mutect_select_variant``.

        """
        vcf_path = self.working_directory + "/" + self.output_name + ".vcf"

        # Sample names taken from each bam via helpers.get_sample_name
        normal_sample = helpers.get_sample_name(self.germline_bam)
        tumor_sample = helpers.get_sample_name(self.tumor_bam)
        print(tumor_sample)

        # NOTE(review): there is no space between --javaOptions and its quoted
        # value; the text is kept exactly as it was - confirm it is intended.
        command = "".join([
            self.get_paths.gatk4_path,
            " --javaOptions\"-Xmx4g\" Mutect2 ",
            " -R ",
            self.ref_dir,
            " -I ",
            self.tumor_bam,
            " -tumor ",
            tumor_sample,
            " -I ",
            self.germline_bam,
            " -normal ",
            normal_sample,
            " -O ",
            vcf_path,
        ])
        print(command)
        log_command(command, "Mutect2", self.threads, "Variant Calling")
        # Follow-up selection step on the freshly written VCF
        self.mutect_select_variant(vcf_path)

Ejemplo n.º 24

0

Mostrar archivo

    def convert_sort(self, sort_gene_origin):
        """
        Create a sorted and indexed bam file from the given bam file.

        The file is piped through ``samtools view`` and ``samtools sort``;
        the index comes from ``helpers.create_index``. Both resulting names
        are appended to ``self.file_list``.

        Parameters
        ----------
        sort_gene_origin: str
            Name of the bam file created by the mapping algorithm

        """
        sorted_bam = "SortedBAM_" + sort_gene_origin
        sort_command = "".join([
            "samtools view -@",
            self.threads,
            " -bS ",
            sort_gene_origin,
            " | samtools sort -@",
            self.threads,
            " -o ",
            sorted_bam,
        ])
        log_command(sort_command, "Convert Sort", self.threads, "Mapping")
        self.file_list.append(sorted_bam)
        index_name = helpers.create_index(sorted_bam, "Create Index", self.threads, "Mapping")
        self.file_list.append(index_name)

Ejemplo n.º 25

0

Mostrar archivo

Archivo: variant_calling.py Proyecto: sahinsrhn/GenomicPipeline

    def varscan_caller_step1(self):
        """
        Pipe a samtools mpileup of the germline and tumor bams into VarScan
        ``somatic``.

        The command is printed and passed to ``log_command``.

        Returns
        -------
        list
            Names in the current directory matching
            ``*<output_name>*vcf*``.

        """
        workdir = self.working_directory
        snp_prefix = workdir + "/SNP_" + self.output_name
        indel_prefix = workdir + "/INDEL_" + self.output_name

        varscan_options = (
            " --mpileup 1 --min-coverage 8 --min-coverage-normal 8 --min-coverage-tumor 6 --min-var-freq 0.10 "
            "--min-freq-for-hom 0.75 --normal-purity 1.0 --tumor-purity 1.00 --p-value 0.99 "
            "--somatic-p-value 0.05 "
            "--strand-filter 0 --output-vcf"
        )
        command = "".join([
            "samtools mpileup -f ",
            self.ref_dir,
            " -q 1 -B ",
            self.germline_bam,
            " ",
            self.tumor_bam,
            " | java -jar ",
            self.get_paths.varscan_path,
            " somatic --output-snp ",
            snp_prefix,
            " --output-indel ",
            indel_prefix,
            varscan_options,
        ])
        print(command)
        log_command(command, "Varscan Step Pileup", self.threads,
                    "Variant Calling")

        return glob.glob("*" + self.output_name + "*vcf*")

Ejemplo n.º 26

0

Mostrar archivo

Archivo: pre_processing.py Proyecto: MBaysanLab/GenomicsPipeline

    def mark_duplicate(self, merged_bam, chr):
        """
        Remove duplicates from a merged bam with Picard MarkDuplicates.

        Depending on ``self.split_chr`` ("After", "Before" or anything else)
        only the metrics file name and the label given to ``log_command``
        differ. The metrics file name is appended to ``self.file_list``.

        Parameters
        ----------
        merged_bam: str
            Name of the bam file to process.
        chr: str
            Suffix used for the metrics file name in the split modes; its
            last four characters are dropped (presumably a ".bam" extension
            - TODO confirm). Not read in the non-split mode.

        Returns
        -------
        str
            Name of the output bam ("MDUP_" + ``merged_bam``).

        """
        split_mode = self.split_chr
        output = "MDUP" + "_" + merged_bam

        if split_mode == "After" or split_mode == "Before":
            metrics_file = "marked_dup_metrics" + chr[:-4] + ".txt"
            if split_mode == "After":
                step_label = "Mark Duplicate Split After"
            else:
                step_label = "Mark Duplicate Split Before"
        else:
            metrics_file = "marked_dup_metrics.txt"
            step_label = "Mark Duplicate"

        picard_command = "".join([
            "java -XX:ParallelGCThreads=",
            self.threads,
            " -jar ",
            self.get_paths.picard_path,
            " MarkDuplicates I=",
            merged_bam,
            " O=",
            output,
            " M=",
            metrics_file,
            " REMOVE_DUPLICATES=true CREATE_INDEX=true",
        ])
        log_command(picard_command, step_label, self.threads, "PreProcessing")
        self.file_list.append(metrics_file)
        return output

Ejemplo n.º 27

0

Mostrar archivo

def create_index(lastbam, function, threads, step):
    """
    Build an index for a bam file with Picard BuildBamIndex.

    Parameters
    ----------
    lastbam: str
        Name of the bam file to index.
    function, threads, step:
        Passed unchanged to ``log_command`` after the command string.

    Returns
    -------
    str
        ``lastbam`` with its last three characters replaced by "bai".

    """
    picard_jar = GetPaths().picard_path
    index_command = "".join([
        "java -Dpicard.useLegacyParser=false -jar ",
        picard_jar,
        " BuildBamIndex -I ",
        lastbam,
    ])
    log_command(index_command, function, threads, step)
    index_name = lastbam[:-3] + "bai"
    return index_name

Ejemplo n.º 28

0

Mostrar archivo

Archivo: mapping.py Proyecto: MBaysanLab/GenomicsPipeline

    def mapping(self):
        """
        Map every paired-end fastq group with the selected algorithm (Bwa, Bowtie2 or Novoalign), then sort and
        index each result. There are 5 important steps in this function.
        - First, the first line of one fastq file is read to get the information written by the sequencing machine.
        - Second, the fastq files are grouped by lane and by read number so that each R1/R2 pair is mapped together.
        - Third, custom read group information is built and given to the mapping algorithm. This information will be
        in the bam files which are created in this step.
        - Fourth, the complete command is assembled as a string.
        - Lastly, the command is passed to log_command. bwa, bowtie2 and samtools are referenced by bare name, so
        they must be resolvable from the environment.

        Returns
        -------
        list or str
            Names in the current directory matching "SortedBAM*bam", or a message string when self.map_type is
            not one of the handled values.

        """
        print(os.getcwd())
        fastq_list = helpers.get_fastq()  # Get list of fastq files
        print(fastq_list)
        info_dict = helpers.get_info(
            self.sample_type, fastq_list,
            self.trim)  # Get the necessary information from the file names
        # RG_{..} variables are created to prepare the read group information.
        RG_SM = info_dict["Sample_ID"][0]
        RG_PL = "Illumina"
        RG_LB = self.library_matching_id
        # Each fastq file has flow cell information, so only the first line of the first fastq file is read
        first_fastq_file_dir = self.working_directory + "/" + fastq_list[
            0] + ".fastq.gz"
        with gzip.open(first_fastq_file_dir) as f:
            first_line = f.readline()
        # Third ":"-separated field of the header line
        # (presumably the flow cell ID of an Illumina-style header - TODO confirm)
        flowcell_info = str(first_line).split(":")[2]

        # Fastq files are grouped by lane (if there is more than one lane) and by read number.
        # i.e. SampleName_S1_L001_R1_001.fastq.gz , SampleName_S1_L002_R1_001.fastq.gz ,
        # SampleName_S1_L001_R2_001.fastq.gz , SampleName_S1_L002_R2_001.fastq.gz SampleName_S1_L001_R1_002.fastq.gz ,
        # SampleName_S1_L002_R1_002.fastq.gz , SampleName_S1_L001_R2_002.fastq.gz , SampleName_S1_L002_R2_002.fastq.gz
        # grouped like => (SampleName_S1_L001_R1_001.fastq.gz, SampleName_S1_L001_R2_001.fastq.gz),
        # (SampleName_S1_L002_R1_001.fastq.gz, SampleName_S1_L002_R2_001.fastq.gz),
        # (SampleName_S1_L001_R1_002.fastq.gz, SampleName_S1_L001_R2_002.fastq.gz),
        # (SampleName_S1_L002_R1_002.fastq.gz, SampleName_S1_L002_R2_002.fastq.gz)
        for i in info_dict["Lanes"]:
            for k in info_dict["Number_of_seq"]:
                # Only the first match of each pattern (read1[0] / read2[0]) is used below
                r1 = re.compile(".*" + i + "_R1_" + k)
                read1 = [s + ".fastq.gz" for s in fastq_list if r1.match(s)]

                r2 = re.compile(".*" + i + "_R2_" + k)
                read2 = [s + ".fastq.gz" for s in fastq_list if r2.match(s)]

                # Only the last character of the lane string goes into the read group ID and PU
                RG_ID = flowcell_info + "." + i[-1]
                RG_PU = flowcell_info + "." + info_dict["Index"][0] + "." + i[
                    -1]
                map_bam = ""

                # Create output name of bam file after mapping
                gene_origin = self.map_type + "_" + info_dict["Sample_ID"][
                    0] + "_" + info_dict["Index"][
                        0] + "_" + i + "_" + k + ".bam"

                if self.map_type == "Bwa":  # If selected algorithm is Bwa
                    add_read_group = ' -R "@RG\\tID:' + RG_ID + '\\tSM:' + RG_SM + '\\tLB:' + RG_LB + '\\tPL:' + \
                                     RG_PL + '\\tPU:' + RG_PU + '" '  # Read group that will be added to the bam file

                    map_bam = "bwa mem -t " + self.threads + " " + add_read_group + self.get_paths.ref_dir + \
                              "Bwa/Homo_sapiens_assembly38.fasta " + read1[0] + " " + read2[0] + \
                              " | samtools view -@" + self.threads + " -bS - > " + gene_origin
                    print("mapping =>" + map_bam)
                elif self.map_type == "Bowtie2":  # If selected algorithm is Bowtie2

                    add_read_group = " --rg-id " + RG_ID + " --rg SM:" + RG_SM + " --rg LB:" + RG_LB + " --rg PL:" + \
                                     RG_PL + " --rg PU:" + RG_PU  # Read group that will be added to the bam file

                    map_bam = "bowtie2 -p" + self.threads + add_read_group + " -x " + self.get_paths.ref_dir + \
                              "Bowtie2/Homo_sapiens_assembly38 -1 " + read1[0] + " -2 " + read2[0] + \
                              " | samtools view -@" + self.threads + " -bS - > " + gene_origin
                    print("mapping =>" + map_bam)

                elif self.map_type == "Novoalign":
                    add_read_group = ' "@RG\\tID:' + RG_ID + '\\tSM:' + RG_SM + '\\tLB:' + RG_LB + '\\tPL:' + \
                                     RG_PL + '\\tPU:' + RG_PU + '" '  # Read group that will be added to the bam file
                    # stderr of novoalign is redirected ("2>") into this per-bam stats file
                    stats_txt = gene_origin.split(".")[0] + "_stats.txt "
                    map_bam = self.get_paths.novoalign + "novoalign -k -d " + self.get_paths.ref_dir + "NovoAlign/Homo_sapiens_assembly38 -f " + \
                              read1[0] + " " +read2[0] + " -a -c " + self.threads + " -o SAM " + add_read_group + " 2> " + stats_txt + \
                              " | samtools view -@" + self.threads + " -bS - > " + gene_origin
                    print("mapping =>" + map_bam)

                else:
                    # Unsupported map type: a message string is returned (nothing is raised) and the loop
                    # stops here. The message does not mention Novoalign although it is handled above.
                    return "Please specify the map type Bwa/Bowtie "

                # The command string built above is passed to log_command.
                # The step name, number of threads and class name are passed along for logging purposes
                log_command(map_bam, "Mapping", self.threads, "Mapping")
                self.file_list.append(
                    gene_origin)  # Output file's name added to list
                self.convert_sort(
                    gene_origin
                )  # Each output bam file is sorted and indexed by this function

        # Relative pattern: resolved against the current working directory
        all_sortedbam_files = glob.glob(
            "SortedBAM*bam")  # Get all sorted bam files

        # The helper below receives the working directory, the list of files created in this step, the mapping
        # type and the step's name in order to create a folder for this particular step
        helpers.create_folder(self.working_directory,
                              self.file_list,
                              map_type=self.map_type,
                              step="Mapping",
                              folder_directory=self.folder_directory)
        print("print sorted all bam files ")
        print(all_sortedbam_files)
        return all_sortedbam_files  # Return list of sorted bam files