def annovar_for_g37(self, input_fs):
    """Annotate VCF files with ANNOVAR using the hg19 build settings.

    For every file name in ``input_fs`` a command line is assembled from
    ``self.annovar_dir``, ``self.humandb`` and ``self.xref``, printed, and
    handed to ``log_command``. Files in the current directory whose names
    contain the output prefix are then added to ``self.file_list``. Once all
    inputs are processed, ``helpers.create_folder`` is called for the
    "Annovar" step.

    Args:
        input_fs: list of input file names located in
            ``self.working_directory``.

    Returns:
        ``False`` when ``input_fs`` is not a list, otherwise ``None``.
    """
    print(input_fs)
    # isinstance() rather than an exact type comparison, so list subclasses
    # are accepted as well.
    if not isinstance(input_fs, list):
        return False

    # Constant part of the command line: protocols and their matching
    # operations (one operation per protocol, in the same order).
    annotation_options = (
        " -remove -protocol refGene,cytoBand,exac03,gnomad211_exome,"
        "avsnp150,dbnsfp35a,clinvar_20190305,intervar_20180118"
        " -operation gx,r,f,f,f,f,f,f -nastring . -polish -xreffile "
    )

    for input_f in input_fs:
        input_file = self.working_directory + "/" + input_f
        # Output prefix: the input name without its last extension, with the
        # remaining dots replaced by underscores.
        output_f = "Annovar_" + "_".join(input_f.split(".")[:-1])
        output_file = self.working_directory + "/" + output_f
        command = (self.annovar_dir + " --vcfinput " + input_file + " " +
                   self.humandb + " -buildver hg19 -out " + output_file +
                   annotation_options + self.xref)
        print(command)
        log_command(command, "Annovar", self.threads, "Variant Annotation")
        # The glob is relative to the current working directory.
        self.file_list.extend(glob.glob("*" + output_f + "*"))

    helpers.create_folder(self.working_directory,
                          self.file_list,
                          step="Annovar",
                          folder_directory=self.working_directory)
Beispiel #2
0
 def run_qc(self):
     """Run the quality-control stage and hand its outputs to the folder helper.

     Calls ``self.fastqc()`` and then ``self.qc_trim()``; afterwards the
     working directory and ``self.file_list`` are passed to
     ``helpers.create_folder`` for the "QC" step.
     """
     self.fastqc()
     self.qc_trim()

     workdir = self.working_directory
     produced_files = self.file_list
     helpers.create_folder(workdir,
                           produced_files,
                           map_type=self.map_type,
                           step="QC")
Beispiel #3
0
    def mapping(self):
        """
        Map the paired-end FASTQ files with the selected aligner (Bwa, Bowtie2 or Novoalign) and return the
        sorted BAM files found afterwards.

        Steps performed:
        - Read the first line of the first FASTQ file and take its third ":"-separated field as the flow cell
          information.
        - For every combination of lane and sequence number, select the matching R1 and R2 files.
        - Build the read group information (ID, SM, LB, PL, PU) that is passed to the aligner.
        - Build the complete shell command as a string; the aligner output is piped into "samtools view" to
          write a BAM file.
        - Run the command through log_command and pass the resulting BAM file to self.convert_sort.

        bwa, bowtie2 and samtools are invoked by bare name, so they must be resolvable by the shell; novoalign
        is invoked with the self.get_paths.novoalign prefix.

        Returns the list produced by glob.glob("SortedBAM*bam"). If a lane/sequence combination is reached while
        self.map_type is none of the three supported values, an error message string is returned instead.
        """
        print(os.getcwd())
        fastq_list = helpers.get_fastq()  # FASTQ names; ".fastq.gz" is appended to them below
        print(fastq_list)
        info_dict = helpers.get_info(
            self.sample_type, fastq_list,
            self.trim)  # Get the necessary information from the file names
        # RG_{..} variables hold the read group information.
        RG_SM = info_dict["Sample_ID"][0]
        RG_PL = "Illumina"
        RG_LB = self.library_matching_id
        # Every fastq file carries the flow cell information, so only the first line of one file is read
        first_fastq_file_dir = self.working_directory + "/" + fastq_list[
            0] + ".fastq.gz"
        with gzip.open(first_fastq_file_dir) as f:
            first_line = f.readline()
        # NOTE(review): gzip.open defaults to binary mode, so under Python 3 str(first_line) is the bytes repr
        # ("b'@...'"); the extra prefix ends up in field 0, so field 2 is not affected by it.
        flowcell_info = str(first_line).split(":")[2]

        # Fastq files are paired per lane and per sequence number: for each (lane, number) combination the R1 file
        # is matched with the R2 file.
        # i.e. SampleName_S1_L001_R1_001.fastq.gz , SampleName_S1_L002_R1_001.fastq.gz ,
        # SampleName_S1_L001_R2_001.fastq.gz , SampleName_S1_L002_R2_001.fastq.gz SampleName_S1_L001_R1_002.fastq.gz ,
        # SampleName_S1_L002_R1_002.fastq.gz , SampleName_S1_L001_R2_002.fastq.gz , SampleName_S1_L002_R2_002.fastq.gz
        # are grouped like => (SampleName_S1_L001_R1_001.fastq.gz, SampleName_S1_L001_R2_001.fastq.gz),
        # (SampleName_S1_L002_R1_001.fastq.gz, SampleName_S1_L002_R2_001.fastq.gz),
        # (SampleName_S1_L001_R1_002.fastq.gz, SampleName_S1_L001_R2_002.fastq.gz),
        # (SampleName_S1_L002_R1_002.fastq.gz, SampleName_S1_L002_R2_002.fastq.gz)
        for i in info_dict["Lanes"]:
            for k in info_dict["Number_of_seq"]:
                # re.match anchors at the start of the name, hence the leading ".*"
                r1 = re.compile(".*" + i + "_R1_" + k)
                read1 = [s + ".fastq.gz" for s in fastq_list if r1.match(s)]

                r2 = re.compile(".*" + i + "_R2_" + k)
                read2 = [s + ".fastq.gz" for s in fastq_list if r2.match(s)]

                # i[-1] is the last character of the lane identifier
                RG_ID = flowcell_info + "." + i[-1]
                RG_PU = flowcell_info + "." + info_dict["Index"][0] + "." + i[
                    -1]
                map_bam = ""

                # Name of the BAM file written by the mapping command
                gene_origin = self.map_type + "_" + info_dict["Sample_ID"][
                    0] + "_" + info_dict["Index"][
                        0] + "_" + i + "_" + k + ".bam"

                # Only the first matching R1/R2 file (read1[0], read2[0]) is used in the commands below.
                if self.map_type == "Bwa":  # Selected aligner is Bwa
                    add_read_group = ' -R "@RG\\tID:' + RG_ID + '\\tSM:' + RG_SM + '\\tLB:' + RG_LB + '\\tPL:' + \
                                     RG_PL + '\\tPU:' + RG_PU + '" '  # Read group passed to bwa via -R

                    map_bam = "bwa mem -t " + self.threads + " " + add_read_group + self.get_paths.ref_dir + \
                              "Bwa/Homo_sapiens_assembly38.fasta " + read1[0] + " " + read2[0] + \
                              " | samtools view -@" + self.threads + " -bS - > " + gene_origin
                    print("mapping =>" + map_bam)
                elif self.map_type == "Bowtie2":  # Selected aligner is Bowtie2

                    add_read_group = " --rg-id " + RG_ID + " --rg SM:" + RG_SM + " --rg LB:" + RG_LB + " --rg PL:" + \
                                     RG_PL + " --rg PU:" + RG_PU  # Read group passed to bowtie2 via --rg-id / --rg

                    map_bam = "bowtie2 -p" + self.threads + add_read_group + " -x " + self.get_paths.ref_dir + \
                              "Bowtie2/Homo_sapiens_assembly38 -1 " + read1[0] + " -2 " + read2[0] + \
                              " | samtools view -@" + self.threads + " -bS - > " + gene_origin
                    print("mapping =>" + map_bam)

                elif self.map_type == "Novoalign":
                    add_read_group = ' "@RG\\tID:' + RG_ID + '\\tSM:' + RG_SM + '\\tLB:' + RG_LB + '\\tPL:' + \
                                     RG_PL + '\\tPU:' + RG_PU + '" '  # Read group placed after "-o SAM"
                    # stderr of novoalign is redirected to this file; the name ends with a space on purpose
                    # because it is concatenated directly into the command line
                    stats_txt = gene_origin.split(".")[0] + "_stats.txt "
                    map_bam = self.get_paths.novoalign + "novoalign -k -d " + self.get_paths.ref_dir + "NovoAlign/Homo_sapiens_assembly38 -f " + \
                              read1[0] + " " +read2[0] + " -a -c " + self.threads + " -o SAM " + add_read_group + " 2> " + stats_txt + \
                              " | samtools view -@" + self.threads + " -bS - > " + gene_origin
                    print("mapping =>" + map_bam)

                else:
                    # Unsupported aligner: an error message string is returned instead of a list
                    return "Please specify the map type Bwa/Bowtie "

                # Run the command string built above through log_command.
                # The step name, number of threads and class name are passed along for logging purposes
                log_command(map_bam, "Mapping", self.threads, "Mapping")
                self.file_list.append(
                    gene_origin)  # Remember the output file's name
                self.convert_sort(
                    gene_origin
                )  # convert_sort is expected to sort and index the BAM file (defined elsewhere)

        all_sortedbam_files = glob.glob(
            "SortedBAM*bam")  # All sorted bam files in the current directory

        # The helper receives the working directory, the files created in this step, the mapping type and the
        # step's name in order to create the folder for this particular step
        helpers.create_folder(self.working_directory,
                              self.file_list,
                              map_type=self.map_type,
                              step="Mapping",
                              folder_directory=self.folder_directory)
        print("print sorted all bam files ")
        print(all_sortedbam_files)
        return all_sortedbam_files  # Return list of sorted bam files
    def pre_process(self, info_dict, all_bam_files):
        """Merge, index and duplicate-mark BAM files according to ``self.split_chr``.

        * ``"After"``: merge all inputs, index the merged file, split it with
          ``split_bam_by_chr`` and mark duplicates on every split file.
        * ``"Before"``: split every input with ``split_bam_by_chr``, then merge,
          index and mark duplicates for each group returned by ``get_bam_by_chr``.
        * any other value: when ``self.map_type`` is "Novoalign" the work is
          delegated to ``self.novoalign_sort_markduplicate``; otherwise all inputs
          are merged, indexed and duplicate-marked once.

        Produced file names are appended to ``self.file_list`` and handed to
        ``helpers.create_folder`` for the "PreProcess" step.

        Returns:
            For "After"/"Before": the entries of ``self.file_list`` containing both
            "MDUP" and "bam". Otherwise: the result of the duplicate-marking call.
        """

        def index_and_track(bam_path, label):
            # Create an index for ``bam_path`` and remember the returned value.
            index_path = helpers.create_index(bam_path, label, self.threads, "Pre Processing")
            self.file_list.append(index_path)

        def archive_outputs():
            # Hand everything collected so far to the folder helper.
            helpers.create_folder(self.working_directory, self.file_list,
                                  map_type=self.map_type, step="PreProcess",
                                  folder_directory=self.folder_directory)

        def marked_bams():
            return [name for name in self.file_list if "MDUP" in name and "bam" in name]

        mode = self.split_chr

        if mode == "After":
            merged = self.merge_bams(info_dict, all_bam_files)
            self.file_list.append(merged)
            index_and_track(merged, "Create Index by Merge")
            for chunk in split_bam_by_chr(merged):
                # Suffix starting at "_Chr_" identifies the chromosome of this chunk
                suffix = chunk[chunk.find("_Chr_"):]
                dedup = self.mark_duplicate(chunk, suffix)
                self.file_list.append(dedup)
                index_and_track(dedup, "Create Index by MarkDuplicate")
            archive_outputs()
            return marked_bams()

        if mode == "Before":
            for bam in all_bam_files:
                split_bam_by_chr(bam)
            by_chromosome = get_bam_by_chr()
            print("preprocess line 128")
            print(by_chromosome)
            for key in by_chromosome:
                group = by_chromosome[key]
                merged = self.merge_bams(info_dict, group)
                self.file_list.append(merged)
                index_and_track(merged, "Create Index by Merge")
                first_name = group[0]
                suffix = first_name[first_name.find("_Chr_"):]
                dedup = self.mark_duplicate(merged, suffix)
                self.file_list.append(dedup)
                index_and_track(dedup, "Create Index by MarkDuplicate")
                # The folder helper runs once per group, inside the loop
                archive_outputs()
            return marked_bams()

        # Remaining case, i.e. self.split_chr == "No"
        if self.map_type == "Novoalign":
            dedup = self.novoalign_sort_markduplicate(info_dict, all_bam_files)
            archive_outputs()
            return dedup

        merged = self.merge_bams(info_dict, all_bam_files)
        merged_index = helpers.create_index(merged, "Create Index by Merge", self.threads, "Pre Processing")
        self.file_list.append(merged)
        self.file_list.append(merged_index)
        dedup = self.mark_duplicate(merged, "")
        print("preprocess mark duplicate file ")
        print(dedup)
        self.file_list.append(dedup)
        index_and_track(dedup, "Create Index by MarkDuplicate")
        archive_outputs()
        return dedup


# if __name__ == "__main__":
#     pre_processing_step = PreProcessing(working_directory="/home/bioinformaticslab/Desktop/GitHub_Repos/Genomics_Pipeline_Test/test_files",
#                            map_type="Bwa", sample_type="Tumor", library_matching_id="203", thrds="1", issplitchr="Before")
#
#     mapping_step = mapping.Mapping(working_directory=pre_processing_step.main_directory,
#         map_type="Bwa", sample_type="Tumor", library_matching_id="203", thrds="3")
#
#     fastq_list = mapping_step.get_fastq()
#     info_dict = mapping_step.get_info(fastq_list)
#     os.chdir(pre_processing_step.working_directory)
#     bam_files = glob.glob("SortedBAM*.bam")
#     mark_duplicate_file = pre_processing_step.pre_process(info_dict, bam_files)
#     print(mark_duplicate_file)
    def pre_process(self, info_dict, all_bam_files):
        """Merge, index and duplicate-mark BAM files according to ``self.split_chr``.

        * ``"After"``: merge all inputs, index the merged file, split it with
          ``split_bam_by_chr`` and mark duplicates on every split file.
        * ``"Before"``: split every input with ``split_bam_by_chr``, then merge,
          index and mark duplicates for each group returned by ``get_bam_by_chr``.
        * any other value: merge all inputs, index the merged file and mark
          duplicates once.

        Produced file names are appended to ``self.file_list`` and handed to
        ``helpers.create_folder`` for the "PreProcess" step.

        Returns:
            For "After"/"Before": the entries of ``self.file_list`` containing both
            "MDUP" and "bam". Otherwise: the value returned by
            ``self.mark_duplicate``.
        """
        step_label = "Pre Processing"

        def add_index(bam_path, description):
            # Create an index for ``bam_path`` and record the returned value.
            created = helpers.create_index(bam_path, description,
                                           self.threads, step_label)
            self.file_list.append(created)

        def file_outputs():
            # Pass everything collected so far to the folder helper.
            helpers.create_folder(self.working_directory,
                                  self.file_list,
                                  map_type=self.map_type,
                                  step="PreProcess",
                                  folder_directory=self.folder_directory)

        def duplicate_marked():
            return [
                entry for entry in self.file_list
                if "MDUP" in entry and "bam" in entry
            ]

        split_mode = self.split_chr

        if split_mode == "After":
            merged_bam = self.merge_bams(info_dict, all_bam_files)
            self.file_list.append(merged_bam)
            add_index(merged_bam, "Create Index by Merge")
            for part in split_bam_by_chr(merged_bam):
                # Everything from "_Chr_" onwards names the chromosome
                chr_suffix = part[part.find("_Chr_"):]
                marked = self.mark_duplicate(part, chr_suffix)
                self.file_list.append(marked)
                add_index(marked, "Create Index by MarkDuplicate")
            file_outputs()
            return duplicate_marked()

        if split_mode == "Before":
            for source_bam in all_bam_files:
                split_bam_by_chr(source_bam)
            chr_groups = get_bam_by_chr()
            print("preprocess line 128")
            print(chr_groups)
            for chr_key in chr_groups:
                members = chr_groups[chr_key]
                merged_bam = self.merge_bams(info_dict, members)
                self.file_list.append(merged_bam)
                add_index(merged_bam, "Create Index by Merge")
                leading = members[0]
                chr_suffix = leading[leading.find("_Chr_"):]
                marked = self.mark_duplicate(merged_bam, chr_suffix)
                self.file_list.append(marked)
                add_index(marked, "Create Index by MarkDuplicate")
                # The folder helper runs once per group, inside the loop
                file_outputs()
            return duplicate_marked()

        # Remaining case, i.e. self.split_chr == "No"
        merged_bam = self.merge_bams(info_dict, all_bam_files)
        merge_index = helpers.create_index(merged_bam,
                                           "Create Index by Merge",
                                           self.threads, step_label)
        self.file_list.append(merged_bam)
        self.file_list.append(merge_index)
        marked = self.mark_duplicate(merged_bam, "")
        print("preprocess mark duplicate file ")
        print(marked)
        self.file_list.append(marked)
        add_index(marked, "Create Index by MarkDuplicate")
        file_outputs()
        return marked
Beispiel #6
0
def main(argv=None):
    '''Command line entry point: plot the light curves found in a folder.

    Parses ``-p/--path`` (folder with light curves) and ``-o/--output``
    (folder for the images, defaulting to ``<path>/images``), loads the stars
    through ``StarsProvider`` and saves their plots with ``plotStarsPicture``.

    Args:
        argv: list of command line arguments; ``sys.argv[1:]`` when ``None``.

    Returns:
        ``False`` when called without any arguments (only the program info is
        printed), ``2`` when any exception was raised while processing
        (including a missing ``--path``), ``None`` on success.
    '''

    program_info = """ABOUT
    
    The script creates images of light curves in specified folder. Output images 
    are stored by default in the folder of light curve files into "images", but
    path to another directory can be specified.
    
    Note:
    -----
        All paths are relative to location of execution.    
    
    Example:
    --------
        ./src/bin/plot_lcs.py -p data/light_curves/some_stars -o my_images/nonvar_stars
    """

    program_name = os.path.basename(sys.argv[0])
    program_version = "v0.1"
    program_build_date = "%s" % __updated__

    # "%%prog" leaves a literal "%prog" for optparse to expand.
    program_version_string = '%%prog %s (%s)' % (program_version,
                                                 program_build_date)
    program_longdesc = "Run script without paramas to get info about the program."
    program_license = "Copyright 2016 Martin Vo"

    if argv is None:
        argv = sys.argv[1:]
    try:
        # setup option parser
        parser = OptionParser(version=program_version_string,
                              epilog=program_longdesc,
                              description=program_license)
        parser.add_option(
            "-o",
            "--output",
            dest="output",
            default=None,
            help="Relative path to the folder where images will be saved")
        # The destination must be "path" because the value is read back as
        # opts.path below; the descriptive text belongs in the help string.
        parser.add_option("-p",
                          "--path",
                          dest="path",
                          help="Relative path to the folder of light curves")

        # process options
        opts, args = parser.parse_args(argv)

        if not argv:
            print(program_info, "\n")
            print("Run with '-h' in order to show params help\n")
            return False

        if opts.path:
            path = opts.path
        else:
            raise InvalidFilesPath("There is no path %s" % opts.path)

        if opts.output:
            save_path = opts.output
        else:
            save_path = os.path.join(path, "images")

        stars = StarsProvider().getProvider(obtain_method="FileManager",
                                            obtain_params={
                                                "path": "HERE:%s" % path
                                            }).getStarsWithCurves()

        print(
            "\n\nThere are %i stars in the folder which will be plotted into %s.\nThis will take a while..."
            % (len(stars), save_path))
        create_folder(save_path)
        plotStarsPicture(stars, option="save", save_loc=save_path)

        print("\n%s\nImages of light curves in %s were saved into %s" %
              ("=" * 20, path, save_path))

    except Exception as e:
        # Top-level boundary: report the problem on stderr and signal failure
        # through the return value instead of propagating the exception.
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help")
        return 2
    def run_pipeline(self):
        """Run the configured variant caller and hand its outputs to the folder helper.

        In tumor-only mode only the "Mutect2" caller is handled. Otherwise
        "Mutect2", "Varscan", "Mutect2_gatk3", "Haplotype" and "Strelka" are
        handled. For every caller except Strelka the files matching "*.vcf*"
        in the current directory are passed to ``helpers.create_folder``;
        Strelka uses ``helpers.create_strelka_folder`` instead.

        Returns:
            ``self.folder_directory + "/" + <step name>`` for a handled caller,
            ``False`` for any other mode/caller combination.
        """

        def collect_vcfs(step_name):
            # Pass the VCF files of the current directory to the folder helper
            # and build the path that is reported back to the caller.
            vcf_files = glob.glob("*.vcf*")
            helpers.create_folder(self.working_directory,
                                  vcf_files,
                                  map_type=self.map_type,
                                  step=step_name,
                                  folder_directory=self.folder_directory)
            return self.folder_directory + "/" + step_name

        if self.tumor_only_mode:  # Tumor only variant calling
            if self.v_caller == "Mutect2":
                self.mutect_tumor_only()
                return collect_vcfs("Mutect2")
            return False

        # Tumor and Germline bam: (caller name, method to run, step/folder name)
        paired_callers = (
            ("Mutect2", "mutect_caller", "Mutect2"),
            ("Varscan", "varscan_caller", "Varscan"),
            ("Mutect2_gatk3", "mutect_caller_gatk3", "Mutect2_GATK3"),
            ("Haplotype", "gatk_haplotype", "Haplotype"),
        )
        for caller_name, method_name, step_name in paired_callers:
            if self.v_caller == caller_name:
                getattr(self, method_name)()
                return collect_vcfs(step_name)

        if self.v_caller == "Strelka":
            self.strelka_caller()
            helpers.create_strelka_folder(self.folder_directory,
                                          self.output_name)
            return self.folder_directory + "/" + "Strelka"

        return False