Exemple #1
0
    def index(self,
              genome_dir,
              genome_fasta,
              annotation_gtf=None,
              feature_from_gtf_to_use_as_exon=None,
              junction_tab_file=None,
              sjdboverhang=None,
              genomeSAindexNbases=None,
              genomeChrBinNbits=None,
              genome_size=None):

        FileRoutines.safe_mkdir(genome_dir)

        options = "--runMode genomeGenerate"
        options += " --genomeDir %s" % os.path.abspath(genome_dir)
        options += " --runThreadN %i" % self.threads
        options += " --genomeFastaFiles %s" % (
            os.path.abspath(genome_fasta) if isinstance(genome_fasta, str) else
            " ".join(map(os.path.abspath, genome_fasta)))
        options += " --sjdbGTFfile %s" % annotation_gtf if annotation_gtf else ""
        options += " --sjdbFileChrStartEnd %s" % junction_tab_file if junction_tab_file else ""
        options += " --sjdbOverhang %i" % sjdboverhang if sjdboverhang else ""  # number of bases taken from both sides of splice junction. 100 by default
        if genome_size:
            options += " --genomeSAindexNbases %i" % min(
                [14, (floor(log(genome_size, 2) / 2)) - 1])
        else:
            options += " --genomeSAindexNbases %i" % genomeSAindexNbases if genomeSAindexNbases else ""  # size of k-mers used for preindexing of suffix array
        options += " --genomeChrBinNbits %i" % genomeChrBinNbits if genomeChrBinNbits else ""  # padding size (log2) of reference sequences. 18 by default
        # recommended value min(18, log2(GenomeLength/NumberOfScaffolds))
        options += " --sjdbGTFfeatureExon %s" % feature_from_gtf_to_use_as_exon if feature_from_gtf_to_use_as_exon else ""
        self.execute(options)
Exemple #2
0
    def parallel_align(self, list_of_files, output_dir, msa_tool='prank',
                       seq_type=None, bootstrap_number=100, genetic_code=1, threads=None,
                       msa_tool_options=None, seq_cutoff=None, col_cutoff=None, mafft_bin=None,
                       prank_bin=None, muscle_bin=None, pagan_bin=None, ruby_bin=None, program=None,
                       cmd_log_file=None,
                       cpus_per_task=1,
                       handling_mode="local",
                       job_name=None,
                       log_prefix=None,
                       error_log_prefix=None,
                       max_jobs=None,
                       max_running_time=None,
                       max_memory_per_node=None,
                       max_memmory_per_cpu=None,
                       modules_list=None,
                       environment_variables_dict=None):

        common_options = self.parse_common_options(output_dir=output_dir, msa_tool=msa_tool,
                                                   seq_type=seq_type, bootstrap_number=bootstrap_number,
                                                   genetic_code=genetic_code, threads=threads,
                                                   msa_tool_options=msa_tool_options, seq_cutoff=seq_cutoff,
                                                   col_cutoff=col_cutoff, mafft_bin=mafft_bin,
                                                   prank_bin=prank_bin, muscle_bin=muscle_bin,
                                                   pagan_bin=pagan_bin, ruby_bin=ruby_bin, program=program)

        FileRoutines.safe_mkdir(output_dir)
        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            op = common_options
            op += " --seqFile %s" % filename
            op += " --dataset %s" % basename
            options_list.append(op)
        if handling_mode == "local":
            self.parallel_execute(options_list)
        elif handling_mode == "slurm":

            cmd_list = ["%s%s %s" % ((self.path + "/") if self.path else "", self.cmd, options) for options in options_list]
            self.slurm_run_multiple_jobs_in_wrap_mode(cmd_list,
                                                      cmd_log_file,
                                                      max_jobs=max_jobs,
                                                      job_name=job_name,
                                                      log_prefix=log_prefix,
                                                      error_log_prefix=error_log_prefix,
                                                      cpus_per_node=None,
                                                      max_running_jobs=None,
                                                      max_running_time=max_running_time,
                                                      cpus_per_task=cpus_per_task,
                                                      max_memory_per_node=max_memory_per_node,
                                                      max_memmory_per_cpu=max_memmory_per_cpu,
                                                      modules_list=modules_list,
                                                      environment_variables_dict=environment_variables_dict)
Exemple #3
0
    def parallel_align(self, list_of_files, output_directory, output_suffix=None, tree_file=None, output_format=None, show_xml=None,
                       show_tree=None, show_ancestral_sequences=None, show_evolutionary_events=None,
                       showall=None, compute_posterior_support=None, njtree=None, skip_insertions=False,
                       codon_alignment=None, translated_alignment=None,
                       cmd_log_file=None,
                       cpus_per_task=1,
                       handling_mode="local",
                       job_name=None,
                       log_prefix=None,
                       error_log_prefix=None,
                       max_jobs=None,
                       max_running_time=None,
                       max_memory_per_node=None,
                       max_memmory_per_cpu=None,
                       modules_list=None,
                       environment_variables_dict=None):

        common_options = self.parse_common_options(tree_file=tree_file, output_format=output_format, show_xml=show_xml,
                                                   show_tree=show_tree, show_ancestral_sequences=show_ancestral_sequences,
                                                   show_evolutionary_events=show_evolutionary_events, showall=showall,
                                                   compute_posterior_support=compute_posterior_support, njtree=njtree,
                                                   skip_insertions=skip_insertions, codon_alignment=codon_alignment,
                                                   translated_alignment=translated_alignment)

        FileRoutines.safe_mkdir(output_directory)
        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            op = common_options
            op += " -d=%s" % filename
            op += " -o=%s/%s.fasta" % (output_directory,
                                       ("%s_%s" % (basename, output_suffix)) if output_suffix else basename)
            options_list.append(op)
        if handling_mode == "local":
            self.parallel_execute(options_list)
        elif handling_mode == "slurm":

            cmd_list = ["%s%s %s" % ((self.path + "/") if self.path else "", self.cmd, options) for options in options_list]
            self.slurm_run_multiple_jobs_in_wrap_mode(cmd_list,
                                                      cmd_log_file,
                                                      max_jobs=max_jobs,
                                                      job_name=job_name,
                                                      log_prefix=log_prefix,
                                                      error_log_prefix=error_log_prefix,
                                                      cpus_per_node=None,
                                                      max_running_jobs=None,
                                                      max_running_time=max_running_time,
                                                      cpus_per_task=cpus_per_task,
                                                      max_memory_per_node=max_memory_per_node,
                                                      max_memmory_per_cpu=max_memmory_per_cpu,
                                                      modules_list=modules_list,
                                                      environment_variables_dict=environment_variables_dict)
 def read(self, input_file, filetype="fasta", verbose=False):
     list_of_files = FileRoutines.make_list_of_path_to_files(input_file)
     for filename in list_of_files:
         if verbose:
             print("Parsing %s ..." % filename)
         directory, basename, extension = FileRoutines.split_filename(filename)
         try:
             self.records[basename] = MultipleAlignmentStatRecord(basename, alignment=AlignIO.read(filename, filetype))
             self.record_id_list.append(basename)
         except:
             raise ValueError("ERROR: Issues while parsing or calculating stats for %s!!!" % filename)
     # collectiontype-dependent function
     pass
Exemple #5
0
def RepeatModeler_search(query_file, db_name, output_file="run.out",
                         num_of_threads=5, RepeatModeler_dir=""):
    print("\nRepeatModeler search...\n")
    repmod_dir = FileRoutines.check_path(RepeatModeler_dir)
    os.system(repmod_dir + "BuildDatabase -engine ncbi  -name %s %s" % (db_name, query_file))
    os.system(repmod_dir + "RepeatModeler -engine ncbi -pa %i -database %s > %s"
              % (num_of_threads, db_name, output_file))
Exemple #6
0
def TRF_search(query_file, match=2, mismatch=7, delta=7, PM=80,
               PI=10, minscore=50, max_period=500, flanked=False, TRF_dir=""):

    print("\nTRF search...\n")
    #use: trf File Match Mismatch Delta PM PI Minscore MaxPeriod [options]
    #Where: (all weights, penalties, and scores are positive)
    # File = sequences input file
    # Match = matching weight
    # Mismatch = mismatching penalty
    # Delta = indel penalty
    # PM = match probability (whole number)
    # PI = indel probability (whole number)
    # Minscore = minimum alignment score to report
    # MaxPeriod = maximum period size to report
    # [options] = one or more of the following :
    # -m masked sequence file
    # -f flanking sequence
    # -d data file
    # -h suppress HTML output
    #Recomended options: trf yoursequence.txt 2 7 7 80 10 50 500 -f -d -m
    flanking = ""
    if flanked:
        flanking = "-f"

    trf_path = FileRoutines.check_path(TRF_dir)
    os.system(trf_path + "trf %s %i %i %i %i %i %i %i %s -d -m"
              % (query_file, match, mismatch, delta, PM, PI, minscore, max_period, flanking))
Exemple #7
0
    def parallel_align(self,
                       list_of_files,
                       output_directory,
                       output_suffix="alignment",
                       gap_open_penalty=None,
                       offset=None,
                       maxiterate=None,
                       quiet=True,
                       mode="globalpair",
                       number_of_processes=1,
                       anysymbol=False):
        # TODO: add rest of options

        options = " --thread %i" % self.threads
        options += " --op %f" % gap_open_penalty if gap_open_penalty is not None else ""
        options += " --ep %f" % offset if offset is not None else ""
        options += " --maxiterate %i" % maxiterate if maxiterate is not None else ""
        options += " --quiet" if quiet else ""
        options += " --%s" % mode
        options += " --anysymbol" if anysymbol else ""
        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            op = options
            op += " %s" % filename
            op += " > %s/%s.fasta" % (output_directory,
                                      ("%s_%s" % (basename, output_suffix))
                                      if output_suffix else basename)
            options_list.append(op)

        self.parallel_execute(options_list, threads=number_of_processes)
Exemple #8
0
    def parallel_blast(self, blast_command, seqfile, database, outfile=None,
                       blast_options=None, split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None, output_format=None,
                       threads=None, num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):

        splited_dir = FileRoutines.check_path(split_dir)
        splited_out_dir = FileRoutines.check_path(splited_output_dir)
        self.safe_mkdir(splited_dir)
        self.safe_mkdir(splited_out_dir)

        number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan else 5 * threads if threads else 5 * self.threads
        self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
        input_list_of_files = sorted(os.listdir(splited_dir))
        list_of_files = []

        for filename in input_list_of_files:
            filename_prefix = FileRoutines.split_filename(filename)[1]

            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)

            list_of_files.append((input_file, output_file))

        options_list = []
        out_files = []

        for in_file, out_filename in list_of_files:

            options = " -out %s" % out_filename

            options += " -db %s" % database
            options += " -query %s" % in_file
            options += " %s" % blast_options if blast_options else ""
            options += " -evalue %s" % evalue if evalue else ""
            options += " -outfmt %i" % output_format if output_format else ""
            options_list.append(options)
            out_files.append(out_filename)

        self.parallel_execute(options_list, cmd=blast_command, threads=threads, async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
Exemple #9
0
    def read(self, vep_tab_file):

        with FileRoutines.metaopen(vep_tab_file, "r") as in_fd:
            metadata = []
            while True:
                line = in_fd.readline()
                if line[:19] == "#Uploaded_variation":
                    header_list = line.strip()[1:].split("\t")
                    break
                metadata.append(line)

            return pd.read_csv(in_fd, sep='\t', names=header_list)
Exemple #10
0
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        from RouToolPa.Routines import SequenceRoutines

        fam_id_list = IdList()
        fam_dict = SynDict()
        #print(pep_file)
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id in fam_dict:
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
            else:
                print("%s was not found" % fam_id)

        os.remove("tmp.idx")
Exemple #11
0
def RepeatMasker_search(query_file, species, custom_lib_path=None, RepeatMasker_dir="",
                        num_of_threads=5, search_type="-s"):

    #species: see list of possible species in repeatmasker.help coming with RepeatMasker
    #search type: "-s" (sensetive), "" (default), "-q" (fast), "-qq" (very fast)

    repmask_dir = FileRoutines.check_path(RepeatMasker_dir)
    custom_lib = ""
    if custom_lib_path:
        cuatom_lib = "-lib %s" % custom_lib_path

    #additional options:
    #-xm    creates an additional output file in cross_match format (for parsing)
    #-ace   creates an additional output file in ACeDB format
    #-gff   creates an additional Gene Feature Finding format
    #-excln The percentages displayed in the .tbl file are calculated using a
    #       total sequence length excluding runs of 25 Ns or more.
    print("\nRepeatMasker search...\n")
    os.system(repmask_dir + "RepeatMasker -excln -xm -ace -gff %s -pa %i -species %s %s %s"
              % (custom_lib, num_of_threads, species, search_type, query_file))
Exemple #12
0
def windowmasker_search(windowmasker_dir):
    winmask_dir = FileRoutines.check_path(windowmasker_dir)
    #TODO: write this function
    pass
Exemple #13
0
def make_fasta_dict(fasta_file, dict_name, PICARD_dir=""):
    picard_dir = FileRoutines.check_path(PICARD_dir)
    os.system("java -jar %sCreateSequenceDictionary.jar R= %s O= %s" %
              (picard_dir, fasta_file, dict_name))
Exemple #14
0
def extract_repbase(species, output_file="RepBase.fasta", RepeatMaskerUtils_dir=""):
    print("\nExtracting RepBase for %s\n" % species)
    repmaskutils_dir = FileRoutines.check_path(RepeatMaskerUtils_dir)
    os.system(repmaskutils_dir + "queryRepeatDatabase.pl -species %s > %s" % (species, output_file))
Exemple #15
0
def rmout2gff3(rmoutfile, outfile, RepeatMaskerUtils_dir=""):
    repmaskutils_dir = FileRoutines.check_path(RepeatMaskerUtils_dir)
    os.system(repmaskutils_dir + "rmOutToGFF3.pl %s > %s" % (rmoutfile, outfile))
Exemple #16
0
    def align_samples(self,
                      samples_dir,
                      output_dir,
                      genome_dir,
                      genome_fasta=None,
                      samples=None,
                      annotation_gtf=None,
                      sjdboverhang=None,
                      genomeSAindexNbases=None,
                      genomeChrBinNbits=None,
                      genome_size=None,
                      feature_from_gtf_to_use_as_exon=None,
                      exon_tag_to_use_as_transcript_id=None,
                      exon_tag_to_use_as_gene_id=None,
                      length_of_sequences_flanking_junction=None,
                      junction_tab_file_list=None,
                      three_prime_trim=None,
                      five_prime_trim=None,
                      adapter_seq_for_three_prime_clip=None,
                      max_mismatch_percent_for_adapter_trimming=None,
                      three_prime_trim_after_adapter_clip=None,
                      output_type="BAM",
                      sort_bam=True,
                      max_memory_per_thread_for_bam_sorting="4G",
                      include_unmapped_reads_in_bam=True,
                      output_unmapped_reads=True,
                      two_pass_mode=True,
                      max_intron_length=None,
                      input_is_se=None,
                      filename_fragment_to_mark_se_reads=".se."):
        #STAR.threads = threads
        #STAR.path = star_dir

        if genome_fasta:
            STAR.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=annotation_gtf,
                       junction_tab_file=junction_tab_file_list,
                       sjdboverhang=sjdboverhang,
                       genomeSAindexNbases=genomeSAindexNbases,
                       genomeChrBinNbits=genomeChrBinNbits,
                       genome_size=genome_size)

        sample_list = samples if samples else self.get_sample_list(samples_dir)

        FileRoutines.safe_mkdir(output_dir)

        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_dir, sample)
            alignment_sample_dir = "%s/%s/" % (output_dir, sample)
            FileRoutines.safe_mkdir(alignment_sample_dir)
            filetypes, forward_files, reverse_files, se_files = FileRoutines.make_lists_forward_and_reverse_files(
                sample_dir,
                filename_fragment_to_mark_se_reads=
                filename_fragment_to_mark_se_reads,
                input_is_se=input_is_se)
            #print (se_files)
            print("\tAligning reads...")

            self.align(
                genome_dir,
                forward_files if forward_files else se_files,
                reverse_read_list=reverse_files,
                annotation_gtf=annotation_gtf if not genome_fasta else None,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=
                exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=
                length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=
                adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=
                max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=
                three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_per_thread_for_bam_sorting=
                max_memory_per_thread_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)
Exemple #17
0
    def parallel_search_tandem_repeat(self, query_file, output_prefix, matching_weight=2, mismatching_penalty=7,
                                      indel_penalty=7,
                                      match_probability=80, indel_probability=10, min_alignment_score=50, max_period=500,
                                      report_flanking_sequences=False, splited_fasta_dir="splited_fasta_dir",
                                      splited_result_dir="splited_output", converted_output_dir="converted_output",
                                      max_len_per_file=100000, store_intermediate_files=False, max_repeat_length=None):
        work_dir = os.getcwd()
        splited_filename = FileRoutines.split_filename(query_file)
        self.split_fasta_by_seq_len(query_file, splited_fasta_dir, max_len_per_file=max_len_per_file,
                                    output_prefix=splited_filename[1])

        common_options = self.parse_common_options(matching_weight=matching_weight,
                                                   mismatching_penalty=mismatching_penalty,
                                                   indel_penalty=indel_penalty, match_probability=match_probability,
                                                   indel_probability=indel_probability,
                                                   min_alignment_score=min_alignment_score,
                                                   max_period=max_period,
                                                   report_flanking_sequences=report_flanking_sequences,
                                                   make_dat_file=True, max_repeat_length=max_repeat_length)
        common_options += " -h"  # suppress html output
        options_list = []
        splited_files = os.listdir(splited_fasta_dir)

        FileRoutines.safe_mkdir(splited_result_dir)
        FileRoutines.safe_mkdir(converted_output_dir)
        os.chdir(splited_result_dir)

        input_dir = splited_fasta_dir if (splited_fasta_dir[0] == "/") or (splited_fasta_dir[0] == "~") \
                    else "../%s" % splited_fasta_dir

        for filename in splited_files:
            file_options = "%s/%s" % (input_dir, filename)
            file_options += common_options
            options_list.append(file_options)

        self.parallel_execute(options_list)

        os.chdir(work_dir)
        trf_output_file_list = []
        for filename in splited_files:

            trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (splited_result_dir, filename,
                                                                  matching_weight, mismatching_penalty,
                                                                  indel_penalty, match_probability,
                                                                  indel_probability,
                                                                  min_alignment_score, max_period)
            trf_output_file_list.append(trf_output_file)

        trf_report = self.convert_trf_report(trf_output_file_list, output_prefix)
        """
        for suffix in (".rep", ".gff", ".simple.gff", ".short.tab", ".wide.tab", ".with_rep_seqs.gff", ".fasta"):
            file_str = ""
            merged_file = "%s%s" % (output_prefix, suffix)
            for filename in splited_files:
                file_str += " %s/%s%s" % (converted_output_dir, filename, suffix)
            CGAS.cat(file_str, merged_file)
        """
        compress_splited_out_string = "tar czf %s.splited_output.tar.gz %s" % (output_prefix, splited_result_dir)
        os.system(compress_splited_out_string)

        if not store_intermediate_files:
            shutil.rmtree(splited_fasta_dir)
            shutil.rmtree(splited_result_dir)
            shutil.rmtree(converted_output_dir)

        return trf_report
Exemple #18
0
    def draw_window_density_distribution(self, count_dict, window_size, output_prefix=None, suptitle="SNP density distribution",
                                         density_multiplicator=1000,
                                         number_of_bins=None, width_of_bins=None,
                                         max_threshold=None, min_threshold=None,
                                         scaffold_black_list=[], scaffold_white_list=[],
                                         sort_scaffolds=False, scaffold_ordered_list=None, subplot_size=4,
                                         per_scaffold_histo_dir="per_scaffold_histo_dir/",
                                         subplot_tuple=None, share_x_axis=True, share_y_axis=True,
                                         extensions=("png",), show_mean_and_median=True):
        """
        scaffold_threshold: if number of scaffolds is higher draw only separated_histograms
        """
        samples_list = count_dict.keys()
        final_scaffold_list = self.get_filtered_scaffold_list(count_dict,
                                                              scaffold_black_list=scaffold_black_list,
                                                              sort_scaffolds=sort_scaffolds,
                                                              scaffold_ordered_list=scaffold_ordered_list,
                                                              scaffold_white_list=scaffold_white_list)

        scaffold_number = len(final_scaffold_list)

        FileRoutines.safe_mkdir(per_scaffold_histo_dir)

        xlabel = "Number of SNPs"
        ylabel = "Number of windows"

        scaled_count_dict = OrderedDict()

        empty_windows_scaffold_dict = OrderedDict()
        for scaffold in final_scaffold_list:
            for sample in count_dict:
                if scaffold not in count_dict[sample]:
                    continue
                empty_windows_scaffold_dict[scaffold] = np.zeros(len(count_dict[sample][scaffold]))
                break

        for sample in samples_list:
            scaled_count_dict[sample] = OrderedDict()
            for scaffold in final_scaffold_list:
                if scaffold not in count_dict[sample]:
                    scaled_count_dict[sample][scaffold] = empty_windows_scaffold_dict[scaffold]
                scaled_count_dict[sample][scaffold] = np.array(map(float, count_dict[sample][scaffold])) * density_multiplicator / window_size

        print("Drawing separated histograms for each scaffold...")
        extended_label_dict = OrderedDict()
        for scaffold in final_scaffold_list:
            print("Drawing histogram for scaffold %s" % scaffold)
            #scaffold_data = [scaled_count_dict[sample][scaffold] if scaffold in scaled_count_dict[sample] else empty_windows_scaffold_dict[scaffold] for sample in samples_list]
            scaffold_data = [scaled_count_dict[sample][scaffold] for sample in samples_list]

            out_prefix = "%s/%s.%s" % (per_scaffold_histo_dir, output_prefix, scaffold) if output_prefix else "%s/%s" % (per_scaffold_histo_dir, scaffold)
            for sample in samples_list:
                median = np.median(scaled_count_dict[sample][scaffold])
                mean = np.mean(scaled_count_dict[sample][scaffold])
                extended_label = "%s: Med. %.2f, Avg: %.2f" % (sample, float(median), float(mean))
                print(extended_label)
                if scaffold in extended_label_dict:
                    extended_label_dict[scaffold].append(extended_label)
                else:
                    extended_label_dict[scaffold] = [extended_label]
            #print scaffold_data
            self.draw_histogram(scaffold_data, output_prefix=out_prefix, number_of_bins=number_of_bins,
                                width_of_bins=width_of_bins,
                                max_threshold=max_threshold, min_threshold=min_threshold,
                                xlabel=xlabel, ylabel=ylabel,
                                title=scaffold, extensions=extensions, ylogbase=None, subplot=None, suptitle=None,
                                close_figure=True,
                                data_label_list=extended_label_dict[scaffold] if show_mean_and_median else samples_list)
        #print scaled_count_dict
        print("Drawing histograms for all scaffolds on same figure...")
        data = list(recursive_generator(scaled_count_dict))
        min_value = min(data) if data else 0
        max_value = max(data) if data else 0
        #print len(scaled_count_dict)
        #print data
        bin_array = self.generate_bin_array(data, y_list=None, bin_number=number_of_bins,
                                            bin_width=width_of_bins, bin_array=None,
                                            min_x_value=min_threshold, max_x_value=max_threshold, min_y_value=None,
                                            max_y_value=None, add_max_value=True)

        plt.suptitle(suptitle)
        if subplot_tuple is None:
            side = math.sqrt(scaffold_number)
            rounded_side = int(side)
            side = rounded_side + 1 if side % rounded_side else rounded_side
            subplot_tupleee = (side, side)
            #print subplot_tupleee
        else:
            subplot_tupleee = subplot_tuple

            if len(subplot_tupleee) != 2:
                raise ValueError("Subplot tuple should contain exactly two values, not %i!" % len(subplot_tuple))
            if not (isinstance(subplot_tuple[0], int) and isinstance(subplot_tuple[1], int)):
                raise ValueError("Subplot tuple should contain two values, not (%s, %s)!" % (str(type(subplot_tuple[0])),
                                                                                             str(type(subplot_tuple[1]))))

        figure = plt.figure(256, figsize=(subplot_size * subplot_tupleee[0],
                                          subplot_size * subplot_tupleee[1]),
                            dpi=200)
        print (subplot_size * subplot_tupleee[0], subplot_size * subplot_tupleee[1])
        number_of_subplots = subplot_tupleee[0] * subplot_tupleee[1]
        subplot_list = []
        for dataset_index in range(0, len(final_scaffold_list)):
            scaffold = final_scaffold_list[dataset_index]
            if dataset_index > 0:
                if share_x_axis and share_y_axis:
                    subplot_list.append(figure.add_subplot(subplot_tupleee[0],
                                                           subplot_tupleee[1],
                                                           dataset_index + 1,
                                                           sharex=subplot_list[0],
                                                           sharey=subplot_list[0]))
                elif share_x_axis:
                    subplot_list.append(figure.add_subplot(subplot_tupleee[0],
                                                           subplot_tupleee[1],
                                                           dataset_index + 1,
                                                           sharex=subplot_list[0]))
                elif share_y_axis:
                    subplot_list.append(figure.add_subplot(subplot_tupleee[0],
                                                           subplot_tupleee[1],
                                                           dataset_index + 1,
                                                           sharex=subplot_list[0],
                                                           sharey=subplot_list[0]))
                else:
                    subplot_list.append(figure.add_subplot(subplot_tupleee[0],
                                                           subplot_tupleee[1],
                                                           dataset_index + 1))
            else:
                subplot_list.append(figure.add_subplot(subplot_tupleee[0],
                                                       subplot_tupleee[1],
                                                       dataset_index + 1))
            #print dataset_index + 1
            #print subplot_tupleee[0] * (subplot_tupleee[1] - 1)
            #print ((dataset_index + 1) > (subplot_tupleee[0] * (subplot_tupleee[1] - 1)))

            histo = self.draw_histogram([scaled_count_dict[sample][scaffold] for sample in samples_list],
                                        number_of_bins=None,
                                        width_of_bins=None, max_threshold=None,
                                        min_threshold=None,
                                        xlabel=xlabel if ((dataset_index + 1) > (subplot_tupleee[0] * (subplot_tupleee[1] - 1))) else None,
                                        ylabel=ylabel if ((dataset_index + 1) % subplot_tupleee[0]) == 1 else None,
                                        title=scaffold, extensions=("png",), ylogbase=None,
                                        subplot=subplot_list[dataset_index],
                                        suptitle=None,
                                        data_label_list=extended_label_dict[scaffold] if show_mean_and_median else samples_list,
                                        bin_array=bin_array)
            plt.xlim(xmin=min_threshold if min_threshold and (min_threshold >= min_value) else min_value,
                     xmax=max_threshold if max_threshold and (max_threshold <= max_value) else max_value)
            #print histo
            """
            if output_prefix:
                output_histo_file = "%s.%s.%shisto" % (output_prefix,
                                                       dataset_index if parameters[8] is None else parameters[10],
                                                       ("log%i." % parameters[7]) if parameters[7] else "")
                np.savetxt(output_histo_file, histo, fmt="%i\t%i")
            """
        plt.tight_layout()
        if output_prefix:
            for ext in extensions:
                plt.savefig("%s.%s" % (output_prefix, ext))
        plt.close(figure)

        print("Drawing combined histogram for all scaffolds...")

        combined_count_dict = OrderedDict()
        extended_combined_label_list = []
        for sample in samples_list:
            combined_count_dict[sample] = []
            for scaffold in count_dict[sample]:
                combined_count_dict[sample] = combined_count_dict[sample] + count_dict[sample][scaffold]

            combined_count_dict[sample] = np.array(map(float, combined_count_dict[sample]))* density_multiplicator / window_size

            median = np.median(combined_count_dict[sample])
            mean = np.mean(combined_count_dict[sample])
            extended_label = "%s: Med. %.2f, Avg: %.2f" % (sample, float(median), float(mean))
            print(extended_label)
            extended_combined_label_list.append(extended_label)

        #print combined_count_dict
        figure = plt.figure(384, figsize=(8,8))
        self.draw_histogram([combined_count_dict[sample] for sample in combined_count_dict],
                            output_prefix="%s.combined" % output_prefix if output_prefix else "combined",
                            number_of_bins=number_of_bins,
                            width_of_bins=width_of_bins,
                            max_threshold=max_threshold, min_threshold=min_threshold,
                            xlabel=xlabel, ylabel=ylabel,
                            title="SNP density distribution(all scaffolds)",
                            extensions=extensions, ylogbase=None, subplot=None, suptitle=None,
                            close_figure=True, data_label_list=extended_combined_label_list if show_mean_and_median else samples_list)
Exemple #19
0
    def parallel_codeml(self,
                        in_dir,
                        tree_file,
                        out_dir,
                        seq_type="codons",
                        codon_frequency="F3X4",
                        noisy=0,
                        verbose="concise",
                        runmode=0,
                        clock=0,
                        aminoacid_distance=None,
                        model=1,
                        nssites=0,
                        genetic_code=0,
                        fix_kappa=False,
                        kappa=5,
                        fix_omega=False,
                        omega=0.2,
                        getSE=0,
                        RateAncestor=0,
                        small_difference=0.000001,
                        clean_data=True,
                        method=0,
                        Mgene=None):

        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)
        options_list = []
        dir_list = []
        for filename in alignment_files_list:
            directory, basename, extension = FileRoutines.split_filename(
                filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            out_file = "%s/%s.out" % (filename_out_dir, basename)
            ctl_file = "%s/%s.ctl" % (filename_out_dir, basename)

            options_list.append(ctl_file)
            dir_list.append(filename_out_dir)
            FileRoutines.safe_mkdir(filename_out_dir)
            self.generate_ctl_file(os.path.abspath(filename),
                                   tree_file_abs_path,
                                   out_file,
                                   ctl_file,
                                   seq_type=seq_type,
                                   codon_frequency=codon_frequency,
                                   noisy=noisy,
                                   verbose=verbose,
                                   runmode=runmode,
                                   clock=clock,
                                   aminoacid_distance=aminoacid_distance,
                                   model=model,
                                   nssites=nssites,
                                   genetic_code=genetic_code,
                                   fix_kappa=fix_kappa,
                                   kappa=kappa,
                                   fix_omega=fix_omega,
                                   omega=omega,
                                   getSE=getSE,
                                   RateAncestor=RateAncestor,
                                   Mgene=Mgene,
                                   small_difference=small_difference,
                                   clean_data=clean_data,
                                   method=method)
        self.parallel_execute(options_list, dir_list=dir_list)
Exemple #20
0
    def parallel_positive_selection_test(self,
                                         in_dir,
                                         tree_file,
                                         out_dir,
                                         results_file,
                                         seq_type="codons",
                                         codon_frequency="F3X4",
                                         noisy=3,
                                         verbose="concise",
                                         runmode=0,
                                         clock=0,
                                         aminoacid_distance=None,
                                         genetic_code=0,
                                         fix_kappa=False,
                                         kappa=5,
                                         getSE=0,
                                         RateAncestor=0,
                                         small_difference=0.000001,
                                         clean_data=True,
                                         method=0):
        """
        This function implements positive selection test (branch-site model)
        for branch labeled in tree file using model_A vs model_A_null(omega fixed to 1) comparison
        """

        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)
        options_list = []
        dir_list = []
        basename_dir_list = []
        model_list = ["Model_A", "Model_A_null"]
        fix_omega_dict = {"Model_A": False, "Model_A_null": True}

        for filename in alignment_files_list:
            directory, basename, extension = FileRoutines.split_filename(
                filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            basename_dir_list.append(basename)
            FileRoutines.safe_mkdir(filename_out_dir)

            for model in model_list:
                model_dir = "%s/%s/" % (filename_out_dir, model)
                FileRoutines.safe_mkdir(model_dir)
                out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
                ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)

                options_list.append("%s.ctl" % basename)
                dir_list.append(model_dir)

                self.generate_ctl_file(os.path.abspath(filename),
                                       tree_file_abs_path,
                                       out_file,
                                       ctl_file,
                                       seq_type=seq_type,
                                       codon_frequency=codon_frequency,
                                       noisy=noisy,
                                       verbose=verbose,
                                       runmode=runmode,
                                       clock=clock,
                                       aminoacid_distance=aminoacid_distance,
                                       model=2,
                                       nssites=2,
                                       genetic_code=genetic_code,
                                       fix_kappa=fix_kappa,
                                       kappa=kappa,
                                       fix_omega=fix_omega_dict[model],
                                       omega=1,
                                       getSE=getSE,
                                       RateAncestor=RateAncestor,
                                       Mgene=0,
                                       small_difference=small_difference,
                                       clean_data=clean_data,
                                       method=method)

        self.parallel_execute(options_list, dir_list=dir_list)

        results_dict = OrderedDict()
        double_delta_dict = OrderedDict()
        raw_pvalues_dict = OrderedDict()
        raw_pvalues_list = []

        for basename in basename_dir_list:
            results_dict[basename] = OrderedDict()
            for model in model_list:
                output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model,
                                                   basename)
                codeml_report = CodeMLReport(output_file)
                results_dict[basename][model] = codeml_report.LnL

        skipped_genes_set = set()
        for basename in basename_dir_list:
            for model in model_list:
                if results_dict[basename][model] is None:
                    print("LnL was not calculated for %s" % basename)
                    skipped_genes_set.add(basename)
                    break
            else:
                doubled_delta = 2 * (results_dict[basename]["Model_A"] -
                                     results_dict[basename]["Model_A_null"])
                p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

                double_delta_dict[basename] = doubled_delta
                raw_pvalues_dict[basename] = p_value
                raw_pvalues_list.append(p_value)

        adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
        #print adjusted_pvalues_list
        i = 0
        with open(results_file, "w") as out_fd:
            out_fd.write(
                "id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n"
            )
            for basename in basename_dir_list:
                for model in model_list:
                    if results_dict[basename][model] is None:
                        print("LnL was not calculated for %s" % basename)
                        break
                else:
                    #doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
                    #p_value = chisqprob(doubled_delta, 1) # degrees of freedom = 1

                    #print basename, results_dict[basename]["Model_A_null"],results_dict[basename]["Model_A"], double_delta_dict[basename], raw_pvalues_dict[basename], adjusted_pvalues_list[i]

                    out_fd.write(
                        "%s\t%f\t%f\t%f\t%f\t%f\n" %
                        (basename, results_dict[basename]["Model_A_null"],
                         results_dict[basename]["Model_A"],
                         double_delta_dict[basename],
                         raw_pvalues_dict[basename], adjusted_pvalues_list[i]))
                    i += 1
Exemple #21
0
    def parallel_predict(self,
                         species,
                         genome_file,
                         output,
                         strand="both",
                         gene_model=None,
                         output_gff3=True,
                         other_options="",
                         split_dir="splited_input",
                         splited_output_dir="splited_output_dir",
                         config_dir=None,
                         combine_output_to_single_file=True,
                         use_softmasking=None,
                         hints_file=None,
                         extrinsicCfgFile=None,
                         predict_UTR=None,
                         external_process_pool=None,
                         async_run=False,
                         min_intron_len=None,
                         parsing_mode="parse"):
        common_options = self.parse_options(species,
                                            genome_file="",
                                            strand=strand,
                                            gene_model=gene_model,
                                            output_gff3=output_gff3,
                                            other_options=other_options,
                                            config_dir=config_dir,
                                            use_softmasking=use_softmasking,
                                            hints_file=hints_file,
                                            extrinsicCfgFile=extrinsicCfgFile,
                                            predict_UTR=predict_UTR,
                                            min_intron_len=min_intron_len)

        splited_dir = FileRoutines.check_path(split_dir)
        splited_out_dir = FileRoutines.check_path(splited_output_dir)
        FileRoutines.safe_mkdir(splited_dir)
        FileRoutines.safe_mkdir(splited_out_dir)

        self.split_fasta_by_seq_len(genome_file,
                                    splited_dir,
                                    parsing_mode=parsing_mode)

        input_list_of_files = sorted(os.listdir(splited_dir))
        list_of_output_files = []
        options_list = []
        for filename in input_list_of_files:
            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.gff" % (splited_out_dir, filename)
            list_of_output_files.append(output_file)
            options = common_options

            options += " %s" % input_file
            options += " > %s" % output_file
            options_list.append(options)

        self.parallel_execute(options_list,
                              external_process_pool=external_process_pool,
                              async_run=async_run)

        if combine_output_to_single_file:
            CGAS.cat(list_of_output_files, output=output)