Esempio n. 1
0
    def index(self,
              genome_dir,
              genome_fasta,
              annotation_gtf=None,
              junction_tab_file=None,
              sjdboverhang=None,
              genomeSAindexNbases=None,
              genomeChrBinNbits=None,
              genome_size=None):
        """
        Assemble a "--runMode genomeGenerate" command line and pass it to self.execute.

        :param genome_dir: directory for the index; created with
            FileRoutines.safe_mkdir and passed as an absolute path.
        :param genome_fasta: one fasta path (str) or an iterable of paths;
            every path is made absolute.
        :param annotation_gtf: value of --sjdbGTFfile; option is omitted if falsy.
        :param junction_tab_file: value of --sjdbFileChrStartEnd; omitted if falsy.
        :param sjdboverhang: value of --sjdbOverhang; omitted if falsy.
        :param genomeSAindexNbases: value of --genomeSAindexNbases; used only
            when genome_size is not given.
        :param genomeChrBinNbits: value of --genomeChrBinNbits; omitted if falsy.
        :param genome_size: if truthy, --genomeSAindexNbases is computed as
            min(14, floor(log2(genome_size) / 2) - 1) and genomeSAindexNbases
            is ignored.
        """
        FileRoutines.safe_mkdir(genome_dir)

        parts = ["--runMode genomeGenerate"]
        parts.append(" --genomeDir %s" % os.path.abspath(genome_dir))
        parts.append(" --runThreadN %i" % self.threads)

        if isinstance(genome_fasta, str):
            fasta_argument = os.path.abspath(genome_fasta)
        else:
            fasta_argument = " ".join(map(os.path.abspath, genome_fasta))
        parts.append(" --genomeFastaFiles %s" % fasta_argument)

        if annotation_gtf:
            parts.append(" --sjdbGTFfile %s" % annotation_gtf)
        if junction_tab_file:
            parts.append(" --sjdbFileChrStartEnd %s" % junction_tab_file)
        if sjdboverhang:
            # number of bases taken from both sides of splice junction. 100 by default
            parts.append(" --sjdbOverhang %i" % sjdboverhang)

        # size of k-mers used for preindexing of suffix array:
        # derived from the genome size when it is known, explicit value otherwise
        if genome_size:
            derived_sa_index_bases = min(
                [14, (floor(log(genome_size, 2) / 2)) - 1])
            parts.append(" --genomeSAindexNbases %i" % derived_sa_index_bases)
        elif genomeSAindexNbases:
            parts.append(" --genomeSAindexNbases %i" % genomeSAindexNbases)

        if genomeChrBinNbits:
            # padding size (log2) of reference sequences. 18 by default
            # recommended value min(18, log2(GenomeLength/NumberOfScaffolds))
            parts.append(" --genomeChrBinNbits %i" % genomeChrBinNbits)

        self.execute("".join(parts))
Esempio n. 2
0
    def parallel_blast(self,
                       blast_command,
                       seqfile,
                       database,
                       outfile=None,
                       blast_options=None,
                       split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None,
                       output_format=None,
                       threads=None,
                       num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):
        """
        Split a fasta file into chunks and run blast_command on every chunk via
        self.parallel_execute.

        :param blast_command: command handed to parallel_execute as cmd.
        :param seqfile: fasta file split with self.split_fasta.
        :param database: value of the -db option.
        :param outfile: target of CGAS.cat when per-chunk outputs are combined.
        :param blast_options: extra options appended verbatim if truthy.
        :param split_dir: directory for the input chunks.
        :param splited_output_dir: directory for per-chunk "<prefix>.hits" files.
        :param evalue: value of -evalue; omitted if falsy.
        :param output_format: integer value of -outfmt; omitted if falsy.
        :param threads: forwarded to parallel_execute; also used to choose the
            number of chunks (5 per thread) when num_of_seqs_per_scan is not set.
        :param num_of_seqs_per_scan: despite its name, this value is forwarded to
            split_fasta as num_of_files.
        :param combine_output_to_single_file: if true, concatenate chunk outputs
            into outfile.
        :param async_run: forwarded to parallel_execute.
        :param external_process_pool: forwarded to parallel_execute.
        """
        splited_dir = FileRoutines.check_path(split_dir)
        splited_out_dir = FileRoutines.check_path(splited_output_dir)
        self.safe_mkdir(splited_dir)
        self.safe_mkdir(splited_out_dir)

        # explicit chunk count wins, otherwise five chunks per thread
        if num_of_seqs_per_scan:
            chunk_count = num_of_seqs_per_scan
        elif threads:
            chunk_count = 5 * threads
        else:
            chunk_count = 5 * self.threads
        self.split_fasta(seqfile, splited_dir, num_of_files=chunk_count)

        options_list = []
        out_files = []
        for chunk_name in sorted(os.listdir(splited_dir)):
            chunk_prefix = FileRoutines.split_filename(chunk_name)[1]
            query_file = "%s%s" % (splited_dir, chunk_name)
            hits_file = "%s%s.hits" % (splited_out_dir, chunk_prefix)

            blast_call = " -out %s" % hits_file
            blast_call += " -db %s" % database
            blast_call += " -query %s" % query_file
            if blast_options:
                blast_call += " %s" % blast_options
            if evalue:
                blast_call += " -evalue %s" % evalue
            if output_format:
                blast_call += " -outfmt %i" % output_format

            options_list.append(blast_call)
            out_files.append(hits_file)

        self.parallel_execute(options_list,
                              cmd=blast_command,
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
Esempio n. 3
0
    def parallel_align(self, list_of_files, output_dir, msa_tool='prank',
                       seq_type=None, bootstrap_number=100, genetic_code=1, threads=None,
                       msa_tool_options=None, seq_cutoff=None, col_cutoff=None, mafft_bin=None,
                       prank_bin=None, muscle_bin=None, pagan_bin=None, ruby_bin=None, program=None,
                       cmd_log_file=None,
                       cpus_per_task=1,
                       handling_mode="local",
                       job_name=None,
                       log_prefix=None,
                       error_log_prefix=None,
                       max_jobs=None,
                       max_running_time=None,
                       max_memory_per_node=None,
                       max_memmory_per_cpu=None,
                       modules_list=None,
                       environment_variables_dict=None):
        """
        Build one command per input file and run them locally or through slurm.

        Options shared by all files come from self.parse_common_options; each
        file adds its own --seqFile and --dataset (basename of the file).

        :param list_of_files: input files, one command is generated per file.
        :param output_dir: output directory; created if needed.
        :param handling_mode: "local" runs self.parallel_execute, "slurm" runs
            self.slurm_run_multiple_jobs_in_wrap_mode; any other value builds
            the commands but runs nothing.
        :param cmd_log_file: passed positionally to the slurm runner.

        The remaining tool parameters are forwarded unchanged to
        parse_common_options, the remaining scheduler parameters to the slurm
        runner (cpus_per_node and max_running_jobs are always passed as None).
        """
        shared_options = self.parse_common_options(output_dir=output_dir, msa_tool=msa_tool,
                                                   seq_type=seq_type, bootstrap_number=bootstrap_number,
                                                   genetic_code=genetic_code, threads=threads,
                                                   msa_tool_options=msa_tool_options, seq_cutoff=seq_cutoff,
                                                   col_cutoff=col_cutoff, mafft_bin=mafft_bin,
                                                   prank_bin=prank_bin, muscle_bin=muscle_bin,
                                                   pagan_bin=pagan_bin, ruby_bin=ruby_bin, program=program)

        FileRoutines.safe_mkdir(output_dir)

        options_list = []
        for sequence_file in list_of_files:
            dataset_name = FileRoutines.split_filename(sequence_file)[1]
            options_list.append(shared_options
                                + " --seqFile %s" % sequence_file
                                + " --dataset %s" % dataset_name)

        if handling_mode == "local":
            self.parallel_execute(options_list)
            return
        if handling_mode != "slurm":
            return

        # full command = optional "<path>/" prefix + executable name + options
        cmd_list = []
        for file_options in options_list:
            path_prefix = (self.path + "/") if self.path else ""
            cmd_list.append("%s%s %s" % (path_prefix, self.cmd, file_options))

        self.slurm_run_multiple_jobs_in_wrap_mode(cmd_list,
                                                  cmd_log_file,
                                                  max_jobs=max_jobs,
                                                  job_name=job_name,
                                                  log_prefix=log_prefix,
                                                  error_log_prefix=error_log_prefix,
                                                  cpus_per_node=None,
                                                  max_running_jobs=None,
                                                  max_running_time=max_running_time,
                                                  cpus_per_task=cpus_per_task,
                                                  max_memory_per_node=max_memory_per_node,
                                                  max_memmory_per_cpu=max_memmory_per_cpu,
                                                  modules_list=modules_list,
                                                  environment_variables_dict=environment_variables_dict)
Esempio n. 4
0
    def extract_single_copy_clusters_from_files(
            self,
            list_of_cluster_files,
            output_file,
            label_elements=False,
            separator="@",
            label_position="first",
            function_to_convert_filename_to_label=None):
        """
        Read cluster files, extract single-copy clusters and write them to a file.

        :param list_of_cluster_files: cluster files; each one is read into its
            own SynDict (values split, lines starting with "#" are comments).
        :param output_file: file the resulting clusters are written to.
        :param label_elements: forwarded to self.extract_single_copy_clusters.
        :param separator: forwarded to self.extract_single_copy_clusters.
        :param label_position: forwarded to self.extract_single_copy_clusters.
        :param function_to_convert_filename_to_label: optional callable turning
            a filename into the label of its clusters; the basename of the file
            is used when it is not given.
        :return: object returned by self.extract_single_copy_clusters.
        """
        clusters_by_label = OrderedDict()
        for cluster_file in list_of_cluster_files:
            # label is either user-derived or the basename of the file
            label = (function_to_convert_filename_to_label(cluster_file)
                     if function_to_convert_filename_to_label
                     else FileRoutines.split_filename(cluster_file)[1])

            clusters = SynDict()
            clusters_by_label[label] = clusters
            clusters.read(cluster_file, split_values=True, comments_prefix="#")

        sc_clusters_dict = self.extract_single_copy_clusters(
            clusters_by_label,
            label_elements=label_elements,
            separator=separator,
            label_position=label_position)
        sc_clusters_dict.write(output_file, splited_values=True)

        return sc_clusters_dict
Esempio n. 5
0
    def parallel_align(self,
                       list_of_files,
                       output_directory,
                       output_suffix="alignment",
                       gap_open_penalty=None,
                       offset=None,
                       maxiterate=None,
                       quiet=False,
                       mode="globalpair",
                       number_of_processes=1,
                       anysymbol=False):
        """
        Build one alignment command per input file and run them via
        self.parallel_execute.

        Every command redirects its output to
        "<output_directory>/<basename>[_<output_suffix>].fasta".

        :param list_of_files: input files, one command per file.
        :param output_directory: directory for the alignments; created if it
            does not exist, because the output redirection in each command
            would otherwise fail.
        :param output_suffix: suffix appended to the basename of the output
            file; no suffix is added if falsy.
        :param gap_open_penalty: value of --op (formatted as float); omitted if None.
        :param offset: value of --ep (formatted as float); omitted if None.
        :param maxiterate: value of --maxiterate (integer); omitted if None.
        :param quiet: add --quiet if true.
        :param mode: added as "--<mode>".
        :param number_of_processes: passed to parallel_execute as threads, i.e.
            number of commands run simultaneously; each command itself gets
            --thread self.threads.
        :param anysymbol: add --anysymbol if true.
        """
        # TODO: add rest of options

        # Same precaution as in the sibling parallel_align wrappers: the
        # commands write into output_directory, so it has to exist up front.
        FileRoutines.safe_mkdir(output_directory)

        options = " --thread %i" % self.threads
        options += " --op %f" % gap_open_penalty if gap_open_penalty is not None else ""
        options += " --ep %f" % offset if offset is not None else ""
        options += " --maxiterate %i" % maxiterate if maxiterate is not None else ""
        options += " --quiet" if quiet else ""
        options += " --%s" % mode
        options += " --anysymbol" if anysymbol else ""

        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            output_name = ("%s_%s" % (basename, output_suffix)) if output_suffix else basename
            op = options
            op += " %s" % filename
            op += " > %s/%s.fasta" % (output_directory, output_name)
            options_list.append(op)

        self.parallel_execute(options_list, threads=number_of_processes)
Esempio n. 6
0
    def extract_proteins_from_alignments(dir_with_alignments, output_dir):
        """
        Extract sequences from every alignment file into a same-named file in output_dir.

        :param dir_with_alignments: a single path (str) or a list of paths;
            expanded to a list of files with make_list_of_path_to_files.
        :param output_dir: directory for the extracted sequences; created if
            needed. Output files keep the basename and extension of the input.
        """
        out_dir = FileRoutines.check_path(output_dir)

        input_files = make_list_of_path_to_files(
            [dir_with_alignments] if isinstance(dir_with_alignments, str
                                                ) else dir_with_alignments)

        FileRoutines.safe_mkdir(out_dir)
        # imported here rather than at module level, as in the original code
        from Routines import MultipleAlignmentRoutines
        for filename in input_files:
            filename_list = FileRoutines.split_filename(filename)
            # filename_list[1] is used as basename and filename_list[2] as extension
            output_file = "%s%s%s" % (out_dir, filename_list[1],
                                      filename_list[2])
            MultipleAlignmentRoutines.extract_sequences_from_alignment(
                filename, output_file)
Esempio n. 7
0
 def read_cluster_files_from_dir(dir_with_cluster_files):
     """
     Read every file of a directory as a cluster file.

     Files are processed in sorted order of their names. Each file is parsed
     as a headerless tab-separated table: column 0 is the key, column 1 holds
     comma-separated values, lines starting with "#" are comments and
     repeated keys are not allowed.

     :param dir_with_cluster_files: directory with cluster files.
     :return: OrderedDict mapping basename of a file to the SynDict read from it.
     """
     clusters_by_name = OrderedDict()
     for cluster_filename in sorted(os.listdir(dir_with_cluster_files)):
         cluster_path = "%s%s" % (
             FileRoutines.check_path(dir_with_cluster_files), cluster_filename)
         cluster_name = FileRoutines.split_filename(cluster_path)[1]

         clusters = SynDict()
         clusters_by_name[cluster_name] = clusters
         clusters.read(cluster_path,
                       header=False,
                       separator="\t",
                       allow_repeats_of_key=False,
                       split_values=True,
                       values_separator=",",
                       key_index=0,
                       value_index=1,
                       comments_prefix="#")
     return clusters_by_name
Esempio n. 8
0
    def parallel_align(self,
                       list_of_files,
                       output_directory,
                       output_suffix=None,
                       tree_file=None,
                       output_format=None,
                       show_xml=None,
                       show_tree=None,
                       show_ancestral_sequences=None,
                       show_evolutionary_events=None,
                       showall=None,
                       compute_posterior_support=None,
                       njtree=None,
                       skip_insertions=False,
                       codon_alignment=None,
                       translated_alignment=None):
        """
        Build one alignment command per input file and run them via
        self.parallel_execute.

        Options shared by all files come from self.parse_common_options; every
        file adds its own "-d=<input>" and
        "-o=<output_directory>/<basename>[_<output_suffix>].fasta".

        :param list_of_files: input files, one command per file.
        :param output_directory: directory for the results; created if needed.
        :param output_suffix: suffix appended to the basename of the output;
            no suffix is added if falsy.

        All other parameters are forwarded unchanged to parse_common_options.
        """
        shared_options = self.parse_common_options(
            tree_file=tree_file,
            output_format=output_format,
            show_xml=show_xml,
            show_tree=show_tree,
            show_ancestral_sequences=show_ancestral_sequences,
            show_evolutionary_events=show_evolutionary_events,
            showall=showall,
            compute_posterior_support=compute_posterior_support,
            njtree=njtree,
            skip_insertions=skip_insertions,
            codon_alignment=codon_alignment,
            translated_alignment=translated_alignment)

        FileRoutines.safe_mkdir(output_directory)

        options_list = []
        for alignment_input in list_of_files:
            output_stem = FileRoutines.split_filename(alignment_input)[1]
            if output_suffix:
                output_stem = "%s_%s" % (output_stem, output_suffix)
            options_list.append(shared_options
                                + " -d=%s" % alignment_input
                                + " -o=%s/%s.fasta" % (output_directory, output_stem))

        self.parallel_execute(options_list)
Esempio n. 9
0
    def parallel_predict(self, species, genome_file, output, strand="both", gene_model=None, output_gff3=True,
                         other_options="", split_dir="splited_input", splited_output_dir="splited_output_dir",
                         config_dir=None, combine_output_to_single_file=True, use_softmasking=None, hints_file=None,
                         extrinsicCfgFile=None, predict_UTR=None, external_process_pool=None,
                         async_run=False, min_intron_len=None, parsing_mode="parse"):
        """
        Split the genome into chunks, run prediction on every chunk in parallel
        and optionally merge the per-chunk results.

        :param species: forwarded to self.parse_options.
        :param genome_file: fasta file split with self.split_fasta_by_seq_len.
        :param output: target of CGAS.cat when chunk outputs are combined.
        :param split_dir: directory for the genome chunks.
        :param splited_output_dir: directory for per-chunk "<chunk>.gff" files.
        :param combine_output_to_single_file: if true, concatenate chunk
            outputs into output.
        :param external_process_pool: forwarded to parallel_execute.
        :param async_run: forwarded to parallel_execute.
        :param parsing_mode: forwarded to split_fasta_by_seq_len.

        The remaining parameters are forwarded unchanged to parse_options,
        which is called with an empty genome_file because the chunk path is
        appended to each command separately.
        """
        shared_options = self.parse_options(species, genome_file="", strand=strand, gene_model=gene_model,
                                            output_gff3=output_gff3, other_options=other_options,
                                            config_dir=config_dir, use_softmasking=use_softmasking,
                                            hints_file=hints_file, extrinsicCfgFile=extrinsicCfgFile,
                                            predict_UTR=predict_UTR, min_intron_len=min_intron_len)

        splited_dir = FileRoutines.check_path(split_dir)
        splited_out_dir = FileRoutines.check_path(splited_output_dir)
        for directory in (splited_dir, splited_out_dir):
            FileRoutines.safe_mkdir(directory)

        self.split_fasta_by_seq_len(genome_file, splited_dir, parsing_mode=parsing_mode)

        list_of_output_files = []
        options_list = []
        for chunk_name in sorted(os.listdir(splited_dir)):
            chunk_input = "%s%s" % (splited_dir, chunk_name)
            chunk_output = "%s%s.gff" % (splited_out_dir, chunk_name)
            list_of_output_files.append(chunk_output)
            # result of every chunk is captured by redirecting stdout
            options_list.append(shared_options
                                + " %s" % chunk_input
                                + " > %s" % chunk_output)

        self.parallel_execute(options_list, external_process_pool=external_process_pool, async_run=async_run)

        if combine_output_to_single_file:
            CGAS.cat(list_of_output_files, output=output)
Esempio n. 10
0
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        """
        Write the proteins of selected families to per-family files.

        :param families_id_file: file with ids of families to extract; if
            falsy, all families from fam_file are extracted.
        :param fam_file: file with families, members separated by commas.
        :param pep_file: file(s) with protein sequences, indexed with
            SeqIO.index_db into the temporary index "tmp.idx" in the current
            working directory.
        :param output_dir: output directory; created if needed.
        :param pep_format: format of input and output sequences.
        :param out_prefix: if set, every family gets its own directory and the
            file inside it is named "<out_prefix>.pep".
        :param create_dir_for_each_family: create a directory per family even
            if out_prefix is not set.

        Ids that are absent from fam_file are reported on stdout and skipped.
        The temporary index is removed even if extraction fails.
        """
        from Routines import SequenceRoutines, FileRoutines
        fam_id_list = IdList()
        fam_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # out_prefix implies one directory per family: all files share one name
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        try:
            for fam_id in fam_id_list if families_id_file else fam_dict:
                if fam_id not in fam_dict:
                    print("%s was not found" % fam_id)
                    continue

                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
        finally:
            # never leave the temporary index behind, a stale one would be
            # picked up by the next SeqIO.index_db("tmp.idx", ...) call
            os.remove("tmp.idx")
Esempio n. 11
0
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        """
        Write the sequences of selected clusters to per-cluster fasta files.

        :param clusters_id_file: file with ids of clusters to extract; if
            falsy, all clusters from cluster_file are extracted.
        :param cluster_file: file with clusters, members separated by commas.
        :param seq_file: file(s)/directories with sequences; expanded with
            FileRoutines.make_list_of_path_to_files and indexed with
            SeqIO.index_db into the temporary index "tmp.idx" in the current
            working directory.
        :param output_dir: output directory; created if needed.
        :param seq_format: format of input and output sequences.
        :param out_prefix: if set, every cluster gets its own directory and the
            file inside it is named "<out_prefix>.fasta".
        :param create_dir_for_each_cluster: create a directory per cluster even
            if out_prefix is not set.
        :param skip_cluster_if_no_sequence_for_element: skip a cluster if
            self.check_absence_of_cluster_elements reports absent elements.
        :return: number of clusters skipped because of absent elements.

        Ids that are absent from cluster_file are reported on stdout and
        ignored; they are not counted as skipped. The temporary index is
        removed even if extraction fails.
        """
        from Routines import SequenceRoutines, FileRoutines
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # out_prefix implies one directory per cluster: all files share one name
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")
        protein_dict = SeqIO.index_db(
            "tmp.idx",
            FileRoutines.make_list_of_path_to_files(seq_file),
            format=seq_format)

        number_of_skipped_clusters = 0
        try:
            for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
                # Membership has to be verified before cluster_dict[fam_id] is
                # touched: an id from the id file may be missing from the
                # cluster file.
                if fam_id not in cluster_dict:
                    print("%s was not found" % fam_id)
                    continue

                if skip_cluster_if_no_sequence_for_element:
                    absent_elements = self.check_absence_of_cluster_elements(
                        cluster_dict[fam_id], protein_dict)
                    if absent_elements:
                        print("Skipping cluster %s due to absent element(%s)" % (
                            fam_id, ",".join(absent_elements)))
                        number_of_skipped_clusters += 1
                        continue

                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, out_prefix
                                                if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_dict[fam_id], verbose=True),
                            out_file,
                            format=seq_format)
        finally:
            # never leave the temporary index behind, a stale one would be
            # picked up by the next SeqIO.index_db("tmp.idx", ...) call
            os.remove("tmp.idx")

        print("%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict)))

        return number_of_skipped_clusters
Esempio n. 12
0
    def split_proteins_per_species(dir_with_proteins,
                                   output_dir,
                                   input_format="fasta",
                                   output_format="fasta"):
        """
        Split proteins into one file per species.

        The part of a protein id before the first "." is used as the species
        (taxa) id; the rest of the id becomes the id of the written record.
        Output files are named "<taxa_id>.pep".

        :param dir_with_proteins: a single path (str) or a list of paths with
            protein sequences.
        :param output_dir: directory for per-species files; created if needed.
        :param input_format: format of input files.
        :param output_format: format of output files.

        Sequences are indexed through the temporary index "temp.idx" in the
        current working directory; it is removed when the function finishes.
        """
        # function-scope import: only this function's cleanup needs it
        import os

        input_files = FileRoutines.make_list_of_path_to_files(
            [dir_with_proteins] if isinstance(dir_with_proteins, str
                                              ) else dir_with_proteins)

        out_dir = FileRoutines.check_path(output_dir)
        FileRoutines.safe_mkdir(out_dir)

        protein_dict = SeqIO.index_db("temp.idx",
                                      input_files,
                                      format=input_format)

        try:
            # group protein ids by the species prefix of the id
            syn_dict = SynDict()
            for protein_id in protein_dict:
                taxa_id = protein_id.split(".")[0]
                if taxa_id not in syn_dict:
                    syn_dict[taxa_id] = []
                syn_dict[taxa_id].append(protein_id)

            def renamed_records_generator(record_dict, taxa_id):
                for record_id in syn_dict[taxa_id]:
                    # copy, so the renaming does not touch the indexed record
                    record = deepcopy(record_dict[record_id])
                    record.id = ".".join(record_id.split(".")[1:])
                    yield record

            for taxa_id in syn_dict:
                out_file = "%s%s.pep" % (out_dir, taxa_id)
                SeqIO.write(renamed_records_generator(protein_dict, taxa_id),
                            out_file,
                            format=output_format)
        finally:
            # A leftover index would be reused by the next
            # SeqIO.index_db("temp.idx", ...) call even for different input.
            os.remove("temp.idx")
Esempio n. 13
0
    def parallel_positive_selection_test(self,
                                         in_dir,
                                         tree_file,
                                         out_dir,
                                         results_file,
                                         seq_type="codons",
                                         codon_frequency="F3X4",
                                         noisy=3,
                                         verbose="concise",
                                         runmode=0,
                                         clock=0,
                                         aminoacid_distance=None,
                                         genetic_code=0,
                                         fix_kappa=False,
                                         kappa=5,
                                         getSE=0,
                                         RateAncestor=0,
                                         small_difference=0.000001,
                                         clean_data=True,
                                         method=0):
        """
        This function implements positive selection test (branch-site model)
        for branch labeled in tree file using model_A vs model_A_null(omega fixed to 1) comparison

        For every alignment found in in_dir two control files are generated
        (Model_A and Model_A_null, both with model=2, nssites=2, omega=1; they
        differ only in fix_omega), all runs are executed through
        self.parallel_execute, LnL values are read back from the output files
        with CodeMLReport and compared: 2 * (LnL(Model_A) - LnL(Model_A_null))
        is converted to a p-value with chisqprob (1 degree of freedom), and the
        p-values are adjusted with fdrcorrection0.

        :param in_dir: file(s)/directory with alignments; expanded with
            FileRoutines.make_list_of_path_to_files.
        :param tree_file: tree file; passed to generate_ctl_file as absolute path.
        :param out_dir: output directory; results of each alignment are stored
            in <out_dir>/<basename>/<model>/.
        :param results_file: tab-separated table with LnL values, doubled
            delta, raw and adjusted p-values; alignments without LnL for at
            least one of the models are reported on stdout and left out.

        All other parameters are forwarded unchanged to self.generate_ctl_file.
        The function returns nothing.
        """

        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)
        # options_list and dir_list are filled in parallel: one control file
        # name and one model directory per run
        options_list = []
        dir_list = []
        basename_dir_list = []
        model_list = ["Model_A", "Model_A_null"]
        # the null model is the same model with omega fixed
        fix_omega_dict = {"Model_A": False, "Model_A_null": True}

        for filename in alignment_files_list:
            directory, basename, extension = FileRoutines.split_filename(
                filename)
            filename_out_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            basename_dir_list.append(basename)
            FileRoutines.safe_mkdir(filename_out_dir)

            for model in model_list:
                model_dir = "%s/%s/" % (filename_out_dir, model)
                FileRoutines.safe_mkdir(model_dir)
                out_file = "%s/%s/%s.out" % (filename_out_dir, model, basename)
                ctl_file = "%s/%s/%s.ctl" % (filename_out_dir, model, basename)

                # only the bare control file name is passed as option;
                # presumably the command is started inside model_dir - TODO confirm
                options_list.append("%s.ctl" % basename)
                dir_list.append(model_dir)

                self.generate_ctl_file(os.path.abspath(filename),
                                       tree_file_abs_path,
                                       out_file,
                                       ctl_file,
                                       seq_type=seq_type,
                                       codon_frequency=codon_frequency,
                                       noisy=noisy,
                                       verbose=verbose,
                                       runmode=runmode,
                                       clock=clock,
                                       aminoacid_distance=aminoacid_distance,
                                       model=2,
                                       nssites=2,
                                       genetic_code=genetic_code,
                                       fix_kappa=fix_kappa,
                                       kappa=kappa,
                                       fix_omega=fix_omega_dict[model],
                                       omega=1,
                                       getSE=getSE,
                                       RateAncestor=RateAncestor,
                                       Mgene=0,
                                       small_difference=small_difference,
                                       clean_data=clean_data,
                                       method=method)

        self.parallel_execute(options_list, dir_list=dir_list)

        results_dict = OrderedDict()
        double_delta_dict = OrderedDict()
        raw_pvalues_dict = OrderedDict()
        raw_pvalues_list = []

        # collect LnL of both models for every alignment
        for basename in basename_dir_list:
            results_dict[basename] = OrderedDict()
            for model in model_list:
                output_file = "%s/%s/%s/%s.out" % (out_dir, basename, model,
                                                   basename)
                codeml_report = CodeMLReport(output_file)
                results_dict[basename][model] = codeml_report.LnL

        skipped_genes_set = set()
        for basename in basename_dir_list:
            for model in model_list:
                if results_dict[basename][model] is None:
                    print("LnL was not calculated for %s" % basename)
                    skipped_genes_set.add(basename)
                    break
            else:
                # reached only if the inner loop was not broken, i.e. LnL is
                # available for both models
                doubled_delta = 2 * (results_dict[basename]["Model_A"] -
                                     results_dict[basename]["Model_A_null"])
                p_value = chisqprob(doubled_delta, 1)  # degrees of freedom = 1

                double_delta_dict[basename] = doubled_delta
                raw_pvalues_dict[basename] = p_value
                raw_pvalues_list.append(p_value)

        # second element of the returned value is used as the list of adjusted p-values
        adjusted_pvalues_list = fdrcorrection0(raw_pvalues_list)[1]
        #print adjusted_pvalues_list
        # i indexes adjusted_pvalues_list; it must advance only for alignments
        # that passed the LnL check, in the same order as in the loop above
        i = 0
        with open(results_file, "w") as out_fd:
            out_fd.write(
                "id\tmodel_a_null,LnL\tmodel_a,LnL\t2*delta\traw p-value\tadjusted p-value\n"
            )
            for basename in basename_dir_list:
                for model in model_list:
                    if results_dict[basename][model] is None:
                        # same filter as above (the message is printed a second time)
                        print("LnL was not calculated for %s" % basename)
                        break
                else:
                    #doubled_delta = 2 * (results_dict[basename]["Model_A"] - results_dict[basename]["Model_A_null"])
                    #p_value = chisqprob(doubled_delta, 1) # degrees of freedom = 1

                    #print basename, results_dict[basename]["Model_A_null"],results_dict[basename]["Model_A"], double_delta_dict[basename], raw_pvalues_dict[basename], adjusted_pvalues_list[i]

                    out_fd.write(
                        "%s\t%f\t%f\t%f\t%f\t%f\n" %
                        (basename, results_dict[basename]["Model_A_null"],
                         results_dict[basename]["Model_A"],
                         double_delta_dict[basename],
                         raw_pvalues_dict[basename], adjusted_pvalues_list[i]))
                    i += 1
Esempio n. 14
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import MultipleAlignmentRoutines, FileRoutines

# Command-line interface: input alignments, merged alignment and a file with
# coordinates of the input alignments inside the merged one. All three
# options are mandatory.
parser = argparse.ArgumentParser()

# the comma-separated value is expanded to a list of files while parsing
parser.add_argument(
    "-i", "--input",
    dest="input",
    action="store",
    required=True,
    type=lambda paths: FileRoutines.make_list_of_path_to_files(paths.split(",")),
    help="Comma-separated list of files/directories with alignments",
)
parser.add_argument(
    "-o", "--output",
    dest="output",
    action="store",
    required=True,
    help="File to write merged alignment",
)
parser.add_argument(
    "-c", "--coordinates_file",
    dest="coords_file",
    action="store",
    required=True,
    help="File to write file with coordinates of alignments in merged alignment",
)
Esempio n. 15
0
                    action="store",
                    dest="max_memory_per_thread",
                    default="1G",
                    help="Maximum memory per thread. Default - 1G")
args = parser.parse_args()

# The preparation step below runs when either prepare_bam or mix_ends is set,
# and in both cases it builds file names from the prepared-BAM prefix and
# uses the temporary directory. Validate them for both triggers, so a missing
# value fails here with a clear message instead of somewhere inside the step.
if (args.prepare_bam or args.mix_ends) and ((not args.prepared_bam_prefix) or
                                            (not args.temp_dir)):
    raise ValueError(
        "Options -e/--prepared_bam_prefix and -m/--temp_dir must be set if -p/--prepare_bam or mix_ends option is used"
    )

SamtoolsV1.threads = args.threads

if args.prepare_bam or args.mix_ends:
    FileRoutines.safe_mkdir(FileRoutines.check_path(args.temp_dir))
    prepared_pe_bam_file = "%s.bam" % args.prepared_bam_prefix
    # a separate BAM for unpaired reads is requested only in mix_ends mode
    prepared_unpaired_bam_file = (
        "%s.unpaired.bam" %
        args.prepared_bam_prefix) if args.mix_ends else None
    SamtoolsV1.prepare_bam_for_read_extraction(
        args.input,
        prepared_pe_bam_file,
        temp_file_prefix=args.temp_dir,
        max_memory_per_thread=args.max_memory_per_thread,
        bam_file_to_write_unpaired_reads=prepared_unpaired_bam_file)
if args.paired:
Esempio n. 16
0
# Options of the script: where to read sequences from, where to write the
# groups, sequence format / output extension and the file describing groups.
parser.add_argument(
    "-i", "--input_file_list",
    dest="input",
    action="store",
    required=True,
    type=FileRoutines.make_list_of_path_to_files_from_string,
    help="Comma-separated list of input files/directories with sequences",
)
parser.add_argument(
    "-o", "--output_directory",
    dest="output",
    action="store",
    type=FileRoutines.check_path,
    help="Directory to output groups_of sequences",
)
parser.add_argument(
    "-f", "--format",
    dest="format",
    action="store",
    default="fasta",
    help="Format of input and output files. Allowed formats genbank, fasta(default)",
)
parser.add_argument(
    "-e", "--extension",
    dest="extension",
    action="store",
    help="Extension of output files. Default: equal to -f",
)
parser.add_argument(
    "-d", "--id_file",
    dest="id_file",
    action="store",
    help="File with groups of sequences to extract(.fam file).",
)

args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
# the extension of output files falls back to the name of the format
args.extension = args.extension or args.format
tmp_index_file = "temp.idx"

id_list = IdSet(filename=args.id_file)

# groups of sequence ids, one group per key of the id file
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)

sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)

# every group goes to its own "<output><group>.<extension>" file
for group in sequence_groups_id:
    group_records = SequenceRoutines.record_by_id_generator(
        sequence_dict, sequence_groups_id[group], verbose=True)
    group_output_path = "%s%s.%s" % (args.output, group, args.extension)
    SeqIO.write(group_records, group_output_path, format=args.format)
Esempio n. 17
0
# Thresholds applied to indels; both are floats with built-in defaults.
parser.add_argument(
    "--indel_InbreedingCoeff", action="store", dest="indel_InbreedingCoeff",
    type=float, default=-0.8,
    help="Indel InbreedingCoeff threshold. Default -   -0.8")
parser.add_argument(
    "--indel_FS", action="store", dest="indel_FS",
    type=float, default=200.0,
    help="Indel FS threshold. Default - 200.0")

args = parser.parse_args()

# Tell the GATK wrapper where to look for the jar.
gatk_jar_dir = FileRoutines.check_path(args.gatk_dir)
VariantFiltration.jar_path = gatk_jar_dir

VariantFiltration.filter_bad_variants(
    args.reference,
    args.input_vcf,
    args.output_prefix,
    snp_filter_name=args.snp_filter_name,
    snp_QD=args.snp_QD,
    snp_FS=args.snp_FS,
    snp_MQ=args.snp_MQ,
    snp_HaplotypeScore=args.snp_HaplotypeScore,
    snp_MappingQualityRankSum=args.snp_MappingQualityRankSum,
    snp_ReadPosRankSum=args.snp_ReadPosRankSum,
    indel_filter_name=args.indel_filter_name,
    indel_QD=args.indel_QD,
    indel_ReadPosRankSum=args.indel_ReadPosRankSum,
Esempio n. 18
0
    def parallel_align(self,
                       list_of_files,
                       output_directory,
                       output_suffix=None,
                       tree_file=None,
                       output_format=None,
                       show_xml=None,
                       show_tree=None,
                       show_ancestral_sequences=None,
                       show_evolutionary_events=None,
                       showall=None,
                       compute_posterior_support=None,
                       njtree=None,
                       skip_insertions=False,
                       codon_alignment=None,
                       translated_alignment=None,
                       cmd_log_file=None,
                       cpus_per_task=1,
                       handling_mode="local",
                       job_name=None,
                       log_prefix=None,
                       error_log_prefix=None,
                       max_jobs=None,
                       max_running_time=None,
                       max_memory_per_node=None,
                       max_memmory_per_cpu=None,
                       modules_list=None,
                       environment_variables_dict=None):
        """
        Run the aligner once per input file, either locally or through SLURM.

        For every file in list_of_files an option string is built from the
        options shared by all runs plus " -d=<file>" and
        " -o=<output_directory>/<basename>[_<output_suffix>].fasta".

        handling_mode:
            local - option strings are passed to self.parallel_execute
            slurm - full command lines are passed to
                    self.slurm_run_multiple_jobs_in_wrap_mode
        Any other value results in no jobs being launched.
        """
        shared_options = self.parse_common_options(
            tree_file=tree_file,
            output_format=output_format,
            show_xml=show_xml,
            show_tree=show_tree,
            show_ancestral_sequences=show_ancestral_sequences,
            show_evolutionary_events=show_evolutionary_events,
            showall=showall,
            compute_posterior_support=compute_posterior_support,
            njtree=njtree,
            skip_insertions=skip_insertions,
            codon_alignment=codon_alignment,
            translated_alignment=translated_alignment)

        FileRoutines.safe_mkdir(output_directory)

        options_list = []
        for filename in list_of_files:
            basename = FileRoutines.split_filename(filename)[1]
            if output_suffix:
                output_name = "%s_%s" % (basename, output_suffix)
            else:
                output_name = basename
            per_file_options = " -d=%s" % filename
            per_file_options += " -o=%s/%s.fasta" % (output_directory,
                                                     output_name)
            options_list.append(shared_options + per_file_options)

        if handling_mode == "local":
            self.parallel_execute(options_list)
            return

        if handling_mode == "slurm":
            # SLURM needs complete command lines, so prepend the executable
            # (with its directory when self.path is set) to each option string.
            cmd_list = []
            for options in options_list:
                path_prefix = (self.path + "/") if self.path else ""
                cmd_list.append("%s%s %s" % (path_prefix, self.cmd, options))

            self.slurm_run_multiple_jobs_in_wrap_mode(
                cmd_list,
                cmd_log_file,
                max_jobs=max_jobs,
                job_name=job_name,
                log_prefix=log_prefix,
                error_log_prefix=error_log_prefix,
                cpus_per_node=None,
                max_running_jobs=None,
                max_running_time=max_running_time,
                cpus_per_task=cpus_per_task,
                max_memory_per_node=max_memory_per_node,
                max_memmory_per_cpu=max_memmory_per_cpu,
                modules_list=modules_list,
                environment_variables_dict=environment_variables_dict)
Esempio n. 19
0
    def parallel_search_tandem_repeat(self,
                                      query_file,
                                      output_prefix,
                                      matching_weight=2,
                                      mismatching_penalty=7,
                                      indel_penalty=7,
                                      match_probability=80,
                                      indel_probability=10,
                                      min_alignment_score=50,
                                      max_period=500,
                                      report_flanking_sequences=False,
                                      splited_fasta_dir="splited_fasta_dir",
                                      splited_result_dir="splited_output",
                                      converted_output_dir="converted_output",
                                      max_len_per_file=100000,
                                      store_intermediate_files=False):
        """
        Split query_file into pieces, run the tool on every piece in parallel
        and convert the resulting .dat reports into a single report.

        Steps:
            1. query_file is split into splited_fasta_dir
               (max_len_per_file per piece) via self.split_fasta_by_seq_len.
            2. The tool is run from inside splited_result_dir, one job per
               file found in splited_fasta_dir, with html output suppressed.
            3. The expected per-piece .dat files are handed to
               self.convert_trf_report together with output_prefix.
            4. Unless store_intermediate_files is set, splited_fasta_dir,
               splited_result_dir and converted_output_dir are deleted.

        Returns whatever self.convert_trf_report returns.

        The working directory is changed while the jobs run and is always
        restored afterwards, even when a job raises.
        """
        work_dir = os.getcwd()
        splited_filename = FileRoutines.split_filename(query_file)
        self.split_fasta_by_seq_len(query_file,
                                    splited_fasta_dir,
                                    max_len_per_file=max_len_per_file,
                                    output_prefix=splited_filename[1])

        common_options = self.parse_common_options(
            matching_weight=matching_weight,
            mismatching_penalty=mismatching_penalty,
            indel_penalty=indel_penalty,
            match_probability=match_probability,
            indel_probability=indel_probability,
            min_alignment_score=min_alignment_score,
            max_period=max_period,
            report_flanking_sequences=report_flanking_sequences,
            make_dat_file=True)
        common_options += " -h"  # suppress html output

        splited_files = os.listdir(splited_fasta_dir)

        FileRoutines.safe_mkdir(splited_result_dir)
        FileRoutines.safe_mkdir(converted_output_dir)

        # Resolve the input directory before leaving work_dir so the paths
        # stay valid whatever the depth of splited_result_dir is.
        input_dir = os.path.abspath(splited_fasta_dir)
        options_list = [
            "%s/%s" % (input_dir, filename) + common_options
            for filename in splited_files
        ]

        # Jobs are started from inside splited_result_dir; the .dat files are
        # later looked up there.
        os.chdir(splited_result_dir)
        try:
            self.parallel_execute(options_list)
        finally:
            os.chdir(work_dir)

        # Report name = input file name followed by the seven numeric settings.
        trf_output_file_list = [
            "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (
                splited_result_dir, filename, matching_weight,
                mismatching_penalty, indel_penalty, match_probability,
                indel_probability, min_alignment_score, max_period)
            for filename in splited_files
        ]

        trf_report = self.convert_trf_report(trf_output_file_list,
                                             output_prefix)

        if not store_intermediate_files:
            shutil.rmtree(splited_fasta_dir)
            shutil.rmtree(splited_result_dir)
            shutil.rmtree(converted_output_dir)

        return trf_report
Esempio n. 20
0
# Location of the STAR binaries.
STAR.path = args.star_dir

# A genome fasta on the command line means the index has to be built first.
if args.genome_fasta:
    STAR.index(
        args.genome_dir,
        args.genome_fasta,
        annotation_gtf=args.annotation_gtf,
        junction_tab_file=args.junction_tab_file,
        sjdboverhang=None,
        genomeSAindexNbases=None,
        genomeChrBinNbits=None,
        genome_size=args.genome_size)

# Explicit sample list wins; otherwise samples are taken from samples_dir.
sample_list = args.samples or Pipeline.get_sample_list(args.samples_dir)

FileRoutines.safe_mkdir(args.output_dir)

for sample in sample_list:
    print("Handling %s" % sample)
    sample_dir = "%s/%s/" % (args.samples_dir, sample)
    alignment_sample_dir = "%s/%s/" % (args.output_dir, sample)
    FileRoutines.safe_mkdir(alignment_sample_dir)
    filetypes, forward_files, reverse_files = FileRoutines.make_lists_forward_and_reverse_files(
        sample_dir)

    print "\tAligning reads..."

    STAR.align(
        args.genome_dir,
        forward_files,
        reverse_read_list=reverse_files,
Esempio n. 21
0
AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff)

# Every statistics file gets a percent histogram; the file name itself is
# passed as the second argument as well.
for stat_file in (
        output_evidence_stats,
        output_supported_stats,
        output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence,
        output_swissprot_pfam_or_hints_supported_transcripts_evidence,
        output_swissprot_pfam_and_hints_supported_transcripts_evidence):
    MatplotlibRoutines.percent_histogram_from_file(
        stat_file, stat_file,
        data_type=None,
        column_list=(2, ),
        comments="#",
        n_bins=20,
        title="Transcript support by hints",
        extensions=("png", "svg"),
        legend_location="upper center",
        stats_as_legend=True)

# With both databases available, sort the result files into two directories.
if args.pfam_db and args.swissprot_db:
    db_or_hints_dir = "supported_by_db_or_hints/"
    db_and_hints_dir = "supported_by_db_and_hints/"
    for directory in db_and_hints_dir, db_or_hints_dir:
        FileRoutines.safe_mkdir(directory)

    # The shell expands the trailing wildcard, hence os.system.
    move_or_cmd = "mv %s.supported.transcripts.swissprot_or_pfam_or_hints* %s" % (
        args.output, db_or_hints_dir)
    os.system(move_or_cmd)
    move_and_cmd = "mv %s.supported.transcripts.swissprot_or_pfam_and_hints* %s" % (
        args.output, db_and_hints_dir)
    os.system(move_and_cmd)
Esempio n. 22
0
 def __init__(self):
     # Only runs FileRoutines' initialiser on this instance; no other state
     # is set up here.
     FileRoutines.__init__(self)
Esempio n. 23
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import MultipleAlignmentRoutines, FileRoutines

# Command-line interface: alignments to read, directory to write to and the
# alignment format.
parser = argparse.ArgumentParser()

parser.add_argument(
    "-i", "--input",
    dest="input",
    action="store",
    required=True,
    # The raw value is split on commas before being turned into a file list.
    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
    help="Comma-separated list of files/directories with alignments")
parser.add_argument(
    "-o", "--output_directory",
    dest="output_dir",
    action="store",
    default="./",
    help="Output directory to write resulting files. Default - current directory")
parser.add_argument(
    "-f", "--format",
    dest="format",
    action="store",
    default="fasta",
    help="Format of alignments")
Esempio n. 24
0
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="File to write clusters with single-copy clusters")
# The long option used to be spelled "--label position"; an option string
# containing a space cannot be typed as a normal flag, so it is spelled with
# an underscore. The short form -p is unchanged.
parser.add_argument(
    "-p",
    "--label_position",
    action="store",
    dest="label_position",
    default="first",
    help="Position of label. Allowed - first, last. Default - first")
parser.add_argument("-s",
                    "--separator",
                    action="store",
                    dest="separator",
                    default="@",
                    help="Separator to use. default - '@'")

args = parser.parse_args()

list_of_cluster_files = FileRoutines.make_list_of_path_to_files(args.input)

# The extraction writes to args.output; the returned collection is only used
# here to report how many clusters were found.
single_copy_clusters = SequenceClusterRoutines.extract_single_copy_clusters_from_files(
    list_of_cluster_files,
    args.output,
    label_elements=args.label,
    separator=args.separator,
    label_position=args.label_position)

# Call form prints the same text on Python 2 and is valid on Python 3.
print("Was found %i single-copy clusters" % len(single_copy_clusters))
Esempio n. 25
0
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="File to write clusters with labeled elements")
# The long option used to be spelled "--label position"; an option string
# containing a space cannot be typed as a normal flag, so it is spelled with
# an underscore. The short form -p is unchanged.
parser.add_argument(
    "-p",
    "--label_position",
    action="store",
    dest="label_position",
    default="first",
    help="Position of label. Allowed - first, last. Default - first")
parser.add_argument("-s",
                    "--separator",
                    action="store",
                    dest="separator",
                    default="@",
                    help="Separator to use. default - '@'")

args = parser.parse_args()

# Without an explicit label, the second element returned by split_filename
# for the cluster file is used as the label.
label = args.label if args.label else FileRoutines.split_filename(
    args.cluster_file)[1]

SequenceClusterRoutines.label_cluster_elements_from_file(
    args.cluster_file,
    label,
    args.output,
    separator=args.separator,
    label_position=args.label_position)
Esempio n. 26
0
    def align_samples(self,
                      samples_dir,
                      output_dir,
                      genome_dir,
                      genome_fasta=None,
                      samples=None,
                      annotation_gtf=None,
                      sjdboverhang=None,
                      genomeSAindexNbases=None,
                      genomeChrBinNbits=None,
                      genome_size=None,
                      feature_from_gtf_to_use_as_exon=None,
                      exon_tag_to_use_as_transcript_id=None,
                      exon_tag_to_use_as_gene_id=None,
                      length_of_sequences_flanking_junction=None,
                      junction_tab_file_list=None,
                      three_prime_trim=None,
                      five_prime_trim=None,
                      adapter_seq_for_three_prime_clip=None,
                      max_mismatch_percent_for_adapter_trimming=None,
                      three_prime_trim_after_adapter_clip=None,
                      output_type="BAM",
                      sort_bam=True,
                      max_memory_for_bam_sorting=8000000000,
                      include_unmapped_reads_in_bam=True,
                      output_unmapped_reads=True,
                      two_pass_mode=True,
                      max_intron_length=None):
        """
        Align every sample found in samples_dir with STAR and index the
        resulting bam files.

        If genome_fasta is given, a genome index is generated in genome_dir
        first. In that case annotation_gtf is used for the index and is NOT
        passed to the alignment step; otherwise it is passed to the alignment.

        samples: explicit list of sample names (sub-directories of
                 samples_dir); when empty, self.get_sample_list(samples_dir)
                 is used.

        For each sample the reads are taken from <samples_dir>/<sample>/, the
        alignment is written to <output_dir>/<sample>/ and
        Aligned.sortedByCoord.out.bam in that directory is indexed.
        All remaining arguments are forwarded unchanged to STAR.align.
        """
        #STAR.threads = threads
        #STAR.path = star_dir

        if genome_fasta:
            # NOTE(review): the list of junction files is forwarded as
            # junction_tab_file; confirm STAR.index copes with a list.
            STAR.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=annotation_gtf,
                       junction_tab_file=junction_tab_file_list,
                       sjdboverhang=sjdboverhang,
                       genomeSAindexNbases=genomeSAindexNbases,
                       genomeChrBinNbits=genomeChrBinNbits,
                       genome_size=genome_size)

        sample_list = samples if samples else self.get_sample_list(samples_dir)

        FileRoutines.safe_mkdir(output_dir)

        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_dir, sample)
            alignment_sample_dir = "%s/%s/" % (output_dir, sample)
            FileRoutines.safe_mkdir(alignment_sample_dir)
            filetypes, forward_files, reverse_files = FileRoutines.make_lists_forward_and_reverse_files(
                sample_dir)

            # Call form of print: same output on Python 2, valid on Python 3,
            # and consistent with the "Handling ..." message above.
            print("\tAligning reads...")

            STAR.align(
                genome_dir,
                forward_files,
                reverse_read_list=reverse_files,
                # The annotation is already part of a freshly built index.
                annotation_gtf=annotation_gtf if not genome_fasta else None,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=
                exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=
                length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=
                adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=
                max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=
                three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)

            print("\tIndexing bam file...")
            # NOTE(review): this file name is fixed; presumably it only exists
            # for output_type="BAM" with sort_bam=True - confirm.
            resulting_bam_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir
            SamtoolsV1.index(resulting_bam_file)
Esempio n. 27
0
    def parallel_codeml(self,
                        in_dir,
                        tree_file,
                        out_dir,
                        seq_type="codons",
                        codon_frequency="F3X4",
                        noisy=0,
                        verbose="concise",
                        runmode=0,
                        clock=0,
                        aminoacid_distance=None,
                        model=1,
                        nssites=0,
                        genetic_code=0,
                        fix_kappa=False,
                        kappa=5,
                        fix_omega=False,
                        omega=0.2,
                        getSE=0,
                        RateAncestor=0,
                        small_difference=0.000001,
                        clean_data=True,
                        method=0,
                        Mgene=None):
        """
        Prepare one control file per alignment and run all of them in parallel.

        For every alignment file listed for in_dir a sub-directory
        <out_dir>/<basename>/ is created, holding <basename>.ctl (written by
        self.generate_ctl_file) and named as the target of <basename>.out.
        The control files are then handed to self.parallel_execute, each with
        its own directory in dir_list.

        tree_file is shared by all runs and is referenced by absolute path.
        The remaining arguments are passed unchanged to generate_ctl_file.
        """
        FileRoutines.safe_mkdir(out_dir)
        alignment_files_list = FileRoutines.make_list_of_path_to_files(in_dir)
        tree_file_abs_path = os.path.abspath(tree_file)

        # Settings written identically into every control file.
        ctl_settings = dict(seq_type=seq_type,
                            codon_frequency=codon_frequency,
                            noisy=noisy,
                            verbose=verbose,
                            runmode=runmode,
                            clock=clock,
                            aminoacid_distance=aminoacid_distance,
                            model=model,
                            nssites=nssites,
                            genetic_code=genetic_code,
                            fix_kappa=fix_kappa,
                            kappa=kappa,
                            fix_omega=fix_omega,
                            omega=omega,
                            getSE=getSE,
                            RateAncestor=RateAncestor,
                            Mgene=Mgene,
                            small_difference=small_difference,
                            clean_data=clean_data,
                            method=method)

        options_list = []
        dir_list = []
        for alignment_file in alignment_files_list:
            _directory, basename, _extension = FileRoutines.split_filename(
                alignment_file)
            run_dir = os.path.abspath("%s/%s/" % (out_dir, basename))
            out_file = "%s/%s.out" % (run_dir, basename)
            ctl_file = "%s/%s.ctl" % (run_dir, basename)

            options_list.append(ctl_file)
            dir_list.append(run_dir)

            # The run directory must exist before the control file is written.
            FileRoutines.safe_mkdir(run_dir)
            self.generate_ctl_file(os.path.abspath(alignment_file),
                                   tree_file_abs_path,
                                   out_file,
                                   ctl_file,
                                   **ctl_settings)

        self.parallel_execute(options_list, dir_list=dir_list)
Esempio n. 28
0
                    action="store_true",
                    dest="header",
                    default=False,
                    help="Set if header is present in input file")
# Flags controlling how the output files are named and how the input is read.
parser.add_argument(
    "-u", "--use_column_value_as_prefix",
    action="store_true",
    dest="use_column_value_as_prefix",
    default=False,
    help="Use column value as prefix for output files")
parser.add_argument(
    "-r", "--sorted_input",
    action="store_true",
    dest="sorted_input",
    default=False,
    help="Input file is sorted. Do it to reduce number of simultaneously opened files")

args = parser.parse_args()

# All parsed options are handed straight to the splitting routine.
FileRoutines.split_by_column(args.input_file,
                             args.column_number,
                             separator=args.separator,
                             header=args.header,
                             outfile_prefix=args.output_prefix,
                             use_column_value_as_prefix=args.use_column_value_as_prefix,
                             sorted_input=args.sorted_input)
# Three mandatory files (input vcf, output vcf, reference) plus the optional
# GATK location.
parser.add_argument(
    "-i", "--input_vcf",
    action="store", dest="input_vcf", required=True,
    help="Input vcf file")
parser.add_argument(
    "-o", "--output_vcf",
    action="store", dest="output_vcf", required=True,
    help="Output vcf file")
parser.add_argument(
    "-r", "--reference",
    action="store", dest="reference", required=True,
    help="Fasta with reference genome")
parser.add_argument(
    "-g", "--gatk_directory",
    action="store", dest="gatk_dir", default="",
    help="Directory with GATK jar")
args = parser.parse_args()

# Point the wrapper at the jar, then run it on the requested files.
gatk_jar_dir = FileRoutines.check_path(args.gatk_dir)
SelectVariants.jar_path = gatk_jar_dir
SelectVariants.remove_entries_with_filters(
    args.reference, args.input_vcf, args.output_vcf)
Esempio n. 30
0
    def extract_sequences_by_clusters(self,
                                      dir_with_cluster_files,
                                      dir_with_sequence_files,
                                      output_dir,
                                      file_with_white_list_cluster_ids=None,
                                      mode="families",
                                      sequence_file_extension="fasta",
                                      sequence_file_format="fasta",
                                      label_species=False,
                                      separator_for_labeling="@",
                                      species_label_first=True):
        """
        Write the sequences referenced by cluster files into separate files.

        Basenames of cluster and sequence files must be the same: for every
        species read from dir_with_cluster_files the sequences are taken from
        <dir_with_sequence_files><species>.<sequence_file_extension>.

        mode:
            families - one output file per cluster, collecting the members of
                       that cluster from every species,
            species - one output file per species.
        Any other value writes nothing.

        file_with_white_list_cluster_ids: optional file with ids of clusters
            to restrict the extraction to.
        label_species: in "families" mode, rename every record to
            <species><separator><id>, or <id><separator><species> when
            species_label_first is False. Records are copied before renaming.

        A temporary index file "<species>_tmp.idx" is created in the current
        directory for every species and removed again on exit, also when the
        extraction fails, so a stale index is never picked up by a later run.
        """
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(
            dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict,
                                               white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = FileRoutines.check_path(output_dir)

        # Index files that may exist on disk and have to be cleaned up.
        index_files = []
        try:
            for species in clusters_dict:
                idx_file = "%s_tmp.idx" % species
                # Registered before indexing: a failed indexing attempt may
                # still leave the file behind.
                index_files.append(idx_file)
                sequence_file = "%s%s.%s" % (FileRoutines.check_path(
                    dir_with_sequence_files), species, sequence_file_extension)
                sequence_super_dict[species] = SeqIO.index_db(
                    idx_file, sequence_file, format=sequence_file_format)

            if mode == "species":
                sequence_names = self.get_sequence_names(
                    clusters_dict,
                    write_ids=False,
                    out_prefix=None,
                    white_list_ids=white_list_ids)
                for species in sequence_names:
                    out_file = "%s%s.%s" % (out_dir, species,
                                            sequence_file_extension)
                    SeqIO.write(SequenceRoutines.record_by_id_generator(
                        sequence_super_dict[species], sequence_names[species]),
                                out_file,
                                format=sequence_file_format)
            elif mode == "families":

                def per_family_record_generator(seq_super_dict, clust_dict,
                                                cluster_id):
                    # Yield the members of one cluster, species by species.
                    for species in seq_super_dict:
                        for record_id in clust_dict[species][cluster_id]:
                            if not label_species:
                                yield seq_super_dict[species][record_id]
                                continue
                            # Copy first so the indexed record keeps its id.
                            record = deepcopy(
                                seq_super_dict[species][record_id])
                            if species_label_first:
                                record.id = "%s%s%s" % (
                                    species, separator_for_labeling, record_id)
                            else:
                                record.id = "%s%s%s" % (
                                    record_id, separator_for_labeling, species)
                            yield record

                for cluster_name in cluster_names:
                    out_file = "%s%s.%s" % (out_dir, cluster_name,
                                            sequence_file_extension)
                    SeqIO.write(per_family_record_generator(
                        sequence_super_dict, clusters_dict, cluster_name),
                                out_file,
                                format=sequence_file_format)
        finally:
            for idx_file in index_files:
                if os.path.exists(idx_file):
                    os.remove(idx_file)