Beispiel #1
0
    def parallel_blast(self,
                       blast_command,
                       seqfile,
                       database,
                       outfile=None,
                       blast_options=None,
                       split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None,
                       output_format=None,
                       threads=None,
                       num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):
        """
        Split seqfile into chunks, run blast_command on every chunk via
        self.parallel_execute and, if requested, concatenate the per-chunk
        results into outfile.

        For every file found in split_dir after splitting, one command line is
        built as " -out <hits> -db <database> -query <chunk>" followed by the
        optional parts (blast_options, -evalue, -outfmt).

        :param blast_command: passed to self.parallel_execute as cmd
        :param seqfile: sequence file handed to self.split_fasta
        :param database: value of the -db option
        :param outfile: destination passed to CGAS.cat when outputs are combined
        :param blast_options: extra options appended verbatim; skipped if falsy
        :param split_dir: directory receiving the chunks of seqfile
        :param splited_output_dir: directory receiving per-chunk ".hits" files
        :param evalue: value of -evalue; skipped if falsy
        :param output_format: value of -outfmt, formatted with %i; skipped if falsy
        :param threads: passed to self.parallel_execute; also used to derive
                        the number of chunks (5 * threads, else 5 * self.threads)
        :param num_of_seqs_per_scan: if truthy, used instead of the derived chunk count
        :param combine_output_to_single_file: concatenate per-chunk outputs with CGAS.cat
        :param async_run: passed to self.parallel_execute
        :param external_process_pool: passed to self.parallel_execute
        """
        # Both directories are used below as plain string prefixes ("%s%s"),
        # so check_path presumably guarantees a trailing separator - TODO confirm.
        chunk_dir = FileRoutines.check_path(split_dir)
        hits_dir = FileRoutines.check_path(splited_output_dir)
        for directory in (chunk_dir, hits_dir):
            self.safe_mkdir(directory)

        # NOTE(review): despite its name, num_of_seqs_per_scan ends up as the
        # num_of_files argument of split_fasta - confirm this is intended.
        if num_of_seqs_per_scan:
            chunk_count = num_of_seqs_per_scan
        elif threads:
            chunk_count = 5 * threads
        else:
            chunk_count = 5 * self.threads
        self.split_fasta(seqfile, chunk_dir, num_of_files=chunk_count)

        options_list = []
        out_files = []
        for chunk_name in sorted(os.listdir(chunk_dir)):
            chunk_prefix = FileRoutines.split_filename(chunk_name)[1]
            chunk_path = "%s%s" % (chunk_dir, chunk_name)
            hits_path = "%s%s.hits" % (hits_dir, chunk_prefix)

            pieces = [" -out %s" % hits_path,
                      " -db %s" % database,
                      " -query %s" % chunk_path]
            if blast_options:
                pieces.append(" %s" % blast_options)
            if evalue:
                pieces.append(" -evalue %s" % evalue)
            if output_format:
                pieces.append(" -outfmt %i" % output_format)

            options_list.append("".join(pieces))
            out_files.append(hits_path)

        self.parallel_execute(options_list,
                              cmd=blast_command,
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
Beispiel #2
0
    def get_ids_from_hmm3(hmmfile, ids_file=None, return_ids_list=False):
        """
        Extract ids from an hmm3 file: lines matching the whole word "NAME" are
        selected and their second field is printed (awk "{print $2}").

        Destination of the ids:
            return_ids_list is true: output is captured and returned, ids_file is ignored
            return_ids_list is false and ids_file is None: ids are written to stdout
            return_ids_list is false and ids_file is set: ids are written to ids_file

        Returns whatever CGAS.cgas returns for the chosen mode.
        """
        # A destination file is only forwarded when the output is not captured.
        destination = None if return_ids_list else ids_file
        return CGAS.cgas(hmmfile,
                         grep_pattern="NAME",
                         whole_word_match=True,
                         awk_code="{print $2}",
                         capture_output=return_ids_list,
                         output=destination)
Beispiel #3
0
    def split_hmm(self, hmmfile, output_dir, num_of_recs_per_file, num_of_files=None, output_prefix=None, threads=4):
        """
        Split an hmm file into several smaller files by running "hmmfetch -f"
        on consecutive chunks of its profile ids.

        The ids are the second field of the "NAME" lines of hmmfile. For each
        chunk, "<output_dir>/<prefix>_<index>.ids" is written and a hmmfetch
        command redirecting its output to "<output_dir>/<prefix>_<index>.hmm"
        is queued (index starts at 1). All commands are then run through
        self.parallel_execute.

        :param hmmfile: path to the hmm file to split
        :param output_dir: directory for the .ids/.hmm files; created if missing
        :param num_of_recs_per_file: number of ids per chunk; used only when
                                     num_of_files is not set
        :param num_of_files: if truthy, the chunk size is computed as
                             number_of_ids // num_of_files + 1
        :param output_prefix: prefix of the produced file names; defaults to
                              self.split_filename(hmmfile)[1]
        :param threads: passed to self.parallel_execute
        :raises ValueError: if the resulting chunk size is not a positive number
        :raises OSError: if output_dir does not exist and cannot be created
        """
        try:
            os.mkdir(output_dir)
        except OSError:
            # An already existing directory is fine; any other failure
            # (permissions, missing parent, path is a file) must not be hidden.
            if not os.path.isdir(output_dir):
                raise

        id_fd = CGAS.cgas(hmmfile, grep_pattern="NAME", whole_word_match=True, awk_code="{print $2}",
                          capture_output=True)

        ids_list = IdList()
        ids_list.read(id_fd, close_after_if_file_object=True)
        number_of_ids = len(ids_list)
        out_prefix = self.split_filename(hmmfile)[1] if output_prefix is None else output_prefix

        if num_of_files:
            num_of_ids = number_of_ids // num_of_files + 1
        else:
            num_of_ids = num_of_recs_per_file
        # A zero chunk size used to loop forever and None failed with an
        # unrelated TypeError; reject both explicitly.
        if not num_of_ids or num_of_ids < 1:
            raise ValueError("Number of records per file must be a positive integer, got %r" % (num_of_ids,))

        common_options = " -f"
        common_options += " %s" % hmmfile

        options_list = []
        # Stepping through the id list covers full chunks and the trailing
        # partial chunk with the same code.
        for split_index, start in enumerate(range(0, number_of_ids, num_of_ids), 1):
            end = min(start + num_of_ids, number_of_ids)
            ids_path = "%s/%s_%i.ids" % (output_dir, out_prefix, split_index)
            hmm_path = "%s/%s_%i.hmm" % (output_dir, out_prefix, split_index)

            IdList(ids_list[start:end]).write(ids_path)

            options = common_options
            options += " %s" % ids_path
            options += " > %s" % hmm_path
            options_list.append(options)

        self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
Beispiel #4
0
 def extract_gene_ids_from_output(augustus_output, all_genes_output):
     """
     Select the lines of augustus_output that contain "<tab>gene<tab>" and
     strip everything up to and including the last "ID=" from each of them;
     the result goes to all_genes_output.
     """
     # Both strings carry their own single quotes and are passed on as is.
     gene_line_pattern = "'\\tgene\\t'"
     drop_prefix_up_to_id = "'s/.*ID=//'"
     CGAS.cgas(augustus_output,
               grep_pattern=gene_line_pattern,
               sed_string=drop_prefix_up_to_id,
               output=all_genes_output,
               grep_use_regexp=True)
Beispiel #5
0
 def extract_CDS_annotations_from_output(augustus_output, CDS_output):
     """
     Copy the lines of augustus_output that contain "<tab>CDS<tab>" to
     CDS_output using CGAS.grep in regexp mode.
     """
     # The pattern carries its own single quotes and is passed on as is.
     cds_line_pattern = "'\\tCDS\\t'"
     CGAS.grep(cds_line_pattern,
               augustus_output,
               output=CDS_output,
               use_regexp=True)
Beispiel #6
0
    def parallel_predict(self,
                         species,
                         genome_file,
                         output,
                         strand="both",
                         gene_model=None,
                         output_gff3=True,
                         other_options="",
                         split_dir="splited_input",
                         splited_output_dir="splited_output_dir",
                         config_dir=None,
                         combine_output_to_single_file=True,
                         use_softmasking=None,
                         hints_file=None,
                         extrinsicCfgFile=None,
                         predict_UTR=None,
                         external_process_pool=None,
                         async_run=False,
                         min_intron_len=None,
                         parsing_mode="parse"):
        """
        Split genome_file into pieces, run the tool on every piece through
        self.parallel_execute and, if requested, concatenate the per-piece
        ".gff" files into output.

        The option string shared by all runs comes from self.parse_options,
        called with an empty genome_file; each command line is that string
        followed by " <piece> > <piece>.gff".

        :param species, strand, gene_model, output_gff3, other_options,
               config_dir, use_softmasking, hints_file, extrinsicCfgFile,
               predict_UTR, min_intron_len: forwarded to self.parse_options
        :param genome_file: file handed to self.split_fasta_by_seq_len
        :param output: destination passed to CGAS.cat when outputs are combined
        :param split_dir: directory receiving the pieces of genome_file
        :param splited_output_dir: directory receiving the per-piece .gff files
        :param combine_output_to_single_file: concatenate outputs with CGAS.cat
        :param external_process_pool: passed to self.parallel_execute
        :param async_run: passed to self.parallel_execute
        :param parsing_mode: passed to self.split_fasta_by_seq_len
        """
        # genome_file is deliberately left empty here: the actual input is
        # appended per piece further down.
        shared_options = self.parse_options(species,
                                            genome_file="",
                                            strand=strand,
                                            gene_model=gene_model,
                                            output_gff3=output_gff3,
                                            other_options=other_options,
                                            config_dir=config_dir,
                                            use_softmasking=use_softmasking,
                                            hints_file=hints_file,
                                            extrinsicCfgFile=extrinsicCfgFile,
                                            predict_UTR=predict_UTR,
                                            min_intron_len=min_intron_len)

        # The directories are used as plain string prefixes below, so
        # check_path presumably guarantees a trailing separator - TODO confirm.
        pieces_dir = FileRoutines.check_path(split_dir)
        results_dir = FileRoutines.check_path(splited_output_dir)
        for directory in (pieces_dir, results_dir):
            FileRoutines.safe_mkdir(directory)

        self.split_fasta_by_seq_len(genome_file, pieces_dir, parsing_mode=parsing_mode)

        piece_names = sorted(os.listdir(pieces_dir))
        list_of_output_files = ["%s%s.gff" % (results_dir, piece_name) for piece_name in piece_names]
        options_list = [shared_options + " %s" % ("%s%s" % (pieces_dir, piece_name)) + " > %s" % gff_path
                        for piece_name, gff_path in zip(piece_names, list_of_output_files)]

        self.parallel_execute(options_list,
                              external_process_pool=external_process_pool,
                              async_run=async_run)

        if combine_output_to_single_file:
            CGAS.cat(list_of_output_files, output=output)
Beispiel #7
0
    def parallel_hmmscan(self, hmmfile, seqfile, outfile, num_of_seqs_per_scan=None, split_dir="splited_fasta",
                         splited_output_dir="splited_output_dir",
                         tblout_outfile=None, domtblout_outfile=None, pfamtblout_outfile=None,
                         splited_tblout_dir=None, splited_domtblout_dir=None, splited_pfamtblout_dir=None,
                         dont_output_alignments=False, model_evalue_threshold=None, model_score_threshold=None,
                         domain_evalue_threshold=None, domain_score_threshold=None,
                         model_evalue_significant_threshold=None, model_score_significant_threshold=None,
                         domain_evalue_significant_threshold=None, domain_score_significant_threshold=None,
                         use_profile_GA_gathering_cutoffs_for_thresholds=False,
                         use_profile_NC_noise_cutoffs_for_thresholds=False,
                         use_profile_TC_trusted_cutoffs_for_thresholds=False,
                         turn_off_all_heruristics=False, turn_off_bias_filter=False,
                         MSV_threshold=None, Vit_threshold=None, Fwd_threshold=None,
                         turn_off_biased_composition_score_corrections=None,
                         input_format=None, threads=None, combine_output_to_single_file=True,
                         biopython_165_compartibility=False,
                         remove_tmp_dirs=True,
                         async_run=False, external_process_pool=None
                         ):
        """
        Split seqfile into chunks, run hmmscan with hmmfile on every chunk via
        self.parallel_execute and merge the per-chunk results.

        :param hmmfile: profile database given to hmmscan
        :param seqfile: sequence file handed to self.split_fasta
        :param outfile: destination of the combined main output
        :param num_of_seqs_per_scan: if truthy, used instead of the derived
                                     chunk count (5 * threads, else 5 * self.threads)
        :param split_dir: directory receiving the chunks of seqfile
        :param splited_output_dir: directory receiving per-chunk ".hits" files
        :param tblout_outfile, domtblout_outfile, pfamtblout_outfile:
               destinations of the combined table outputs; each one requires
               the matching splited_*_dir
        :param splited_tblout_dir, splited_domtblout_dir, splited_pfamtblout_dir:
               directories for per-chunk table outputs; when set, the
               matching --tblout/--domtblout/--pfamtblout option is added
        :param dont_output_alignments ... turn_off_biased_composition_score_corrections:
               forwarded unchanged to the private common-options parser
        :param input_format: value of --qformat; skipped if falsy
        :param threads: passed to self.parallel_execute
        :param combine_output_to_single_file: merge per-chunk main outputs into outfile
        :param biopython_165_compartibility: when combining, replace every
               "Description:" line with "Description: <unknown description>"
        :param remove_tmp_dirs: delete all split directories at the end; this
               includes splited_output_dir even if the outputs were not combined
        :param async_run: passed to self.parallel_execute
        :param external_process_pool: passed to self.parallel_execute
        :raises ValueError: if a combined table output is requested without
                            the matching per-chunk directory
        """
        # Fail before any hmmscan job is started: without a per-chunk directory
        # no table files are produced and there would be nothing to concatenate.
        for combined_file, chunk_dir, label in ((tblout_outfile, splited_tblout_dir, "tblout"),
                                                (domtblout_outfile, splited_domtblout_dir, "domtblout"),
                                                (pfamtblout_outfile, splited_pfamtblout_dir, "pfamtblout")):
            if combined_file and not chunk_dir:
                raise ValueError("%s_outfile was set but splited_%s_dir was not" % (label, label))

        splited_dir = self.check_path(split_dir)
        splited_out_dir = self.check_path(splited_output_dir)
        self.safe_mkdir(splited_dir)
        self.safe_mkdir(splited_out_dir)

        for optional_dir in (splited_tblout_dir, splited_domtblout_dir, splited_pfamtblout_dir):
            if optional_dir:
                self.safe_mkdir(optional_dir)

        # NOTE(review): despite its name, num_of_seqs_per_scan ends up as the
        # num_of_files argument of split_fasta - confirm this is intended.
        number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan else 5 * threads if threads else 5 * self.threads
        self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)

        common_options = self.__parse_hmmsxxx_common_options(tblout=None, domtblout=None,
                                                             pfamtblout=None,
                                                             dont_output_alignments=dont_output_alignments,
                                                             model_evalue_threshold=model_evalue_threshold,
                                                             model_score_threshold=model_score_threshold,
                                                             domain_evalue_threshold=domain_evalue_threshold,
                                                             domain_score_threshold=domain_score_threshold,
                                                             model_evalue_significant_threshold=model_evalue_significant_threshold,
                                                             model_score_significant_threshold=model_score_significant_threshold,
                                                             domain_evalue_significant_threshold=domain_evalue_significant_threshold,
                                                             domain_score_significant_threshold=domain_score_significant_threshold,
                                                             use_profile_GA_gathering_cutoffs_for_thresholds=use_profile_GA_gathering_cutoffs_for_thresholds,
                                                             use_profile_NC_noise_cutoffs_for_thresholds=use_profile_NC_noise_cutoffs_for_thresholds,
                                                             use_profile_TC_trusted_cutoffs_for_thresholds=use_profile_TC_trusted_cutoffs_for_thresholds,
                                                             turn_off_all_heruristics=turn_off_all_heruristics,
                                                             turn_off_bias_filter=turn_off_bias_filter,
                                                             MSV_threshold=MSV_threshold, Vit_threshold=Vit_threshold,
                                                             Fwd_threshold=Fwd_threshold,
                                                             turn_off_biased_composition_score_corrections=turn_off_biased_composition_score_corrections)
        # Every hmmscan process is started with a fixed --cpu value of 5.
        common_options += " --cpu %i" % 5
        if input_format:
            # The format must actually be substituted; previously the literal
            # "%s" placeholder ended up on the command line.
            common_options += " --qformat %s" % input_format

        options_list = []
        out_files = []
        tblout_files = []
        domtblout_files = []
        pfamtblout_files = []

        for filename in sorted(os.listdir(splited_dir)):
            hits_name = "%s.hits" % self.split_filename(filename)[1]

            in_file = "%s%s" % (splited_dir, filename)
            out_filename = "%s%s" % (splited_out_dir, hits_name)
            # The optional directories are taken as given by the caller, so
            # join them properly instead of relying on a trailing separator.
            tblout_file = os.path.join(splited_tblout_dir, hits_name) if splited_tblout_dir else None
            domtblout_file = os.path.join(splited_domtblout_dir, hits_name) if splited_domtblout_dir else None
            pfamtblout_file = os.path.join(splited_pfamtblout_dir, hits_name) if splited_pfamtblout_dir else None

            options = common_options
            if tblout_file:
                options += " --tblout %s" % tblout_file
            if domtblout_file:
                options += " --domtblout %s" % domtblout_file
            if pfamtblout_file:
                options += " --pfamtblout %s" % pfamtblout_file
            options += " -o %s" % out_filename

            options += " %s" % hmmfile
            options += " %s" % in_file

            options_list.append(options)
            out_files.append(out_filename)
            tblout_files.append(tblout_file)
            domtblout_files.append(domtblout_file)
            pfamtblout_files.append(pfamtblout_file)

        self.parallel_execute(options_list, cmd="hmmscan", threads=threads, async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            if biopython_165_compartibility:
                CGAS.cgas(out_files, sed_string="s/^Description:.*/Description: <unknown description>/", output=outfile)
            else:
                CGAS.cat(out_files, output=outfile)
        if tblout_outfile:
            CGAS.cat(tblout_files, output=tblout_outfile)
        if domtblout_outfile:
            CGAS.cat(domtblout_files, output=domtblout_outfile)
        if pfamtblout_outfile:
            CGAS.cat(pfamtblout_files, output=pfamtblout_outfile)

        if remove_tmp_dirs:
            if splited_tblout_dir:
                shutil.rmtree(splited_tblout_dir)
            if splited_domtblout_dir:
                shutil.rmtree(splited_domtblout_dir)
            if splited_pfamtblout_dir:
                shutil.rmtree(splited_pfamtblout_dir)
            for tmp_dir in splited_dir, splited_out_dir:
                shutil.rmtree(tmp_dir)