Beispiel #1
0
def RepeatModeler_search(query_file, db_name, output_file="run.out",
                         num_of_threads=5, RepeatModeler_dir=""):
    """Build a RepeatModeler database and run RepeatModeler on it.

    Two shell commands are issued through ``os.system``, one after the
    other; their exit statuses are not inspected.

    :param query_file: sequence file handed to ``BuildDatabase``.
    :param db_name: name used for ``BuildDatabase -name`` and
        ``RepeatModeler -database``.
    :param output_file: file receiving RepeatModeler's redirected stdout.
    :param num_of_threads: value of the ``-pa`` option (formatted with ``%i``).
    :param RepeatModeler_dir: directory of the executables; it is passed
        through ``FileRoutines.check_path`` and prepended to both commands.
    """
    print("\nRepeatModeler search...\n")
    tool_prefix = FileRoutines.check_path(RepeatModeler_dir)

    build_db_cmd = "BuildDatabase -engine ncbi  -name %s %s" % (db_name, query_file)
    os.system(tool_prefix + build_db_cmd)

    # Formatted after the database step, so an invalid thread count only
    # surfaces once BuildDatabase has already been launched.
    model_cmd = "RepeatModeler -engine ncbi -pa %i -database %s > %s" % (
        num_of_threads, db_name, output_file)
    os.system(tool_prefix + model_cmd)
Beispiel #2
0
def TRF_search(query_file, match=2, mismatch=7, delta=7, PM=80,
               PI=10, minscore=50, max_period=500, flanked=False, TRF_dir=""):
    """Run ``trf`` on ``query_file`` through ``os.system``.

    Tool usage:
        trf File Match Mismatch Delta PM PI Minscore MaxPeriod [options]
    where (all weights, penalties, and scores are positive):
        File      = sequences input file
        Match     = matching weight
        Mismatch  = mismatching penalty
        Delta     = indel penalty
        PM        = match probability (whole number)
        PI        = indel probability (whole number)
        Minscore  = minimum alignment score to report
        MaxPeriod = maximum period size to report
        [options] = one or more of the following:
            -m masked sequence file
            -f flanking sequence
            -d data file
            -h suppress HTML output
    Recommended options: trf yoursequence.txt 2 7 7 80 10 50 500 -f -d -m

    ``-d -m`` are always appended; ``-f`` is added only when ``flanked`` is
    truthy.  The seven numeric arguments are formatted with ``%i``.

    :param TRF_dir: directory of the ``trf`` executable; passed through
        ``FileRoutines.check_path`` and prepended to the command.
    """
    print("\nTRF search...\n")

    flank_option = "-f" if flanked else ""
    trf_prefix = FileRoutines.check_path(TRF_dir)

    numeric_args = (match, mismatch, delta, PM, PI, minscore, max_period)
    trf_command = "trf %s %i %i %i %i %i %i %i %s -d -m" % (
        (query_file,) + numeric_args + (flank_option,))
    os.system(trf_prefix + trf_command)
Beispiel #3
0
    def parallel_blast(self, blast_command, seqfile, database, outfile=None,
                       blast_options=None, split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None, output_format=None,
                       threads=None, num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):
        """Split ``seqfile`` into chunks and run ``blast_command`` on each chunk.

        One option string (``-out``, ``-db``, ``-query`` plus the optional
        parts) is built per chunk file found in the split directory, and the
        whole list is handed to ``self.parallel_execute``.

        :param blast_command: command forwarded as ``cmd`` to ``parallel_execute``.
        :param seqfile: input file given to ``self.split_fasta``.
        :param database: value of ``-db``.
        :param outfile: target passed to ``CGAS.cat`` when outputs are combined.
        :param blast_options: extra option text appended verbatim when truthy.
        :param split_dir: directory receiving the chunk files.
        :param splited_output_dir: directory receiving the per-chunk ``.hits`` files.
        :param evalue: value of ``-evalue``; skipped when falsy.
        :param output_format: value of ``-outfmt`` (formatted with ``%i``);
            skipped when falsy.
        :param threads: forwarded to ``parallel_execute``; also used to derive
            the number of chunks.
        :param num_of_seqs_per_scan: despite its name, this is passed to
            ``split_fasta`` as ``num_of_files``; when falsy, five times
            ``threads`` (or ``self.threads``) is used instead.
        :param combine_output_to_single_file: concatenate per-chunk outputs
            into ``outfile`` with ``CGAS.cat``.
        :param async_run: forwarded to ``parallel_execute``.
        :param external_process_pool: forwarded to ``parallel_execute``.
        """
        chunk_dir = FileRoutines.check_path(split_dir)
        hits_dir = FileRoutines.check_path(splited_output_dir)
        self.safe_mkdir(chunk_dir)
        self.safe_mkdir(hits_dir)

        # self.threads is consulted only when neither argument gives a count.
        if num_of_seqs_per_scan:
            chunk_count = num_of_seqs_per_scan
        elif threads:
            chunk_count = 5 * threads
        else:
            chunk_count = 5 * self.threads
        self.split_fasta(seqfile, chunk_dir, num_of_files=chunk_count)

        options_list = []
        out_files = []
        for chunk_name in sorted(os.listdir(chunk_dir)):
            chunk_prefix = FileRoutines.split_filename(chunk_name)[1]
            chunk_path = "%s%s" % (chunk_dir, chunk_name)
            hits_path = "%s%s.hits" % (hits_dir, chunk_prefix)

            option_parts = [" -out %s" % hits_path,
                            " -db %s" % database,
                            " -query %s" % chunk_path]
            if blast_options:
                option_parts.append(" %s" % blast_options)
            if evalue:
                option_parts.append(" -evalue %s" % evalue)
            if output_format:
                option_parts.append(" -outfmt %i" % output_format)

            options_list.append("".join(option_parts))
            out_files.append(hits_path)

        self.parallel_execute(options_list, cmd=blast_command, threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
Beispiel #4
0
def RepeatMasker_search(query_file, species, custom_lib_path=None, RepeatMasker_dir="",
                        num_of_threads=5, search_type="-s"):
    """Run RepeatMasker on ``query_file`` through ``os.system``.

    The command always carries these additional options:
        -xm    creates an additional output file in cross_match format (for parsing)
        -ace   creates an additional output file in ACeDB format
        -gff   creates an additional Gene Feature Finding format
        -excln The percentages displayed in the .tbl file are calculated using a
               total sequence length excluding runs of 25 Ns or more.

    :param query_file: sequence file to mask (last argument of the command).
    :param species: value of ``-species``; see the list of possible species in
        repeatmasker.help coming with RepeatMasker.
    :param custom_lib_path: optional custom repeat library; when truthy it is
        passed to the tool as ``-lib <path>``.
    :param RepeatMasker_dir: directory of the executable; passed through
        ``FileRoutines.check_path`` and prepended to the command.
    :param num_of_threads: value of ``-pa`` (formatted with ``%i``).
    :param search_type: "-s" (sensitive), "" (default), "-q" (fast),
        "-qq" (very fast).
    """
    repmask_dir = FileRoutines.check_path(RepeatMasker_dir)

    # The library option has to be stored in the very variable the command
    # template consumes; otherwise custom_lib_path is silently dropped.
    # NOTE(review): -species is still passed together with -lib; confirm the
    # installed RepeatMasker version accepts both options at once.
    custom_lib = "-lib %s" % custom_lib_path if custom_lib_path else ""

    print("\nRepeatMasker search...\n")
    os.system(repmask_dir + "RepeatMasker -excln -xm -ace -gff %s -pa %i -species %s %s %s"
              % (custom_lib, num_of_threads, species, search_type, query_file))
Beispiel #5
0
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        """Write the protein sequences of selected families to ``.pep`` files.

        Family membership is read from ``fam_file`` (values split on ",").
        Sequences are looked up in ``pep_file`` through a ``SeqIO.index_db``
        index stored in ``tmp.idx`` in the current working directory; that
        index file is deleted when the function finishes, including when
        extraction fails part-way.

        :param families_id_file: file with the family ids to extract; when
            falsy, every family present in ``fam_file`` is extracted.
        :param fam_file: family file read into a ``SynDict``.
        :param pep_file: protein sequence file to index.
        :param output_dir: directory for the results; created with
            ``FileRoutines.safe_mkdir``.
        :param pep_format: sequence format used for both reading and writing.
        :param out_prefix: when truthy, every family gets its own directory and
            the file inside it is named ``<out_prefix>.pep``.
        :param create_dir_for_each_family: put each family into its own
            sub-directory (implied by a truthy ``out_prefix``).
        """
        from RouToolPa.Routines import SequenceRoutines

        fam_id_list = IdList()
        fam_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # A shared prefix would make all families collide in one directory,
        # so a prefix forces the per-family layout.
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        try:
            for fam_id in fam_id_list if families_id_file else fam_dict:
                if fam_id not in fam_dict:
                    print("%s was not found" % fam_id)
                    continue

                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    # Reached only when out_prefix is falsy, so the file is
                    # always named after the family itself.
                    out_file = "%s/%s.pep" % (out_dir, fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
        finally:
            # Remove the temporary index even if extraction raised, so it is
            # not left behind in the working directory.
            os.remove("tmp.idx")
Beispiel #6
0
def make_fasta_dict(fasta_file, dict_name, PICARD_dir=""):
    """Run ``CreateSequenceDictionary.jar`` on ``fasta_file`` through ``os.system``.

    :param fasta_file: reference file passed as the ``R=`` argument.
    :param dict_name: output file passed as the ``O=`` argument.
    :param PICARD_dir: directory containing the jar; passed through
        ``FileRoutines.check_path`` and placed directly before the jar name.
    """
    jar_dir = FileRoutines.check_path(PICARD_dir)
    command = "java -jar %sCreateSequenceDictionary.jar R= %s O= %s" % (
        jar_dir, fasta_file, dict_name)
    os.system(command)
Beispiel #7
0
    def parallel_predict(self,
                         species,
                         genome_file,
                         output,
                         strand="both",
                         gene_model=None,
                         output_gff3=True,
                         other_options="",
                         split_dir="splited_input",
                         splited_output_dir="splited_output_dir",
                         config_dir=None,
                         combine_output_to_single_file=True,
                         use_softmasking=None,
                         hints_file=None,
                         extrinsicCfgFile=None,
                         predict_UTR=None,
                         external_process_pool=None,
                         async_run=False,
                         min_intron_len=None,
                         parsing_mode="parse"):
        """Split ``genome_file`` and run the prediction on every piece.

        The option prefix shared by all jobs comes from ``self.parse_options``
        (called with an empty ``genome_file``, because each job appends its own
        input file).  Every file found in the split directory yields one job
        whose stdout is redirected to ``<file name>.gff`` in the output
        directory; the jobs are run by ``self.parallel_execute``.

        :param species, strand, gene_model, output_gff3, other_options,
            config_dir, use_softmasking, hints_file, extrinsicCfgFile,
            predict_UTR, min_intron_len: forwarded unchanged to
            ``self.parse_options``.
        :param genome_file: file split with ``self.split_fasta_by_seq_len``.
        :param output: target passed to ``CGAS.cat`` when outputs are combined.
        :param split_dir: directory receiving the split input files.
        :param splited_output_dir: directory receiving the per-file outputs.
        :param combine_output_to_single_file: concatenate the per-file outputs
            into ``output`` with ``CGAS.cat``.
        :param external_process_pool: forwarded to ``parallel_execute``.
        :param async_run: forwarded to ``parallel_execute``.
        :param parsing_mode: forwarded to ``split_fasta_by_seq_len``.
        """
        # presumably a plain option string; verify against parse_options
        common_options = self.parse_options(
            species, genome_file="", strand=strand, gene_model=gene_model,
            output_gff3=output_gff3, other_options=other_options,
            config_dir=config_dir, use_softmasking=use_softmasking,
            hints_file=hints_file, extrinsicCfgFile=extrinsicCfgFile,
            predict_UTR=predict_UTR, min_intron_len=min_intron_len)

        chunk_dir = FileRoutines.check_path(split_dir)
        gff_dir = FileRoutines.check_path(splited_output_dir)
        for directory in (chunk_dir, gff_dir):
            FileRoutines.safe_mkdir(directory)

        self.split_fasta_by_seq_len(genome_file, chunk_dir, parsing_mode=parsing_mode)

        chunk_names = sorted(os.listdir(chunk_dir))
        list_of_output_files = ["%s%s.gff" % (gff_dir, name) for name in chunk_names]
        # One job per chunk: shared options, then the input file, then the redirect.
        options_list = [
            common_options + " %s%s" % (chunk_dir, name) + " > %s" % gff_path
            for name, gff_path in zip(chunk_names, list_of_output_files)
        ]

        self.parallel_execute(options_list,
                              external_process_pool=external_process_pool,
                              async_run=async_run)

        if combine_output_to_single_file:
            CGAS.cat(list_of_output_files, output=output)
Beispiel #8
0
def windowmasker_search(windowmasker_dir):
    """Placeholder for a WindowMasker wrapper; not implemented yet.

    Currently it only passes ``windowmasker_dir`` through
    ``FileRoutines.check_path`` and returns ``None``.
    """
    # TODO: write this function
    tool_dir = FileRoutines.check_path(windowmasker_dir)
    return None
Beispiel #9
0
def rmout2gff3(rmoutfile, outfile, RepeatMaskerUtils_dir=""):
    """Run ``rmOutToGFF3.pl`` on ``rmoutfile`` and redirect its stdout to ``outfile``.

    :param rmoutfile: input file given to the script.
    :param outfile: file receiving the script's redirected stdout.
    :param RepeatMaskerUtils_dir: directory of the script; passed through
        ``FileRoutines.check_path`` and prepended to the command.
    """
    utils_prefix = FileRoutines.check_path(RepeatMaskerUtils_dir)
    convert_cmd = "rmOutToGFF3.pl %s > %s" % (rmoutfile, outfile)
    os.system(utils_prefix + convert_cmd)
Beispiel #10
0
def extract_repbase(species, output_file="RepBase.fasta", RepeatMaskerUtils_dir=""):
    """Run ``queryRepeatDatabase.pl`` for ``species`` and save its stdout.

    :param species: value of the script's ``-species`` option.
    :param output_file: file receiving the script's redirected stdout.
    :param RepeatMaskerUtils_dir: directory of the script; passed through
        ``FileRoutines.check_path`` and prepended to the command.
    """
    print("\nExtracting RepBase for %s\n" % species)
    utils_prefix = FileRoutines.check_path(RepeatMaskerUtils_dir)
    query_cmd = "queryRepeatDatabase.pl -species %s > %s" % (species, output_file)
    os.system(utils_prefix + query_cmd)