import os

from Bio import SeqIO
from RouToolPa.Routines import FileRoutines
from RouToolPa.Collections.General import IdList, SynDict
# CGAS provides the cat() helper used to merge per-chunk outputs; import path assumed
from RouToolPa.Tools.LinuxTools import CGAS


def RepeatModeler_search(query_file, db_name, output_file="run.out", num_of_threads=5, RepeatModeler_dir=""):
    print("\nRepeatModeler search...\n")
    repmod_dir = FileRoutines.check_path(RepeatModeler_dir)
    # build the sequence database required by RepeatModeler, then run the de novo repeat search
    os.system(repmod_dir + "BuildDatabase -engine ncbi -name %s %s" % (db_name, query_file))
    os.system(repmod_dir + "RepeatModeler -engine ncbi -pa %i -database %s > %s" % (num_of_threads,
                                                                                    db_name,
                                                                                    output_file))
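# Example usage (hypothetical file names and install path; adjust to your setup):
#   RepeatModeler_search("genome.fasta", "genome_db", output_file="repeatmodeler.out",
#                        num_of_threads=8, RepeatModeler_dir="/opt/RepeatModeler/")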
def TRF_search(query_file, match=2, mismatch=7, delta=7, PM=80, PI=10, minscore=50, max_period=500,
               flanked=False, TRF_dir=""):
    print("\nTRF search...\n")
    # use: trf File Match Mismatch Delta PM PI Minscore MaxPeriod [options]
    # Where: (all weights, penalties, and scores are positive)
    #   File = sequences input file
    #   Match = matching weight
    #   Mismatch = mismatching penalty
    #   Delta = indel penalty
    #   PM = match probability (whole number)
    #   PI = indel probability (whole number)
    #   Minscore = minimum alignment score to report
    #   MaxPeriod = maximum period size to report
    #   [options] = one or more of the following:
    #     -m  masked sequence file
    #     -f  flanking sequence
    #     -d  data file
    #     -h  suppress HTML output
    # Recommended options: trf yoursequence.txt 2 7 7 80 10 50 500 -f -d -m
    flanking = ""
    if flanked:
        flanking = "-f"
    trf_path = FileRoutines.check_path(TRF_dir)
    os.system(trf_path + "trf %s %i %i %i %i %i %i %i %s -d -m" % (query_file, match, mismatch, delta,
                                                                   PM, PI, minscore, max_period, flanking))
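# Example usage (hypothetical input file; the defaults mirror the recommended parameter set above):
#   TRF_search("genome.fasta", flanked=True, TRF_dir="/usr/local/bin/")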
def parallel_blast(self, blast_command, seqfile, database, outfile=None, blast_options=None,
                   split_dir="splited_fasta", splited_output_dir="splited_output_dir",
                   evalue=None, output_format=None, threads=None, num_of_seqs_per_scan=None,
                   combine_output_to_single_file=True, async_run=False, external_process_pool=None):
    splited_dir = FileRoutines.check_path(split_dir)
    splited_out_dir = FileRoutines.check_path(splited_output_dir)
    self.safe_mkdir(splited_dir)
    self.safe_mkdir(splited_out_dir)
    # split the input FASTA into chunks so each chunk can be searched by a separate BLAST process
    number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan else (5 * threads if threads else 5 * self.threads)
    self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
    input_list_of_files = sorted(os.listdir(splited_dir))

    list_of_files = []
    for filename in input_list_of_files:
        filename_prefix = FileRoutines.split_filename(filename)[1]
        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
        list_of_files.append((input_file, output_file))

    # assemble a per-chunk option string for the chosen BLAST command
    options_list = []
    out_files = []
    for in_file, out_filename in list_of_files:
        options = " -out %s" % out_filename
        options += " -db %s" % database
        options += " -query %s" % in_file
        options += (" %s" % blast_options) if blast_options else ""
        options += (" -evalue %s" % evalue) if evalue else ""
        options += (" -outfmt %i" % output_format) if output_format else ""
        options_list.append(options)
        out_files.append(out_filename)

    self.parallel_execute(options_list, cmd=blast_command, threads=threads, async_run=async_run,
                          external_process_pool=external_process_pool)

    if combine_output_to_single_file:
        CGAS.cat(out_files, output=outfile)
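# Example usage (hypothetical names; assumes self is a routines instance exposing split_fasta,
# safe_mkdir and parallel_execute, as used above):
#   tool.parallel_blast("blastp", "proteins.fasta", "nr_db", outfile="hits.tab",
#                       evalue="1e-5", output_format=6, threads=16)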
def RepeatMasker_search(query_file, species, custom_lib_path=None, RepeatMasker_dir="",
                        num_of_threads=5, search_type="-s"):
    # species: see list of possible species in repeatmasker.help coming with RepeatMasker
    # search type: "-s" (sensitive), "" (default), "-q" (fast), "-qq" (very fast)
    repmask_dir = FileRoutines.check_path(RepeatMasker_dir)
    custom_lib = ""
    if custom_lib_path:
        custom_lib = "-lib %s" % custom_lib_path
    # additional options:
    #   -xm     creates an additional output file in cross_match format (for parsing)
    #   -ace    creates an additional output file in ACeDB format
    #   -gff    creates an additional Gene Feature Finding format
    #   -excln  the percentages displayed in the .tbl file are calculated using a
    #           total sequence length excluding runs of 25 Ns or more
    print("\nRepeatMasker search...\n")
    os.system(repmask_dir + "RepeatMasker -excln -xm -ace -gff %s -pa %i -species %s %s %s" % (custom_lib,
                                                                                               num_of_threads,
                                                                                               species,
                                                                                               search_type,
                                                                                               query_file))
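# Example usage (hypothetical paths; supplements the RepBase species set with a custom repeat library):
#   RepeatMasker_search("genome.fasta", "mammals", custom_lib_path="custom_repeats.fasta",
#                       num_of_threads=8, search_type="-s")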
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file, output_dir="./",
                                            pep_format="fasta", out_prefix=None,
                                            create_dir_for_each_family=False):
    from RouToolPa.Routines import SequenceRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    # if a common out_prefix is used, every family must get its own directory to avoid name collisions
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in (fam_id_list if families_id_file else fam_dict):
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)
            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id], verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)
    os.remove("tmp.idx")
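# Example usage (hypothetical files; fam_file maps family ids to comma-separated protein ids,
# families_id_file lists one family id per line):
#   extract_proteins_from_selected_families("selected_families.ids", "families.fam", "proteins.pep",
#                                           output_dir="families_pep/", create_dir_for_each_family=True)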
def make_fasta_dict(fasta_file, dict_name, PICARD_dir=""):
    picard_dir = FileRoutines.check_path(PICARD_dir)
    # Picard expects KEY=VALUE arguments without spaces around "="
    os.system("java -jar %sCreateSequenceDictionary.jar R=%s O=%s" % (picard_dir, fasta_file, dict_name))
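# Example usage (hypothetical paths; PICARD_dir must contain CreateSequenceDictionary.jar):
#   make_fasta_dict("reference.fasta", "reference.dict", PICARD_dir="/opt/picard/")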
def parallel_predict(self, species, genome_file, output, strand="both", gene_model=None, output_gff3=True,
                     other_options="", split_dir="splited_input", splited_output_dir="splited_output_dir",
                     config_dir=None, combine_output_to_single_file=True, use_softmasking=None,
                     hints_file=None, extrinsicCfgFile=None, predict_UTR=None, external_process_pool=None,
                     async_run=False, min_intron_len=None, parsing_mode="parse"):
    common_options = self.parse_options(species, genome_file="", strand=strand, gene_model=gene_model,
                                        output_gff3=output_gff3, other_options=other_options,
                                        config_dir=config_dir, use_softmasking=use_softmasking,
                                        hints_file=hints_file, extrinsicCfgFile=extrinsicCfgFile,
                                        predict_UTR=predict_UTR, min_intron_len=min_intron_len)

    splited_dir = FileRoutines.check_path(split_dir)
    splited_out_dir = FileRoutines.check_path(splited_output_dir)
    FileRoutines.safe_mkdir(splited_dir)
    FileRoutines.safe_mkdir(splited_out_dir)

    # split the genome by sequence length so each chunk is predicted in a separate process
    self.split_fasta_by_seq_len(genome_file, splited_dir, parsing_mode=parsing_mode)

    input_list_of_files = sorted(os.listdir(splited_dir))
    list_of_output_files = []
    options_list = []
    for filename in input_list_of_files:
        input_file = "%s%s" % (splited_dir, filename)
        output_file = "%s%s.gff" % (splited_out_dir, filename)
        list_of_output_files.append(output_file)
        options = common_options
        options += " %s" % input_file
        options += " > %s" % output_file
        options_list.append(options)

    self.parallel_execute(options_list, external_process_pool=external_process_pool, async_run=async_run)

    if combine_output_to_single_file:
        CGAS.cat(list_of_output_files, output=output)
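# Example usage (hypothetical names; assumes self is an AUGUSTUS-style predictor wrapper providing
# parse_options, split_fasta_by_seq_len and parallel_execute, as used above):
#   predictor.parallel_predict("human", "genome.fasta", "predictions.gff",
#                              hints_file="hints.gff", predict_UTR=True)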
def windowmasker_search(windowmasker_dir):
    winmask_dir = FileRoutines.check_path(windowmasker_dir)
    # TODO: write this function
    pass
def rmout2gff3(rmoutfile, outfile, RepeatMaskerUtils_dir=""):
    repmaskutils_dir = FileRoutines.check_path(RepeatMaskerUtils_dir)
    os.system(repmaskutils_dir + "rmOutToGFF3.pl %s > %s" % (rmoutfile, outfile))
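# Example usage (hypothetical paths; rmOutToGFF3.pl ships in the RepeatMasker util/ directory):
#   rmout2gff3("genome.fasta.out", "genome_repeats.gff3", RepeatMaskerUtils_dir="/opt/RepeatMasker/util/")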
def extract_repbase(species, output_file="RepBase.fasta", RepeatMaskerUtils_dir=""):
    print("\nExtracting RepBase for %s\n" % species)
    repmaskutils_dir = FileRoutines.check_path(RepeatMaskerUtils_dir)
    os.system(repmaskutils_dir + "queryRepeatDatabase.pl -species %s > %s" % (species, output_file))
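# Example usage (hypothetical species and paths; queryRepeatDatabase.pl ships in the RepeatMasker util/ directory):
#   extract_repbase("canis", output_file="canis_repbase.fasta", RepeatMaskerUtils_dir="/opt/RepeatMasker/util/")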