def nonltr(self):
    """Run the non-LTR program and convert its output to GFF3.

    Two commands are handed to ``self.run_cmd`` in order: ``nonltr.py``
    (with ``--mpi`` appended when ``self.mpi_enabled`` is truthy) and
    ``toGFF.py``. The command strings keep their ``%(name)s`` placeholders;
    expanding them is left to ``run_cmd`` (presumably from the instance
    attributes of the same name -- confirm against ``run_cmd``).

    Side effects: sets ``self.nonltr_out_path`` and
    ``self.nonltr_gff_path`` and prints a start message and a finish
    message with the elapsed wall-clock time in whole seconds.
    """
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('nonltr: starting')
    start = time.time()

    # nonltr
    cmd0 = ("python " + self.base_path + "/nonltr/nonltr.py " +
            "%(genome_dir)s " +
            "%(data_dir)s ")
    if self.mpi_enabled:
        cmd0 = cmd0 + " --mpi=%(mpi_enabled)s"
    self.run_cmd(cmd0)

    # gff3: the two paths are stored on the instance before the command
    # that names them as placeholders is run.
    self.nonltr_out_path = utils.get_abspath(self.data_dir + "/info/full/")
    self.nonltr_gff_path = utils.get_abspath(
        self.data_dir + "/info/nonltr.gff3")
    cmd1 = (self.base_path +
            "/nonltr/toGFF.py %(nonltr_out_path)s %(nonltr_gff_path)s")
    self.run_cmd(cmd1)

    end = time.time()
    print('nonltr: finishing (elapsed time: {0} secs)'.format(
        int(round(end - start))))
def forward_strand(self):
    """Launch the HMM command on every file directly inside ``self.plus_dir``.

    Each file at the top level of ``self.plus_dir`` (sub-directories are not
    descended into) is renamed to the id returned by ``getid`` and handed to
    ``self.cmd_hmm`` in a child process that is added to ``self.processes``.
    After all still-running children have finished,
    ``self.post_processing_after_forward_strand()`` is called.

    A directory that is missing or cannot be listed is treated as containing
    no files.
    """
    out_dir = self.plus_out_dir

    # Only the top level is wanted. os.walk() yields nothing when the
    # directory cannot be listed, so fall back to an empty listing instead
    # of leaving the loop variables unbound.
    dirpath, _, filenames = next(os.walk(self.plus_dir),
                                 (self.plus_dir, [], []))

    for name in filenames:
        file_path = utils.get_abspath(dirpath + "/" + name)
        # Rename to sequence id
        new_path = utils.get_abspath(dirpath + "/" + getid(file_path))
        os.rename(file_path, new_path)

        # Build argv as a list: splitting the finished command string would
        # cut any path that contains whitespace into several arguments.
        command = self.cmd_hmm.split() + [
            "--dna=%s" % new_path,
            "--out=%s" % out_dir,
            "--hmmerv=%s" % self.hmmerv,
        ]
        self.processes.add(Popen(command, stdout=PIPE, stderr=PIPE))

        # Once the set has reached max_processes, pause briefly and forget
        # the children that have already exited. This is one pause per
        # launch, not a wait for a free slot.
        if len(self.processes) >= self.max_processes:
            time.sleep(.1)
            self.processes.difference_update(
                [p for p in self.processes if p.poll() is not None])

    for p in self.processes:
        if p.poll() is None:
            # communicate() drains stdout/stderr while waiting; wait() alone
            # can deadlock once a child fills an unread pipe.
            p.communicate()

    self.post_processing_after_forward_strand()
def nonltr(self):
    """Run the non-LTR program, then turn its results into a GFF3 file.

    ``self.run_cmd`` receives two command strings, first the ``nonltr.py``
    invocation (``--mpi`` is added only when ``self.mpi_enabled`` is truthy)
    and then the ``toGFF.py`` invocation. ``%(name)s`` placeholders are
    passed through unexpanded; ``run_cmd`` is presumably what fills them in
    from same-named instance attributes -- confirm against ``run_cmd``.

    Side effects: assigns ``self.nonltr_out_path`` and
    ``self.nonltr_gff_path``; prints a start line and a finish line that
    reports the elapsed wall-clock time rounded to whole seconds.
    """
    # Written as print(<one argument>) so the output is identical whether
    # print is the Python 2 statement or the Python 3 function.
    print('nonltr: starting')
    started_at = time.time()

    # nonltr
    cmd0 = ("python " + self.base_path + "/nonltr/nonltr.py " +
            "%(genome_dir)s " +
            "%(data_dir)s ")
    if self.mpi_enabled:
        cmd0 += " --mpi=%(mpi_enabled)s"
    self.run_cmd(cmd0)

    # gff3: both paths must be on the instance before the command that
    # refers to them by placeholder is run.
    self.nonltr_out_path = utils.get_abspath(self.data_dir + "/info/full/")
    self.nonltr_gff_path = utils.get_abspath(
        self.data_dir + "/info/nonltr.gff3")
    cmd1 = (self.base_path +
            "/nonltr/toGFF.py %(nonltr_out_path)s %(nonltr_gff_path)s")
    self.run_cmd(cmd1)

    finished_at = time.time()
    print('nonltr: finishing (elapsed time: {0} secs)'.format(
        int(round(finished_at - started_at))))
def backward_strand(self):
    """Launch the HMM command on every file directly inside ``self.minus_dir``.

    Each file at the top level of ``self.minus_dir`` (sub-directories are
    not descended into) is renamed to the id returned by ``getid`` and
    handed to ``self.cmd_hmm`` in a child process that is added to
    ``self.processes``. After all still-running children have finished,
    ``self.post_processing_after_reverse_strand()`` is called.

    A directory that is missing or cannot be listed is treated as containing
    no files.
    """
    out_dir = self.minus_out_dir

    # Only the top level is wanted. os.walk() yields nothing when the
    # directory cannot be listed, so fall back to an empty listing instead
    # of leaving the loop variables unbound.
    dirpath, _, filenames = next(os.walk(self.minus_dir),
                                 (self.minus_dir, [], []))

    for name in filenames:
        file_path = utils.get_abspath(dirpath + "/" + name)
        # Rename to sequence id
        new_path = utils.get_abspath(dirpath + "/" + getid(file_path))
        os.rename(file_path, new_path)

        # Build argv as a list: splitting the finished command string would
        # cut any path that contains whitespace into several arguments.
        command = self.cmd_hmm.split() + [
            "--dna=%s" % new_path,
            "--out=%s" % out_dir,
            "--hmmerv=%s" % self.hmmerv,
        ]
        self.processes.add(Popen(command, stdout=PIPE, stderr=PIPE))

        # Once the set has reached max_processes, pause briefly and forget
        # the children that have already exited. This is one pause per
        # launch, not a wait for a free slot.
        if len(self.processes) >= self.max_processes:
            time.sleep(.1)
            self.processes.difference_update(
                [p for p in self.processes if p.poll() is not None])

    for p in self.processes:
        if p.poll() is None:
            # communicate() drains stdout/stderr while waiting; wait() alone
            # can deadlock once a child fills an unread pipe.
            p.communicate()

    self.post_processing_after_reverse_strand()
def post_processing(self, out_dir, dir, reverse_yn):
    """Remove four fixed-name files under ``out_dir/out1`` and run the
    post-processing command.

    Args:
        out_dir: output directory; a trailing "/" is optional. Also passed
            to the command as ``--out``.
        dir: value passed to the command as ``--dna``. The name shadows the
            builtin but is kept because it is part of the signature.
        reverse_yn: value passed to the command as ``--rev``.
    """
    # Always put exactly one "/" between out_dir and "out1". Previously only
    # the first of the four paths had a separator, so the other three pointed
    # at "<out_dir>out1/..." whenever out_dir had no trailing slash.
    base = out_dir if out_dir.endswith("/") else out_dir + "/"
    for leftover in ("aaaaa", "bbbbb", "ppppp", "qqqqq"):
        utils.silentremove(utils.get_abspath(base + "out1/" + leftover))

    cmd = self.cmd_post_process + (
        " --dna=%s --out=%s --rev=%s" % (dir, out_dir, reverse_yn))
    self.run_cmd(cmd)
def set_inputs(self):
    """Copy the parsed command-line values from ``self.args`` onto the instance.

    Sets ``data_dir`` and ``genome_dir`` (both passed through
    ``utils.get_abspath``), ``ltr_enabled``, ``nonltr_enabled``,
    ``mpi_enabled`` and ``debug``.

    When ``--mpi`` is given and neither ``ltr`` nor ``nonltr`` is selected,
    ``mpi_enabled`` becomes half of ``--mpi`` rounded up, stored as a string
    (presumably because both programs then share the processes -- confirm
    with the caller).
    """
    args = self.args

    self.data_dir = utils.get_abspath(args['--output'])
    self.genome_dir = utils.get_abspath(args['<genome_dir>'])
    self.ltr_enabled = args['ltr']
    self.nonltr_enabled = args['nonltr']
    self.mpi_enabled = args['--mpi']

    program_selected = self.ltr_enabled or self.nonltr_enabled
    if self.mpi_enabled and not program_selected:
        half_rounded_up = int(math.ceil(int(args['--mpi']) / 2.0))
        self.mpi_enabled = str(half_rounded_up)

    self.debug = args['--debug']
def toGFF(self):
    """Move the ``info`` directory into the data directory and build the GFF3 file.

    Does nothing unless ``self.gff3_enabled`` is truthy. Otherwise
    ``<genome_dir>/info`` is moved into ``self.data_dir``, the result and
    GFF3 paths are stored on the instance, and ``self.cmd_togff`` is handed
    to ``self.run_cmd`` with placeholders naming those two paths.
    """
    if not self.gff3_enabled:
        return

    # Assume info is the only directory in genome_dir
    info_dir = self.genome_dir + "/info"
    shutil.move(info_dir, self.data_dir)

    # gff3
    full_dir = self.data_dir + "/info/full/"
    gff_file = self.data_dir + "/info/nonltr.gff3"
    self.nonltr_out_path = utils.get_abspath(full_dir)
    self.nonltr_gff_path = utils.get_abspath(gff_file)

    self.run_cmd(
        self.cmd_togff + " %(nonltr_out_path)s %(nonltr_gff_path)s")
def set_inputs(self):
    """Populate instance settings from the parsed arguments in ``self.args``.

    ``data_dir`` and ``genome_dir`` are made absolute with
    ``utils.get_abspath``; ``ltr_enabled``, ``nonltr_enabled``,
    ``mpi_enabled`` and ``debug`` are copied as-is.

    If ``--mpi`` is truthy while both ``ltr`` and ``nonltr`` are falsy,
    ``mpi_enabled`` is replaced by the string form of ``ceil(--mpi / 2)``.
    """
    options = self.args

    self.data_dir = utils.get_abspath(options['--output'])
    self.genome_dir = utils.get_abspath(options['<genome_dir>'])
    self.ltr_enabled = options['ltr']
    self.nonltr_enabled = options['nonltr']
    self.mpi_enabled = options['--mpi']
    self.debug = options['--debug']

    if not self.mpi_enabled:
        return
    if self.ltr_enabled or self.nonltr_enabled:
        return

    # Neither program was picked explicitly: use half the requested
    # process count, rounded up.
    requested = int(options['--mpi'])
    self.mpi_enabled = str(int(math.ceil(requested / 2.0)))
def set_defaults(self):
    """Prepare the output directory and install the default run parameters.

    ``self.data_dir`` is replaced by the return value of
    ``utils.create_directory``: called on the current value (with ``False``
    as second argument) when one is set, otherwise on the absolute form of
    ``self.default_output_path``.

    LTR parameters:
        min_dist: minimum distance (bp) between LTRs.
        max_dist: maximum distance (bp) between LTRs.
        min_len_ltr: minimum length (bp) of an LTR.
        max_len_ltr: maximum length (bp) of an LTR.
        ltr_sim_condition: minimum similarity (%) for LTRs in an element.
        cluster_sim_condition: minimum similarity (%) for LTRs in a cluster.
        len_condition: minimum length (bp) for LTRs aligned in local
            alignment.
    """
    if not self.data_dir:
        fallback = utils.get_abspath(self.default_output_path)
        self.data_dir = utils.create_directory(fallback)
    else:
        self.data_dir = utils.create_directory(self.data_dir, False)

    defaults = (
        ("hmmerv", 3),
        ("min_dist", 2000),
        ("max_dist", 20000),
        ("min_len_ltr", 130),
        ("max_len_ltr", 2000),
        ("ltr_sim_condition", 70),
        ("cluster_sim_condition", 70),
        ("len_condition", 70),
        ("sw_rm", "No"),  # or Yes
        ("scaffold", ""),  # or directory
    )
    for attribute, value in defaults:
        setattr(self, attribute, value)
def reverse_complement(self, directory):
    """Reverse-complement every file directly inside ``self.genome_path``.

    ``directory`` is passed to ``utils.create_directory`` (with ``False`` as
    second argument) and then each top-level file of ``self.genome_path`` is
    handed to ``reverse_complement_fasta`` together with ``directory``.
    Sub-directories are not descended into.

    A genome path that is missing or cannot be listed is treated as
    containing no files.

    Args:
        directory: destination passed on to ``reverse_complement_fasta``.
    """
    # Only the top level is wanted. os.walk() yields nothing when the
    # directory cannot be listed, so fall back to an empty listing instead
    # of leaving the loop variables unbound.
    dirpath, _, filenames = next(os.walk(self.genome_path),
                                 (self.genome_path, [], []))

    utils.create_directory(directory, False)

    for name in filenames:
        file_path = utils.get_abspath(dirpath + "/" + name)
        reverse_complement_fasta(file_path, directory)
def reverse_complement(self):
    """Reverse-complement every file directly inside ``self.genome_dir``.

    ``self.minus_dir`` is created if needed, then each top-level file of
    ``self.genome_dir`` is handed to ``reverse_complement_fasta`` together
    with ``self.minus_dir``. Sub-directories are not descended into.

    A genome directory that is missing or cannot be listed is treated as
    containing no files.

    Raises:
        OSError: if ``self.minus_dir`` cannot be created and is not already
            a directory.
    """
    # Only the top level is wanted. os.walk() yields nothing when the
    # directory cannot be listed, so fall back to an empty listing instead
    # of leaving the loop variables unbound.
    dirpath, _, filenames = next(os.walk(self.genome_dir),
                                 (self.genome_dir, [], []))

    directory = self.minus_dir
    # Create first and inspect the failure afterwards: checking for
    # existence beforehand races with anything else creating the directory.
    try:
        os.makedirs(directory)
    except OSError:
        if not os.path.isdir(directory):
            raise

    for name in filenames:
        file_path = utils.get_abspath(dirpath + "/" + name)
        reverse_complement_fasta(file_path, directory)
def ltr(self):
    """Run the LTR programs and convert the result to GFF3.

    Three commands are handed to ``self.run_cmd`` in order:
    ``pre_process.pl``, ``find_ltr.pl`` (with ``-mpi`` appended when
    ``self.mpi_enabled`` is truthy) and ``toGFF.py``. The command strings
    keep their ``%(name)s`` placeholders; expanding them is left to
    ``run_cmd`` (presumably from the instance attributes of the same name
    -- confirm against ``run_cmd``).

    Side effects: sets ``self.ltr_out_path`` and ``self.ltr_gff_path`` and
    prints a start message and a finish message with the elapsed wall-clock
    time in whole seconds.
    """
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('ltr: starting')
    start = time.time()

    # scaffold
    # repeatmasker
    # Options are joined with single spaces. Backslash continuations inside
    # the literal would embed the source indentation in the command.
    cmd0 = self.base_path + (
        "/ltr/pre_process.pl"
        " -genome=%(genome_dir)s"
        " -data=%(data_dir)s"
        " -sw_rm=%(sw_rm)s"
        " -scaffold=%(scaffold)s")
    self.run_cmd(cmd0)

    # find-ltr
    cmd1 = self.base_path + (
        "/ltr/find_ltr.pl"
        " -genome=%(genome_dir)s"
        " -data=%(data_dir)s"
        " -hmmerv=%(hmmerv)s"
        " -min_dist=%(min_dist)s"
        " -max_dist=%(max_dist)s"
        " -min_len_ltr=%(min_len_ltr)s"
        " -max_len_ltr=%(max_len_ltr)s"
        " -ltr_sim_condition=%(ltr_sim_condition)s"
        " -cluster_sim_condition=%(cluster_sim_condition)s"
        " -len_condition=%(len_condition)s")
    if self.mpi_enabled:
        cmd1 = cmd1 + " -mpi=%(mpi_enabled)s"
    self.run_cmd(cmd1)

    # gff3: the two paths are stored on the instance before the command
    # that names them as placeholders is run.
    self.ltr_out_path = utils.get_abspath(self.data_dir + "/ltr/ltr.out")
    self.ltr_gff_path = utils.get_abspath(self.data_dir + "/ltr/ltr.gff3")
    cmd2 = self.base_path + "/ltr/toGFF.py %(ltr_out_path)s %(ltr_gff_path)s"
    self.run_cmd(cmd2)

    end = time.time()
    print('ltr: finishing (elapsed time: {0} secs)'.format(
        int(round(end - start))))
def ltr(self):
    """Run the LTR programs, then turn their result into a GFF3 file.

    ``self.run_cmd`` receives three command strings in order:
    ``pre_process.pl``, ``find_ltr.pl`` (``-mpi`` is added only when
    ``self.mpi_enabled`` is truthy) and ``toGFF.py``. ``%(name)s``
    placeholders are passed through unexpanded; ``run_cmd`` is presumably
    what fills them in from same-named instance attributes -- confirm
    against ``run_cmd``.

    Side effects: assigns ``self.ltr_out_path`` and ``self.ltr_gff_path``;
    prints a start line and a finish line that reports the elapsed
    wall-clock time rounded to whole seconds.
    """
    # Written as print(<one argument>) so the output is identical whether
    # print is the Python 2 statement or the Python 3 function.
    print('ltr: starting')
    started_at = time.time()

    # scaffold
    # repeatmasker
    # Each option is its own literal joined by one space, so no source
    # indentation leaks into the command line (as it does with backslash
    # continuations inside a string literal).
    pre_process_options = (
        " -genome=%(genome_dir)s"
        " -data=%(data_dir)s"
        " -sw_rm=%(sw_rm)s"
        " -scaffold=%(scaffold)s")
    self.run_cmd(self.base_path + "/ltr/pre_process.pl" + pre_process_options)

    # find-ltr
    find_ltr_options = (
        " -genome=%(genome_dir)s"
        " -data=%(data_dir)s"
        " -hmmerv=%(hmmerv)s"
        " -min_dist=%(min_dist)s"
        " -max_dist=%(max_dist)s"
        " -min_len_ltr=%(min_len_ltr)s"
        " -max_len_ltr=%(max_len_ltr)s"
        " -ltr_sim_condition=%(ltr_sim_condition)s"
        " -cluster_sim_condition=%(cluster_sim_condition)s"
        " -len_condition=%(len_condition)s")
    if self.mpi_enabled:
        find_ltr_options += " -mpi=%(mpi_enabled)s"
    self.run_cmd(self.base_path + "/ltr/find_ltr.pl" + find_ltr_options)

    # gff3: both paths must be on the instance before the command that
    # refers to them by placeholder is run.
    self.ltr_out_path = utils.get_abspath(self.data_dir + "/ltr/ltr.out")
    self.ltr_gff_path = utils.get_abspath(self.data_dir + "/ltr/ltr.gff3")
    self.run_cmd(
        self.base_path + "/ltr/toGFF.py %(ltr_out_path)s %(ltr_gff_path)s")

    finished_at = time.time()
    print('ltr: finishing (elapsed time: {0} secs)'.format(
        int(round(finished_at - started_at))))
def set_input(self, path):
    """Store ``path``, passed through ``utils.get_abspath``, as ``self.input_file``.

    Args:
        path: location of the input file.

    Returns:
        The value stored in ``self.input_file``.
    """
    resolved = utils.get_abspath(path)
    self.input_file = resolved
    return self.input_file
def set_inputs(self):
    """Record the output and genome directories from ``self.args``.

    ``--output`` becomes ``self.data_dir`` and ``<genome_dir>`` becomes
    ``self.genome_dir``; both are passed through ``utils.get_abspath``.
    """
    args = self.args
    output_arg = args['--output']
    self.data_dir = utils.get_abspath(output_arg)
    genome_arg = args['<genome_dir>']
    self.genome_dir = utils.get_abspath(genome_arg)
def set_output(self, path):
    """Store ``path``, passed through ``utils.get_abspath``, as ``self.result_path``.

    Args:
        path: location where results are written.

    Returns:
        The value stored in ``self.result_path``.
    """
    resolved = utils.get_abspath(path)
    self.result_path = resolved
    return self.result_path
def set_inputs(self):
    """Parse the command line with docopt and record the resulting settings.

    The usage text is this object's ``__doc__`` and the version string is
    ``self.ver``. The parsed mapping is kept as ``self.args``; ``--mpi`` is
    stored as ``self.nmpi``, and ``<input>`` / ``<output>`` are passed
    through ``utils.get_abspath`` and stored as ``self.genome_path`` /
    ``self.output_path``.
    """
    parsed = docopt(self.__doc__, version=self.ver)
    self.args = parsed

    self.nmpi = self.args['--mpi']
    input_arg = self.args['<input>']
    self.genome_path = utils.get_abspath(input_arg)
    output_arg = self.args['<output>']
    self.output_path = utils.get_abspath(output_arg)
def set_datadir(self, path):
    """Store ``path``, passed through ``utils.get_abspath``, as ``self.datadir``.

    Args:
        path: location of the data directory.

    Returns:
        The value stored in ``self.datadir``.
    """
    resolved = utils.get_abspath(path)
    self.datadir = resolved
    return self.datadir