def _save_aln_results(self, save_dir=None):
    '''
    Parse the blast result file and save one alignment file per
    iteration into save_dir.

    If neither the save_dir argument nor self.save_dir is set, this
    method performs no operation.  Populates self.msas and
    self.aln_files keyed by 1-based iteration number.
    '''
    if self.save_dir is None and save_dir is None:
        return
    elif save_dir is None:
        save_dir = self.save_dir

    blast = self.parse()
    self.msas = {}
    for i, msa in enumerate(blast):
        iteration = i + 1
        # only the base name of the input sequence is needed to build
        # the per-iteration profile file name
        basename = parse_sequence_filename(self.input)[1]
        profile_name = build_profile_filename(save_dir, basename,
                                              iteration, '.aln')
        # 'with' guarantees the handle is closed even if the
        # alignment writer raises
        with open(profile_name, 'w') as profile_fp:
            msa.build_psiblast_alignment_input(profile_fp)
        self.aln_files[iteration] = profile_name
        self.msas[iteration] = msa
def __init__(self, input_file=None, output_file=None):
    '''
    input_file: profile input file name (required).
    output_file: destination file name; when omitted it is derived
        from input_file with the extension replaced by ".pnp".

    Raises TypeError when input_file is not given.
    '''
    if input_file is None:
        raise TypeError("No Input file is given.")
    if output_file is None:
        # derive <dir>/<base>.<iteration>.pnp from the input name
        directory, base_filename, iteration, _ext = \
            parse_profile_filename(input_file)
        output_file = build_profile_filename(directory, base_filename,
                                             iteration, ".pnp")
    self.input_file = input_file
    self.output_file = output_file
def initial_iteration(self, echo=True):
    '''
    Run the initial blast iteration and prepare its output.

    The first pass purges overlaps with the BLOSUM62 matrix because
    no PSSM exists yet.  Saves the MSA alignment file, records the
    runner/parser/iteration state on self, and returns the MSA.
    '''
    # run blast
    temp_output = os.path.join(
        self.temp_dir, '%s.1%s' % (self.output_base, self.output_ext))
    runner = BLASTRunner(input=self.processed_input,
                         output=temp_output, **self.kwargs)
    runner.run()

    # parse blast output; the last record is the final round
    blast = BLAST.BLAST(self.processed_input, runner.output)
    msa = blast[-1]
    # flagging for combining HSPs for B input alignment
    msa.set_combine_hsps()

    # purge the result
    if self.use_overlapping_purging:
        if echo:
            print('purging overlapping regions...')
        # build pssm: for the initial iteration, the BLOSUM62 matrix
        # is used since no PSSM is available yet
        pssm = ScoreMat()
        pssm.set_blosum_mat()
        # start to purge the matrix
        if self.number_of_processors > 1:
            msa.purge_overlapping_hsps_multithreading(
                self.inserted_positions, pssm, self.number_of_processors)
        else:
            msa.purge_overlapping_hsps(self.inserted_positions, pssm)

    if self.use_backblast_purging:
        for neighbor_msa in self.neighboring_msas:
            # NOTE(review): the purger appears to do its work in the
            # constructor; the instance is not used afterwards —
            # confirm against BackblastPurger's implementation.
            BackblastPurger(msa, neighbor_msa, **self.kwargs)

    msa_output = build_profile_filename(self.temp_dir, self.output_base,
                                        1, '.aln')
    # 'with' ensures the alignment file is closed even on failure
    with open(msa_output, 'w') as msa_output_fp:
        msa.build_psiblast_alignment_input(msa_output_fp)

    self.current_runner = runner
    self.current_msa_output = msa_output
    self.current_iteration = 1
    self.current_parser = blast
    self.msa_files.append(msa_output)
    return msa
def __init__(self, cmd=None, input_file=None, output_file=None,
             calibration_db=None, calibrate=True, calibration_cmd=None):
    '''
    cmd: hhmake executable; defaulted from Settings "hhmake".
    input_file: alignment input file name (required).
    output_file: .hhm output; derived from input_file when omitted.
    calibration_db / calibration_cmd: hhsearch calibration database
        and command, defaulted from Settings when omitted.
    calibrate: whether calibration should be performed.

    Raises TypeError when input_file is missing.
    '''
    if cmd is None:
        cmd = Settings.get("hhmake")
    if input_file is None:
        raise TypeError("Input_file should be given.")
    if calibration_cmd is None:
        calibration_cmd = Settings.get("hhsearch_cmd")
    if calibration_db is None:
        calibration_db = Settings.get("hhm_cal_db")
    if output_file is None:
        directory, base_filename, iteration, _ext = \
            parse_profile_filename(input_file)
        output_file = build_profile_filename(directory, base_filename,
                                             iteration, '.hhm')

    self.cmd = cmd
    self.calibration_cmd = calibration_cmd
    self.calibration_db = calibration_db
    self.calibrate = calibrate
    self.input_file = input_file
    self.output_file = output_file

    # the following part works around an hhsearch bug: it cannot
    # handle long input file names, so the input is copied to a
    # short temporary name; fall back to the original path when
    # the copy fails.
    self.tmpinput = tempfile.NamedTemporaryFile()
    self.tmpinputname = self.tmpinput.name
    try:
        shutil.copy(self.input_file, self.tmpinputname)
    except IOError:
        self.tmpinputname = self.input_file

    self.tmpoutput = tempfile.NamedTemporaryFile()
    # need to be copied back after the execution!
    self.tmpoutputname = self.tmpoutput.name

    self.command_lines = self.get_command_lines()
def next_iteration(self, echo=True):
    '''
    Run one further blast iteration.

    Uses the previous iteration's alignment as the PSI-BLAST input
    alignment and a PSSM built from the previous parse for overlap
    purging.  Updates the current_* state on self and returns the
    new MSA.
    '''
    self.current_iteration += 1
    self.previous_runner = self.current_runner
    self.previous_msa_output = self.current_msa_output
    self.previous_parser = self.current_parser
    i = self.current_iteration

    temp_output = os.path.join(
        self.temp_dir,
        '%s.%s%s' % (self.output_base, str(i), self.output_ext))
    runner = BLASTRunner(input=self.processed_input, output=temp_output,
                         input_alignment=self.previous_msa_output,
                         **self.kwargs)
    runner.run()

    blast = BLAST.BLAST(self.processed_input, runner.output)
    msa = blast[-1]
    msa.set_combine_hsps()

    if self.use_overlapping_purging:
        if echo:
            print('purging overlapping regions...')
        # from the second iteration on, purge with a PSSM built from
        # the previous round instead of BLOSUM62
        pssm = ScoreMat()
        pssm.build_pssm(self.previous_parser[-1], **self.kwargs)
        msa.purge_overlapping_hsps(self.inserted_positions, pssm)

    if self.use_backblast_purging:
        for neighbor_msa in self.neighboring_msas:
            # NOTE(review): purging appears to happen in the
            # constructor; the instance is not used afterwards —
            # confirm against BackblastPurger's implementation.
            BackblastPurger(msa, neighbor_msa)

    # make msa output; 'with' ensures the file is closed on failure
    msa_output = build_profile_filename(self.temp_dir, self.output_base,
                                        i, '.aln')
    with open(msa_output, 'w') as msa_output_fp:
        msa.build_psiblast_alignment_input(msa_output_fp)

    # finally record the new state
    self.current_runner = runner
    self.current_msa_output = msa_output
    self.current_parser = blast
    self.msa_files.append(msa_output)
    return msa
def set_result_files(self):
    '''
    Collect the a3m files produced in save_dir, rename them so their
    base name matches the query sequence's base name, and store the
    sorted list on self.a3m_files.
    '''
    basename = parse_profile_filename(self.sequence)[1]
    if not basename:
        return

    a3m_files = glob.glob(
        os.path.join(self.save_dir, basename + '*.a3m'))

    new_a3m_files = []
    for a3m in a3m_files:
        # keep only the iteration/extension of the found file; the
        # base name is normalized to the query's base name
        directory, _basename2, iteration, ext = \
            parse_profile_filename(a3m)
        new_a3m = build_profile_filename(directory, basename,
                                         iteration, ext)
        shutil.move(a3m, new_a3m)
        new_a3m_files.append(new_a3m)

    self.a3m_files = new_a3m_files
    self.a3m_files.sort()
def get_command_line(self):
    '''
    Build the reformat command line.

    Defaults input_type to "a3m" and output_type to "psi" when they
    are unset, and derives output_file from input_file when it is
    unset.  Returns the command line as a list of strings.
    '''
    if self.input_type is None:
        self.input_type = 'a3m'
    if self.output_type is None:
        self.output_type = 'psi'
    if self.output_file is None:
        directory, base_filename, iteration, _ext = \
            parse_profile_filename(self.input_file)
        self.output_file = build_profile_filename(
            directory, base_filename, iteration,
            '.' + self.output_type)

    return [self.cmd, self.input_type, self.output_type,
            self.input_file, self.output_file]
def get_command_line(self):
    '''
    Build the COMPASS profile-builder command line.

    Derives output_file from input_file when it is unset, directs
    the builder output to a NamedTemporaryFile (kept open on self so
    the file survives until it is copied), and returns the command
    line as a list of strings.
    '''
    if self.output_file is None:
        directory, base_filename, iteration, ext = \
            parse_profile_filename(self.input_file)
        if verbose:
            print("COMPASS Builder file analysis")
            print('input_file', self.input_file)
            print("dir:", directory)
            print("base_filename:", base_filename)
            print("iteration:", iteration)
            print("ext:", ext)
        self.output_file = build_profile_filename(directory,
                                                  base_filename,
                                                  iteration, '.cnp')

    # keep the file object on self: letting it be garbage-collected
    # would delete the temporary file
    self.temporary_output_fp = tempfile.NamedTemporaryFile()
    self.temporary_output_file = self.temporary_output_fp.name

    list_fn = self.prepare_list_file()

    return [self.cmd, '-i', list_fn, '-o', self.temporary_output_file]
def build_file(self, dominfo):
    '''
    Write a profile-search job script for the domain described by
    dominfo and return the script's file name.

    Raises JobScriptWriteError when the script file already exists.
    '''
    from evdblib.DBTools import Settings

    filename = os.path.join(self.job_dir,
                            'prfs%s.job' % dominfo['uniqueid'])

    if not self.cmd:
        self.cmd = Settings.get('profile_searcher')

    domain_path = dominfo['domain_path']
    queryid = dominfo['uniqueid']
    # never ask for more iterations than the stored profile provides
    query_iteration = min(self.iteration,
                          check_profile_integrity(dominfo))
    query = build_profile_filename(domain_path, queryid + '.prof',
                                   query_iteration,
                                   self.profile_extention)
    output = build_sequence_filename(domain_path, queryid,
                                     self.alignment_extention)

    # explicit arguments instead of the fragile "% locals()" idiom;
    # output text is unchanged
    command = '%s -q %s -j %s -m %s -u -s %s -d %s %s %s' % (
        self.cmd, queryid, self.iteration, self.alignment_method,
        self.search_db_size, self.search_db, query, output)

    if os.path.exists(filename):
        raise JobScriptWriteError(
            "Profile search job script %s already exists!" % filename)

    # 'with' guarantees the handle is closed even on write failure
    with open(filename, 'w') as fp:
        print(command, file=fp)

    return filename
def _prepare_compass_search_db(db_filename, domain_informations,
                               iteration, use_between,
                               selected_iterations):
    '''
    Prepare a COMPASS search database.

    Concatenates the per-domain COMPASS numerical profile files for
    the given iteration into db_filename, writes the accumulated
    profile size into db_filename + ".len", and returns the number
    of profile records written.

    Raises SearchDatabasePreparationError when a required profile
    file cannot be found.
    '''
    ext = Settings.get('compass_suffix')

    # iteration immediately preceding this one in the selected list;
    # used to decide whether a between-iteration profile is usable
    previous_iteration = selected_iterations[
        max(selected_iterations.index(iteration) - 1, 0)]

    compass_db_size = 0
    number_of_records = 0

    with open(db_filename, 'w') as db_fp:
        for dominfo in domain_informations:
            # read domain path
            domain_path = dominfo['domain_path']
            if not domain_path:
                if verbose:
                    print("WARNING: Dominfo does not have domain_path...")
                    print(dominfo)
                continue

            domid = dominfo['uniqueid']
            compass_file = build_profile_filename(
                domain_path, domid + '.prof', iteration, ext)

            if not os.path.exists(compass_file) and use_between:
                # the profile for this exact iteration is missing and
                # the use_between flag is on: fall back to the last
                # available iteration when it is newer than the
                # previously selected one
                last_available_iteration = check_profile_integrity(dominfo)
                if not last_available_iteration:
                    print("WARNING: Profile is bad!", domain_path, domid)
                    continue

                if last_available_iteration > previous_iteration:
                    compass_file = build_profile_filename(
                        domain_path, domid + '.prof',
                        last_available_iteration, ext)
                else:
                    if verbose:
                        print("No between iteration available!",
                              iteration, last_available_iteration)
                    continue
            elif not os.path.exists(compass_file) and not use_between:
                # profile not available and use_between is off:
                # always take the last good iteration
                last_available_iteration = check_profile_integrity(dominfo)
                compass_file = build_profile_filename(
                    domain_path, domid + '.prof',
                    last_available_iteration, ext)

            # final check!  (the original had an unreachable
            # 'continue' after this raise; it has been removed)
            if not os.path.exists(compass_file):
                print("WARNING: COMPASS file should be available "
                      "but not found!", compass_file, file=sys.stderr)
                raise SearchDatabasePreparationError(
                    "COMPASS numerical profile file is not available!",
                    compass_file)

            with open(compass_file) as fp:
                db_fp.write(fp.read())
            number_of_records += 1

            compass_size_file = compass_file + ".len"
            try:
                with open(compass_size_file) as fp:
                    compass_db_size += int(fp.read().strip())
            except (OSError, ValueError):
                # size file missing or unparsable: warn and keep
                # going, as the original best-effort behavior did
                print("WARNING: Cannot read compass profile size file.",
                      compass_size_file)

    # write the size file only after the database was assembled
    with open(db_filename + ".len", 'w') as db_size_fp:
        print(compass_db_size, file=db_size_fp)

    return number_of_records
def _prepare_hhsearch_search_db(db_filename, domain_informations,
                                iteration, use_between,
                                selected_iterations):
    '''
    Prepare an HHsearch search database.

    Concatenates the per-domain HHsearch HMM files for the given
    iteration into db_filename and returns the number of records
    saved in the database file.

    Raises ValueError when a dominfo record lacks 'domain_path' and
    SearchDatabasePreparationError when a required HMM file cannot
    be found.
    '''
    ext = Settings.get('hhm_suffix')

    # getting previous iteration for selecting iterations newer
    # than the previously selected one
    previous_iteration = selected_iterations[
        max(selected_iterations.index(iteration) - 1, 0)]

    number_of_records = 0

    with open(db_filename, 'w') as db_fp:
        for dominfo in domain_informations:
            # read domain path
            if 'domain_path' in dominfo:
                domain_path = dominfo['domain_path']
            else:
                raise ValueError('domain_path does not exists', dominfo)

            if not domain_path:
                if verbose:
                    print("WARNING: Dominfo does not have domain_path...")
                    print(dominfo)
                continue

            domid = dominfo['uniqueid']
            hhsearch_file = build_profile_filename(
                domain_path, domid + '.prof', iteration, ext)

            if not os.path.exists(hhsearch_file) and use_between:
                # HMM for this exact iteration is missing and the
                # use_between flag is on: fall back to the last
                # available iteration when it is newer than the
                # previously selected one
                last_available_iteration = check_profile_integrity(dominfo)
                if not last_available_iteration:
                    print("WARNING: Profile is bad!", domain_path, domid)
                    continue

                if last_available_iteration > previous_iteration:
                    hhsearch_file = build_profile_filename(
                        domain_path, domid + '.prof',
                        last_available_iteration, ext)
                else:
                    if verbose:
                        print("WARNING: No between iteration available!",
                              iteration, last_available_iteration)
                    continue
            elif not os.path.exists(hhsearch_file) and not use_between:
                # HMM not available and use_between is off:
                # always take the last good iteration
                last_available_iteration = check_profile_integrity(dominfo)
                hhsearch_file = build_profile_filename(
                    domain_path, domid + '.prof',
                    last_available_iteration, ext)

            # final check!
            if not os.path.exists(hhsearch_file):
                print("Error: HHsearch HMM file should be available "
                      "but not found!", hhsearch_file, file=sys.stderr)
                raise SearchDatabasePreparationError(
                    "HHsearch HHM file is not available!", hhsearch_file)

            with open(hhsearch_file) as fp:
                db_fp.write(fp.read())
            number_of_records += 1

    return number_of_records