def setUp(self):
    self.args = Pseudoargs(
        blast_query,
        blast_in,
        blast_db,
        b_type='plain',
        prediction_method=['rnafold'],
        blast_regexp=r'(?<=\|)[A-Z0-9]*\.?\d*$',
        enable_overwrite=True,
        mode='simple',
        html=test_output_file,
    )

    ff, csv = tempfile.mkstemp(prefix='rba_', suffix='_t1')
    os.close(ff)
    self.csv = csv

    ff, json_file = tempfile.mkstemp(prefix='rba_', suffix='_t4')
    os.close(ff)
    self.json = json_file

    ff, fasta = tempfile.mkstemp(prefix='rba_', suffix='_t5')
    os.close(ff)
    self.fasta = fasta

    ff, fasta_structures = tempfile.mkstemp(prefix='rba_', suffix='_t6')
    os.close(ff)
    self.fasta_structures = fasta_structures

    tp = tools_paths(
        os.path.join(
            os.path.dirname(os.path.dirname(__file__)),
            'rna_blast_analyze', 'BR_core', 'config.txt'
        )
    )
    CONFIG.override(tp)

    rfam = RfamInfo()
    self.sha1 = compute_args_hash(self.args, os.path.join(CONFIG.rfam_dir, rfam.gzname))
    self.test_backup_file = blast_in + '.r-' + self.sha1[:10]
def main():
    try:
        # outer envelope for the script
        # ========= perform argument parsing =========
        if download_name in sys.argv and not ('-q' in sys.argv or '--blast_query' in sys.argv):
            # run the Rfam download only; skip this branch when a normal run is also requested
            from rna_blast_analyze.BR_core.config import tools_paths, CONFIG
            from rna_blast_analyze.BR_core import cmalign

            if cfg_name in sys.argv:
                CONFIG.override(tools_paths(config_file=sys.argv[sys.argv.index(cfg_name) + 1]))

            cmalign.download_cmmodels_file()
            # rfam database downloaded
            sys.exit(0)

        args = f_parser()
        _ = lunch_with_args(args)

        # if we reach here, exit with 0
        sys.exit(0)
    except Exception as e:
        print('Something went wrong.')
        try:
            import traceback
            print(
                'The error traceback is written to rboAnalyzer.log. '
                'Please send it along with the query file and BLAST input to the developers.'
            )
            with open('rboAnalyzer.log', 'w') as fd:
                fd.write(str(e))
                fd.write(traceback.format_exc())
        except Exception:
            pass
        sys.exit(1)
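# Illustrative invocations (hypothetical file names; the exact spelling of the
# download flag is held in the 'download_name' constant referenced above, and
# '--download_rfam' / '--blast_query' are the flags mentioned elsewhere in this
# module). This is a sketch of the two entry paths main() distinguishes, not a
# full CLI reference:
#
#   rboAnalyzer --blast_in hits.xml --blast_query query.fasta --blast_db /path/to/db
#   rboAnalyzer --download_rfam     # download-only run, handled by the branch in main()
#
# On any failure, main() prints a short notice and writes the exception plus its
# traceback to rboAnalyzer.log in the current working directory.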
def extend_meta_core(analyzed_hits, query, args_inner, all_short, multi_query, iteration, ih_model, timeout=None):
    ml.debug(fname())
    # update params if a different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    blast_args = deepcopy(args_inner)
    locarna_args = deepcopy(args_inner)
    b_all_short = deepcopy(all_short)
    l_all_short = deepcopy(all_short)

    if args_inner.repredict_file is None:
        fd, repred_file = mkstemp(prefix='rba_', suffix='_18', dir=CONFIG.tmpdir)
        os.close(fd)
    else:
        repred_file = args_inner.repredict_file

    for i, args in enumerate([blast_args, locarna_args]):
        args.prediction_method = []
        args.pred_params = dict()
        args.dump = None
        args.pdf_out = None
        args.pandas_dump = None
        args.repredict_file = repred_file + str(i)
        args.dev_pred = False
        args.logfile = None
        args.json = None
        args.html = None
        args.cm_file = ih_model

    analyzed_hits_simple = deepcopy(analyzed_hits)
    analyzed_hits_locarna = deepcopy(analyzed_hits)

    analyzed_hits_simple, _, _, _ = extend_simple_core(
        analyzed_hits_simple, query, blast_args, b_all_short, multi_query, iteration, ih_model)
    analyzed_hits_locarna, _, _, _ = extend_locarna_core(
        analyzed_hits_locarna, query, locarna_args, l_all_short, multi_query, iteration, ih_model, timeout=timeout)

    # add cmstat to query
    analyzed_hits.query = analyzed_hits_simple.query

    order_out = []

    b_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_simple.hits}
    l_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_locarna.hits}
    ok_keys = sorted(set(b_dict.keys()) | set(l_dict.keys()))
    for inum in ok_keys:
        bh = b_dict.get(inum, None)
        lh = l_dict.get(inum, None)

        hits = [bh, lh]
        # fall back to the simple extension if locarna returned an empty hit,
        # and handle the case when both methods returned empty hits
        filtered_hits = [h for h in hits if h is not None]
        if len(filtered_hits) == 1:
            msg = 'Only one extension method completed successfully for {}. ' \
                  'Choosing the successfully extended sequence for the output.'.format(filtered_hits[0].extension.id)
            ml.info(msg)
            if ml.getEffectiveLevel() < 20:
                print(msg)
            analyzed_hits.hits.append(filtered_hits[0])
            continue
        elif len(filtered_hits) == 0:
            # append the (empty) failed extension
            analyzed_hits.hits_failed.append(lh)
            continue

        bit_scores = [i.extension.annotations['cmstat']['bit_sc'] for i in hits]

        mb = max(bit_scores)
        bit_index = [i for i, j in enumerate(bit_scores) if j == mb][0]
        order_out.append(bit_index)

        analyzed_hits.hits.append(hits[bit_index])

    # build failed hits
    b_dict_failed = {BA_support.get_hit_n(h): h for h in analyzed_hits_simple.hits_failed}
    l_dict_failed = {BA_support.get_hit_n(h): h for h in analyzed_hits_locarna.hits_failed}
    for inum in sorted(set(b_dict_failed) | set(l_dict_failed)):
        if inum not in ok_keys:
            if inum in b_dict_failed:
                analyzed_hits.hits_failed.append(b_dict_failed[inum])
            elif inum in l_dict_failed:
                analyzed_hits.hits_failed.append(l_dict_failed[inum])
            else:
                raise KeyError("Failed to find inum key in failed extensions. This should not happen.")

    # build the repredict file here if needed
    if args_inner.repredict_file:
        b_repredict = BA_support.iter2file_name(blast_args.repredict_file, multi_query, iteration)
        l_repredict = BA_support.iter2file_name(locarna_args.repredict_file, multi_query, iteration)
        o_repredict = BA_support.iter2file_name(args_inner.repredict_file, multi_query, iteration)
        with open(b_repredict, 'r') as barf, open(l_repredict, 'r') as larf, open(o_repredict, 'w') as reprf:
            # note: the order of the files to merge must match the order of the
            # methods in the loop above, i.e. the order in which order_out was built
            bb = (barf, larf)

            fl = bb[0].readline()
            reprf.write(fl)
            fl = bb[0].readline()
            reprf.write(fl)

            # discard the first line of the other files
            [[i.readline() for _ in range(1)] for i in bb[1:]]

            for o in order_out:
                lll = [i.readline() for i in bb]
                reprf.write(lll[o])

    # recreate needed data from the selected hits
    homology_prediction = []
    homol_seqs = []
    for hit in analyzed_hits.hits:
        homology_prediction.append(hit.hpred)
        if hit.hpred:
            homol_seqs.append(hit.extension)

        # add a default prediction if it is not present
        if 'ss0' not in hit.extension.letter_annotations:
            if 'sss' not in hit.extension.annotations:
                hit.extension.annotations['sss'] = []
            hit.extension.annotations['sss'] += ['ss0']
            hit.extension.letter_annotations['ss0'] = '.' * len(hit.extension.seq)

    # remove the description from hits and sources
    for hit in analyzed_hits.hits:
        hit.extension.description = ''

    if args_inner.cm_file or args_inner.use_rfam:
        cm_file_rfam_user = ih_model
    else:
        cm_file_rfam_user = None
        BA_support.remove_one_file_with_try(ih_model)
    return analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user
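# Minimal standalone sketch of the selection rule used in extend_meta_core
# (scores are made up): for a hit extended by both methods, the variant with
# the higher cmstat bit score is kept; on a tie, the first index attaining the
# maximum wins, i.e. the simple/BLAST extension.
#
#   >>> bit_scores = [41.2, 57.8]  # [simple_extension, locarna_extension]
#   >>> [i for i, j in enumerate(bit_scores) if j == max(bit_scores)][0]
#   1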
def lunch_with_args(args):
    # ========= imports ==========
    # move slow imports here, so that argcomplete stays fast
    import logging
    from rna_blast_analyze.BR_core import BA_verify
    from rna_blast_analyze.BR_core import cmalign
    from rna_blast_analyze.BR_core.config import tools_paths, CONFIG
    from rna_blast_analyze.BR_core.validate_args import validate_args, compute_args_hash
    from rna_blast_analyze.BR_core.luncher import lunch_computation
    from rna_blast_analyze.BR_core.convert_classes import blastsearchrecompute2dict
    from rna_blast_analyze.BR_core.cmalign import RfamInfo

    logger = logging.getLogger('rboAnalyzer')
    ch = logging.StreamHandler()
    formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # set logger level
    logger.setLevel(max(3 - args.verbose, 1) * 10)
    logger.debug('parsed arguments: {}'.format(args))

    # create the logging file if requested
    if args.logfile:
        fh = logging.FileHandler(args.logfile)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    start_msg = 'STATUS: starting rboAnalyzer...'
    print(start_msg)
    logger.info(start_msg)

    logger.info('BLAST file: {}'.format(args.blast_in))
    logger.info('Query file: {}'.format(args.blast_query))
    logger.info('BLAST db: {}'.format(args.blast_db))
    if args.config_file:
        logger.info('configfile: {}'.format(args.config_file))

    # ========= load optional cfg file =========
    CONFIG.override(tools_paths(config_file=args.config_file))

    # ========= check rfam =========
    if not args.download_rfam and not cmalign.check_rfam_present():
        msgfail = 'RFAM models file is not present in the specified path. ' \
                  'Please enable the Rfam download (--download_rfam) or provide a prepared directory.'
        logger.error(msgfail)
        sys.exit(1)

    if args.download_rfam:
        cmalign.download_cmmodels_file()

    # ========= check that the tools needed for the requested methods are installed =========
    BA_verify.check_necessery_tools(methods=args.prediction_method + [args.mode])

    # ========= check that the parameters make sense =========
    if not validate_args(args):
        print("There was an error with the provided arguments. Please see the error message.")
        sys.exit(1)

    # ========= compute args hash =========
    rfam = RfamInfo()
    hashstring = compute_args_hash(args, os.path.join(rfam.rfam_dir, rfam.gzname))
    setattr(args, 'sha1', hashstring)

    # ========= run =========
    blast_fn = os.path.basename(args.blast_in) + '.r-' + hashstring[:10]
    blast_dir = os.path.dirname(args.blast_in)
    if blast_dir == '':
        blast_dir = os.getcwd()
    potential_matches = [f for f in os.listdir(blast_dir) if f == blast_fn]
    if len(potential_matches) == 0:
        with open(args.blast_in + '.r-' + args.sha1[:10], 'w') as f:
            json.dump(None, f)

    _, results = lunch_computation(args)

    with open(os.path.join(blast_dir, blast_fn), 'w') as f:
        json.dump([blastsearchrecompute2dict(r) for r in results], f, indent=2)

    return results
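# How the backup/resume file name is derived (sketch; 'hits.xml' and the hash
# value are made up). The '.r-' marker plus the first 10 characters of the
# arguments hash ties a backup file to the exact inputs it was computed from,
# which is also why lunch_computation refuses to reuse a backup whose stored
# full hash differs from the current one.
#
#   >>> sha1 = 'a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'
#   >>> 'hits.xml' + '.r-' + sha1[:10]
#   'hits.xml.r-a94a8fe5cc'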
def test_config_override(self):
    rfam_dir = '/test/test/Documents/rfamdb/'
    self.assertNotEqual(CONFIG.rfam_dir, rfam_dir)
    CONFIG.override(tools_paths(os.path.join(fwd, 'test_data', 'config_test.txt')))
    self.assertEqual(CONFIG.rfam_dir, rfam_dir)
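# A plausible shape for test_data/config_test.txt (an assumption for
# illustration only; the authoritative section and key names are defined by
# tools_paths in rna_blast_analyze.BR_core.config). The test above only
# requires that parsing the file yields rfam_dir == '/test/test/Documents/rfamdb/':
#
#   [DATA]
#   rfam_dir = /test/test/Documents/rfamdb/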
def lunch_computation(args_inner, shared_list=None):
    ml.debug(fname())
    if not shared_list:
        shared_list = []

    # update params if a different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    p_blast = BA_support.blast_in(args_inner.blast_in, b=args_inner.b_type)
    query_seqs = [i for i in SeqIO.parse(args_inner.blast_query, 'fasta')]

    if len(p_blast) != len(query_seqs):
        ml.error(
            'Number of query sequences in the provided BLAST output file ({}) does not match the number of'
            ' query sequences in the query FASTA file ({}).'.format(len(p_blast), len(query_seqs)))
        sys.exit(1)

    # check that the BLAST output does not contain unexpected sequence characters
    validate_args.check_blast(p_blast)

    # create a list of the correct length if needed
    all_saved_data = [None] * len(query_seqs)
    saved_file = '{}.r-{}'.format(args_inner.blast_in, args_inner.sha1[:10])
    with open(saved_file, 'r+') as f:
        _saved = json.load(f)
        if _saved is None:
            f.seek(0)
            f.truncate()
            json.dump(all_saved_data, f)
        else:
            msg = "Loading backup data."
            print('STATUS: ' + msg)
            ml.info(msg + ' file: ' + saved_file)
            all_saved_data = _saved

            for saved_data in all_saved_data:
                # we can have partially computed data
                if saved_data is None:
                    continue
                if saved_data['args']['sha1'] != args_inner.sha1:
                    msg = "Input argument hash does not match the saved argument hash. "
                    if saved_data['args']['sha1'][:10] == args_inner.sha1[:10]:
                        msg += "This is caused by truncating hashes to the first 10 characters. "
                    msg += "Please remove the '{}' file.".format(saved_file)
                    ml.error(msg)
                    sys.exit(1)

    if len(p_blast) > 1:
        multi_query = True
    else:
        multi_query = False

    # this is done for each query
    ml_out_line = []
    all_analyzed = []
    for iteration, (bhp, query, saved_data) in enumerate(zip(p_blast, query_seqs, all_saved_data)):
        if saved_data is None:
            print('STATUS: processing query: {}'.format(query.id))
            validate_args.verify_query_blast(blast=bhp, query=query)

            analyzed_hits = BlastSearchRecompute(args_inner, query, iteration)
            analyzed_hits.multi_query = multi_query

            # build the cm model first
            # this allows failing fast if rfam was selected and no model is found
            ih_model, analyzed_hits = find_and_extract_cm_model(args_inner, analyzed_hits)

            # select all
            all_blast_hits = BA_support.blast_hsps2list(bhp)

            if len(all_blast_hits) == 0:
                ml.error('No hits found in {} - {}. Nothing to do.'.format(args_inner.blast_in, bhp.query))
                continue

            # filter if needed
            if args_inner.filter_by_eval is not None:
                tmp = filter_by_eval(
                    all_blast_hits, BA_support.blast_hit_getter_from_hits, args_inner.filter_by_eval)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error('The requested filter removed all BLAST hits {} - {}. Nothing to do.'.format(
                        args_inner.blast_in, bhp.query))
                    continue
            elif args_inner.filter_by_bitscore is not None:
                tmp = filter_by_bits(
                    all_blast_hits, BA_support.blast_hit_getter_from_hits, args_inner.filter_by_bitscore)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error('The requested filter removed all BLAST hits {} - {}. Nothing to do.'.format(
                        args_inner.blast_in, bhp.query))
                    continue
            else:
                tmp = all_blast_hits

            # use the filtered hits (or all hits when no filter was requested)
            all_short = tmp

            # now this is different for each mode
            if args_inner.mode == 'simple':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_simple_core(
                    analyzed_hits, query, args_inner, all_short, multi_query, iteration, ih_model)
            elif args_inner.mode == 'locarna':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_locarna_core(
                    analyzed_hits, query, args_inner, all_short, multi_query, iteration, ih_model)
            elif args_inner.mode == 'meta':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_meta_core(
                    analyzed_hits, query, args_inner, all_short, multi_query, iteration, ih_model)
            else:
                raise ValueError('Unknown mode option - this should have been caught by argparse.')

            if len(analyzed_hits.hits) == 0:
                ml.error(
                    "Extension failed for all sequences. Please see the error message. "
                    "You can also try '--mode simple'.")
                sys.exit(1)

            analyzed_hits.copy_hits()

            with open(args_inner.blast_in + '.r-' + args_inner.sha1[:10], 'r+') as f:
                all_saved_data = json.load(f)
                all_saved_data[iteration] = blastsearchrecompute2dict(analyzed_hits)
                f.seek(0)
                f.truncate()
                json.dump(all_saved_data, f, indent=2)
        else:
            print('STATUS: extended sequences loaded from backup file for query {}'.format(query.id))
            analyzed_hits = blastsearchrecomputefromdict(saved_data)

            # overwrite the saved args with the current ones
            # this will update the used prediction methods and other non-essential settings
            analyzed_hits.args = args_inner

            if analyzed_hits.args.cm_file:
                cm_file_rfam_user = analyzed_hits.args.cm_file
            else:
                cm_file_rfam_user = None

        all_analyzed.append(analyzed_hits)

        # write all hits to fasta
        fda, all_hits_fasta = mkstemp(prefix='rba_', suffix='_22', dir=CONFIG.tmpdir)
        os.close(fda)
        analyzed_hits.write_results_fasta(all_hits_fasta)

        out_line = []
        # multiple prediction params
        if args_inner.dev_pred:
            dp_list = []
            # accommodate more dev pred outputs
            dpfile = None
            if getattr(args_inner, 'dump', False):
                dpfile = args_inner.dump.strip('dump')
            if getattr(args_inner, 'pandas_dump', False):
                dpfile = args_inner.pandas_dump.strip('pandas_dump')
            if getattr(args_inner, 'json', False):
                dpfile = args_inner.json.strip('json')

            # optimization so that the rfam cm file is fetched only once
            if cm_file_rfam_user is None and 'rfam' in ''.join(args_inner.prediction_method):
                best_model = get_cm_model(args_inner.blast_query, threads=args_inner.threads)
                rfam = RfamInfo()
                cm_file_rfam_user = run_cmfetch(rfam.file_path, best_model)

            for method in args_inner.prediction_method:
                # cycle through the prediction method settings
                # get the set of params for each selected prediction method
                selected_pred_params = [kk for kk in args_inner.pred_params if method in kk]
                shuffle(selected_pred_params)
                for i, method_params in enumerate(selected_pred_params):
                    ah = deepcopy(analyzed_hits)

                    random_flag = BA_support.generate_random_name(8, shared_list)
                    shared_list.append(random_flag)

                    pname = re.sub(' ', '', str(method))
                    flag = '|pred_params|' + random_flag

                    # rebuild the args with only the actually used prediction settings
                    ah.args.prediction_method = method
                    ah.args.pred_params = method_params

                    if getattr(args_inner, 'dump', False):
                        spa = args_inner.dump.split('.')
                        ah.args.dump = '.'.join(spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pandas_dump', False):
                        spa = args_inner.pandas_dump.split('.')
                        ah.args.pandas_dump = '.'.join(spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pdf_out', False):
                        spa = args_inner.pdf_out.split('.')
                        ah.args.pdf_out = '.'.join(spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'json', False):
                        spa = args_inner.json.split('.')
                        ah.args.json = '.'.join(spa[:-1]) + flag + '.' + spa[-1]

                    wrapped_ending_with_prediction(
                        args_inner=ah.args,
                        analyzed_hits=ah,
                        pred_method=method,
                        method_params=method_params,
                        used_cm_file=cm_file_rfam_user,
                        multi_query=multi_query,
                        iteration=iteration,
                    )
                    success = True
                    out_line.append(to_tab_delim_line_simple(ah.args))

                    dp_list.append((i, method_params, success, flag, pname, random_flag, args_inner.pred_params))

            if dpfile is not None:
                with open(dpfile + 'devPredRep', 'wb') as devf:
                    pickle.dump(dp_list, devf)
        else:
            wrapped_ending_with_prediction(
                args_inner=args_inner,
                analyzed_hits=analyzed_hits,
                used_cm_file=cm_file_rfam_user,
                multi_query=multi_query,
                iteration=iteration,
            )
            out_line.append(to_tab_delim_line_simple(args_inner))

        ml_out_line.append('\n'.join(out_line))

        if cm_file_rfam_user is not None and os.path.exists(cm_file_rfam_user):
            BA_support.remove_one_file_with_try(cm_file_rfam_user)

        BA_support.remove_one_file_with_try(all_hits_fasta)

    return '\n'.join(ml_out_line), all_analyzed
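# Usage sketch (assumes 'args' is the namespace produced by the argument
# parsing in lunch_with_args, including the 'sha1' attribute it sets). The
# function returns a tab-delimited summary of the prediction runs and one
# BlastSearchRecompute object per query:
#
#   report, analyzed = lunch_computation(args)
#   for rec in analyzed:
#       print(rec.query.id, len(rec.hits), 'extended hits')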