def test_get_multifasta(self):
    """Check the text utils.get_multifasta() produces for self.rec.

    The expected output lists the entries orf0001, orf0003 and orf0006,
    each followed by the placeholder sequence FAKESEQ.
    """
    # NOTE(review): the separators inside this literal are reproduced exactly
    # as they appear in this view of the file -- confirm against the fixture.
    want = """>orf0001 FAKESEQ >orf0003 FAKESEQ >orf0006 FAKESEQ"""
    got = utils.get_multifasta(self.rec)
    self.assertMultiLineEqual(want, got)
def run(seq_record, options):
    """Scan the record's sequences against Pfam-A and annotate the hits.

    The multi-FASTA built by utils.get_multifasta() is handed to
    utils.run_hmmscan() together with the 'Pfam-A.hmm' file located in
    options.pfamdir; the results are passed on to _annotate().

    Args:
        seq_record: record forwarded to utils.get_multifasta() and _annotate().
        options: run configuration. If it has no 'pfamdir' entry, one is set
            from utils.get_full_path(__file__, '') (this mutates options).

    Nothing is returned.
    """
    if 'pfamdir' not in options:
        # No Pfam location configured: fall back to the helper-resolved path.
        options.pfamdir = utils.get_full_path(__file__, '')

    fasta = utils.get_multifasta(seq_record)
    pfam_db = path.join(options.pfamdir, 'Pfam-A.hmm')

    logging.info('Running whole-genome pfam search')
    hits = utils.run_hmmscan(pfam_db, fasta)

    _annotate(seq_record, options, hits)
def run(seq_record, options):
    """Scan the record's sequences against Pfam-A and annotate the hits.

    When options.skip_cleanup is set, the scan output is kept in
    'fullhmmer.txt' inside options.full_outputfolder_path: an existing file
    is parsed instead of re-running the scan, otherwise the scan is run with
    that file as its results_file. Without skip_cleanup the scan is always
    run and no results file is requested.

    Args:
        seq_record: record forwarded to utils.get_multifasta() and _annotate().
        options: run configuration. If it has no 'pfamdir' entry, one is set
            from utils.get_full_path(__file__, '') (this mutates options).

    Nothing is returned.
    """
    if 'pfamdir' not in options:
        # No Pfam location configured: fall back to the helper-resolved path.
        options.pfamdir = utils.get_full_path(__file__, '')

    fasta = utils.get_multifasta(seq_record)
    pfam_db = path.join(options.pfamdir, 'Pfam-A.hmm')

    logging.info('Running whole-genome pfam search')
    if not options.skip_cleanup:
        hits = utils.run_hmmscan(pfam_db, fasta)
    else:
        cached_output = path.join(options.full_outputfolder_path,
                                  'fullhmmer.txt')
        if path.exists(cached_output):
            # Reuse the output left behind by an earlier run.
            hits = list(SearchIO.parse(cached_output, 'hmmer3-text'))
        else:
            hits = utils.run_hmmscan(pfam_db, fasta,
                                     results_file=cached_output)

    _annotate(seq_record, options, hits)
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    """Search the record against bgc_seeds.hmm and store the detection results.

    Hits scoring above their signature's cutoff are collected, filtered,
    run through the cluster rules and then written back to the matching
    features of seq_record, after which the cluster-level helpers are run.

    Args:
        seq_record: record whose features and sequences are analysed and
            updated by the helpers called here.
        enabled_clustertypes: forwarded to create_rules_dict() and
            apply_cluster_rules().
        options: run configuration; `taxon` and `all_orfs` are read here.

    Nothing is returned.
    """
    feature_by_id = utils.get_feature_dict(seq_record)
    full_fasta = utils.get_multifasta(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)

    # Signature profiles indexed by name (a later duplicate name wins).
    sig_by_name = {profile.name: profile for profile in _signature_profiles}

    seed_hmms = utils.get_full_path(__file__, 'bgc_seeds.hmm')
    runresults = utils.run_hmmsearch(seed_hmms, full_fasta, use_tempfile=True)

    results = []
    results_by_id = {}
    for runresult in runresults:
        # Accession with everything from the first '.' onwards dropped.
        acc = runresult.accession.split('.')[0]
        for hsp in runresult.hsps:
            # Look the signature up by query ID first, then by accession.
            if hsp.query_id in sig_by_name:
                sig_key = hsp.query_id
            elif acc in sig_by_name:
                sig_key = acc
            else:
                logging.error(
                    'BUG: Failed to find signature for ID %s / ACC %s',
                    hsp.query_id, acc)
                continue
            sig = sig_by_name[sig_key]

            # Store the result only if it is above the signature's cut-off.
            if hsp.bitscore > sig.cutoff:
                results.append(hsp)
                results_by_id.setdefault(hsp.hit_id, []).append(hsp)

    # Get overlap tables (for overlap filtering etc).
    overlaps = utils.get_overlaps_table(seq_record)

    # Filter results by comparing scores of different models (for PKS systems).
    results, results_by_id = filter_results(results, results_by_id)

    # Filter results of overlapping genes (only for plants).
    if options.taxon == 'plants':
        results, results_by_id = filter_result_overlapping_genes(
            results, results_by_id, overlaps, feature_by_id)

    # Filter multiple results of the same model in one gene.
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters.
    typedict = apply_cluster_rules(results_by_id, feature_by_id,
                                   enabled_clustertypes, rulesdict, overlaps)

    # Find number of sequences on which each pHMM is based.
    nseqdict = get_nseq()

    # Save final results to seq_record.
    for cds, cds_hits in results_by_id.items():
        _update_sec_met_entry(feature_by_id[cds], cds_hits, typedict[cds],
                              nseqdict)

    find_clusters(seq_record, rulesdict)

    # Find additional NRPS/PKS genes in gene clusters.
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)

    # Add details of gene cluster detection to cluster features.
    store_detection_details(results_by_id, rulesdict, seq_record)

    # If all-orfs option on, remove irrelevant short orfs.
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)