def test_get_overlaps_table(self):
    "Test utils.get_overlaps_table()"
    mock_features = [
        FakeFeature('CDS', FeatureLocation(10, 40), {"locus_tag": ["G1"]}),
        FakeFeature('CDS', FeatureLocation(40, 50), {"locus_tag": ["G2"]}),
        FakeFeature('CDS', FeatureLocation(45, 70), {"locus_tag": ["G3"]}),
        FakeFeature('CDS', FeatureLocation(75, 100), {"locus_tag": ["G4"]}),
        FakeFeature('CDS', FeatureLocation(101, 110), {"locus_tag": ["G5"]}),
    ]
    mock_rec = FakeRecord(mock_features)
    result = utils.get_overlaps_table(mock_rec)
    expected = ([[mock_features[0]],
                 [mock_features[1], mock_features[2]],
                 [mock_features[3]],
                 [mock_features[4]]],
                {'G5': 3, 'G4': 2, 'G3': 1, 'G2': 1, 'G1': 0})
    self.assertEqual(result, expected, msg=result)
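# Note on the test above: get_overlaps_table() is expected to return a tuple of
# (list of overlap groups, {gene_id: index of its group}), as encoded in `expected`.
# The FakeFeature/FakeRecord test doubles it uses are defined elsewhere in the test
# suite; the sketch below is a minimal assumed version, limited to the attributes the
# test touches (type, location, qualifiers, features). It is an illustration, not the
# project's actual helpers. FeatureLocation is assumed to be Biopython's class.
from Bio.SeqFeature import FeatureLocation


class FakeFeature(object):
    "Minimal stand-in for a Bio.SeqFeature.SeqFeature (assumed shape)"

    def __init__(self, feature_type, location, qualifiers=None):
        self.type = feature_type            # e.g. 'CDS'
        self.location = location            # a FeatureLocation(start, end)
        self.qualifiers = qualifiers or {}  # e.g. {"locus_tag": ["G1"]}


class FakeRecord(object):
    "Minimal stand-in for a Bio.SeqRecord.SeqRecord carrying only .features"

    def __init__(self, features=None):
        self.features = features or []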
def test_apply_cluster_rules(self):
    enabled_clustertypes = list(set(self.rulesdict.keys()))
    detected_types = hmm_detection.apply_cluster_rules(
        self.results_by_id, self.feature_by_id, enabled_clustertypes,
        self.rulesdict, utils.get_overlaps_table(self.record))
    for gid in detected_types:
        detected_types[gid] = set(detected_types[gid].split("-"))
    expected_types = {
        "GENE_1": set(["MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"]),
        "GENE_2": set(["MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"]),
        "GENE_3": set(["MetaboliteA", "MetaboliteB", "MetaboliteC", "MetaboliteD"]),
        "GENE_4": set(["MetaboliteA", "MetaboliteB", "MetaboliteC"]),
        "GENE_5": set(["MetaboliteA", "MetaboliteB", "MetaboliteC"]),
    }
    self.assertEqual(detected_types, expected_types,
                     msg="\nResult : %s\nExpected : %s" % (detected_types, expected_types))
def main():
    multiprocessing.freeze_support()
    res_object = {}

    # get genome files
    files = []
    for line in open(sys.argv[1], 'r'):
        files.append(path.expanduser(line.replace("\n", "")))

    # mock up an antiSMASH run per file
    i = 1
    for fpath in files:
        res_object[fpath] = {}
        print "Processing %s... (%d/%d)" % (fpath, i, len(files))
        i += 1
        options = get_mockup_config()
        options.sequences = [fpath]
        config.set_config(options)
        run_antismash.setup_logging(options)  # TODO: get antiSMASH logging to work!

        # load plugins
        plugins = run_antismash.load_detection_plugins()
        run_antismash.filter_plugins(plugins, options, options.enabled_cluster_types)

        # parse to seq_records
        seq_records = run_antismash.parse_input_sequences(options)
        options.next_clusternr = 1

        for seq_record in seq_records:
            if options.input_type == 'nucl':
                seq_records = [record for record in seq_records if len(record.seq) > 1000]
                if len(seq_records) < 1:
                    continue
            utils.sort_features(seq_record)
            run_antismash.strip_record(seq_record)
            utils.fix_record_name_id(seq_record, options)

            # fetch results_by_id
            feature_by_id = utils.get_feature_dict(seq_record)
            results = []
            results_by_id = {}
            for feature in utils.get_cds_features(seq_record):
                prefix = "%s:" % seq_record.id.replace(":", "_")
                gene_id = utils.get_gene_id(feature)
                if (prefix + gene_id) in options.hmm_results:
                    results_by_id[gene_id] = options.hmm_results[prefix + gene_id]
                    for res in results_by_id[gene_id]:
                        results.append(res)

            # temporarily remove short CDSs (< min_length_aa aa) without hits
            min_length_aa = 100
            short_cds_buffer = []
            # iterate over a copy so removals do not skip features
            for f in list(seq_record.features):
                if f.type == "CDS" and len(f.qualifiers['translation'][0]) < min_length_aa \
                        and utils.get_gene_id(f) not in results_by_id:
                    short_cds_buffer.append(f)
                    seq_record.features.remove(f)

            overlaps = utils.get_overlaps_table(seq_record)
            rulesdict = hmm_detection.create_rules_dict(options.enabled_cluster_types)

            # find total cdhit numbers in the chromosome
            total_cdhit = len(utils.get_cdhit_table(utils.get_cds_features(seq_record))[0])
            res_object[fpath][seq_record.id] = {
                "total_clusters": 0,
                "total_genes": len(overlaps[0]),
                "total_cdhit": total_cdhit,
                "genes_with_hits": 0,
                "largest_cdhit": 0,
                "largest_domain_variations": 0,
                "per_hits": {},
                "cluster_types": {}
            }

            # filter overlap hits
            results, results_by_id = hmm_detection.filter_results(
                results, results_by_id, overlaps, feature_by_id)

            # count hits
            for gene_id in results_by_id:
                res_gene = results_by_id[gene_id]
                if len(res_gene) > 0:
                    res_object[fpath][seq_record.id]["genes_with_hits"] += 1
                for hsp in res_gene:
                    domain_name = hsp.query_id.replace("plants/", "")
                    if domain_name not in res_object[fpath][seq_record.id]["per_hits"]:
                        res_object[fpath][seq_record.id]["per_hits"][domain_name] = 0
                    res_object[fpath][seq_record.id]["per_hits"][domain_name] += 1

            # do cluster finding algorithm
            typedict = hmm_detection.apply_cluster_rules(
                results_by_id, feature_by_id, options.enabled_cluster_types,
                rulesdict, overlaps)
            hmm_detection.fix_hybrid_clusters_typedict(typedict)
            nseqdict = hmm_detection.get_nseq()
            for cds in results_by_id.keys():
                feature = feature_by_id[cds]
                if typedict[cds] != "none":
                    hmm_detection._update_sec_met_entry(
                        feature, results_by_id[cds], typedict[cds], nseqdict)
            hmm_detection.find_clusters(seq_record, rulesdict, overlaps)
            seq_record.features.extend(short_cds_buffer)
            res_object[fpath][seq_record.id]["total_clusters"] += len(
                utils.get_cluster_features(seq_record))

            # do cluster specific and unspecific analysis
            if len(utils.get_cluster_features(seq_record)) > 0:
                run_antismash.cluster_specific_analysis(plugins, seq_record, options)
            run_antismash.unspecific_analysis(seq_record, options)

            # rearrange hybrid cluster names alphabetically
            hmm_detection.fix_hybrid_clusters(seq_record)

            # before writing to output, remove all hmm_detection subdir prefixes from clustertypes
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = []
                    for name in prod.split('-'):
                        prod_name.append(name.split('/')[-1])
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = [ct.split('/')[-1]
                                            for ct in row.split('Type: ')[-1].split('-')]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        elif row.startswith('Domains detected: '):
                            cluster_results = []
                            for cluster_result in row.split('Domains detected: ')[-1].split(';'):
                                cluster_results.append(
                                    cluster_result.split(' (E-value')[0].split('/')[-1] +
                                    ' (E-value' + cluster_result.split(' (E-value')[-1])
                            temp_qual.append('Domains detected: ' + ";".join(cluster_results))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # on plants, remove the plant clustertype from hybrid types, and replace a
            # lone plant clustertype with "putative"
            for cluster in utils.get_cluster_features(seq_record):
                prod_names = []
                for prod in cluster.qualifiers['product']:
                    prod_name = list(set(prod.split('-')))
                    if (len(prod_name) > 1) and ("plant" in prod_name):
                        prod_name.remove("plant")
                    elif prod_name == ["plant"]:
                        prod_name = ["putative"]
                    prod_names.append("-".join(prod_name))
                cluster.qualifiers['product'] = prod_names
            for cds in utils.get_cds_features(seq_record):
                if 'sec_met' in cds.qualifiers:
                    temp_qual = []
                    for row in cds.qualifiers['sec_met']:
                        if row.startswith('Type: '):
                            clustertypes = list(set(row.split('Type: ')[-1].split('-')))
                            if (len(clustertypes) > 1) and ("plant" in clustertypes):
                                clustertypes.remove("plant")
                            elif clustertypes == ["plant"]:
                                clustertypes = ["putative"]
                            temp_qual.append('Type: ' + "-".join(clustertypes))
                        else:
                            temp_qual.append(row)
                    cds.qualifiers['sec_met'] = temp_qual

            # find largest cdhit number & largest domain diversity in a cluster
            res_object[fpath][seq_record.id]["average_cdhit"] = 0
            res_object[fpath][seq_record.id]["average_domain_variations"] = 0
            cdhit_numbers = []
            domain_numbers = []
            for cluster in utils.get_cluster_features(seq_record):
                cluster_type = utils.get_cluster_type(cluster)
                if cluster_type not in res_object[fpath][seq_record.id]["cluster_types"]:
                    res_object[fpath][seq_record.id]["cluster_types"][cluster_type] = 0
                res_object[fpath][seq_record.id]["cluster_types"][cluster_type] += 1
                num_cdhit = len(utils.get_cluster_cdhit_table(cluster, seq_record))
                num_domain = len(utils.get_cluster_domains(cluster, seq_record))
                cdhit_numbers.append(num_cdhit)
                domain_numbers.append(num_domain)
                if num_cdhit > res_object[fpath][seq_record.id]["largest_cdhit"]:
                    res_object[fpath][seq_record.id]["largest_cdhit"] = num_cdhit
                if num_domain > res_object[fpath][seq_record.id]["largest_domain_variations"]:
                    res_object[fpath][seq_record.id]["largest_domain_variations"] = num_domain
            if len(cdhit_numbers) > 0:
                res_object[fpath][seq_record.id]["average_cdhit"] = numpy.median(cdhit_numbers)
            if len(domain_numbers) > 0:
                res_object[fpath][seq_record.id]["average_domain_variations"] = numpy.median(
                    domain_numbers)

    with open('result.js', 'w') as h:
        h.write('var result = %s;' % json.dumps(res_object, indent=4))
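# main() above serialises its per-genome statistics into result.js as a JavaScript
# assignment ("var result = {...};"). The helpers below are an illustrative sketch
# (not part of the project) showing how that file could be loaded back into Python
# for quick inspection; the key names mirror the ones populated in main().
import json


def load_mockup_result(js_path='result.js'):
    "Strip the 'var result = ...;' wrapper and parse the embedded JSON"
    with open(js_path) as handle:
        text = handle.read().strip()
    return json.loads(text[text.index('=') + 1:].rstrip(';'))


def summarize_mockup_result(res_object):
    "Print cluster counts per genome file / record (keys as written by main())"
    for fpath, records in res_object.items():
        for record_id, stats in records.items():
            print "%s / %s: %d clusters, %d/%d genes with hits" % (
                fpath, record_id, stats["total_clusters"],
                stats["genes_with_hits"], stats["total_genes"])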
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    "Function to be executed by module"
    logging.info('Detecting gene clusters using HMM library')
    feature_by_id = utils.get_feature_dict(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)
    results = []
    sig_by_name = {}
    results_by_id = {}
    for sig in get_sig_profiles():
        sig_by_name[sig.name] = sig

    for feature in utils.get_cds_features(seq_record):
        prefix = "%s:" % seq_record.id.replace(":", "_")
        gene_id = utils.get_gene_id(feature)
        if (prefix + gene_id) in options.hmm_results:
            results_by_id[gene_id] = options.hmm_results[prefix + gene_id]
            for res in results_by_id[gene_id]:
                results.append(res)

    short_cds_buffer = []
    if options.ignore_short_aa:
        # Temporarily filter out CDS features shorter than min_length_aa residues
        # that have no HMM hits
        min_length_aa = 50
        if options.eukaryotic:
            min_length_aa = 100
        # iterate over a copy so removals do not skip features
        for f in list(seq_record.features):
            if f.type == "CDS" and len(f.qualifiers['translation'][0]) < min_length_aa \
                    and utils.get_gene_id(f) not in results_by_id:
                short_cds_buffer.append(f)
                seq_record.features.remove(f)

    # Get overlap tables (for overlap filtering etc.)
    overlaps = utils.get_overlaps_table(seq_record)

    # Filter results by comparing scores of different models (for PKS systems)
    results_to_delete = [gene_id for gene_id in results_by_id]
    results, results_by_id = filter_results(results, results_by_id, overlaps, feature_by_id)

    # Write the filtered results back to options.hmm_results
    for gene_id in results_by_id:
        results_to_delete.remove(gene_id)
        prefix = "%s:" % seq_record.id.replace(":", "_")
        if (prefix + gene_id) in options.hmm_results:
            options.hmm_results[(prefix + gene_id)] = results_by_id[gene_id]
    for gene_id in results_to_delete:
        prefix = "%s:" % seq_record.id.replace(":", "_")
        if (prefix + gene_id) in options.hmm_results:
            del options.hmm_results[(prefix + gene_id)]

    # Use rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id, enabled_clustertypes,
                                   rulesdict, overlaps)

    # Rearrange hybrid cluster names in typedict alphabetically
    fix_hybrid_clusters_typedict(typedict)

    # Find number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    # Save final results to seq_record
    for cds in results_by_id.keys():
        feature = feature_by_id[cds]
        if typedict[cds] != "none":
            _update_sec_met_entry(feature, results_by_id[cds], typedict[cds], nseqdict)
    find_clusters(seq_record, rulesdict, overlaps)

    # Find additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)

    # Rearrange hybrid cluster names alphabetically
    fix_hybrid_clusters(seq_record)

    # Add details of gene cluster detection to cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)

    # Re-add the short CDSs
    seq_record.features.extend(short_cds_buffer)
    utils.sort_features(seq_record)

    # If the all-orfs option is on, remove irrelevant short ORFs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)

    # Display %identity
    if options.enable_cdhit:
        store_percentage_identities(seq_record)
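# Both main() in the mockup script and the detect_signature_genes() variant above look
# up precomputed hits in options.hmm_results under the key
# "<record id with ':' replaced by '_'>:<gene id>". A hypothetical helper (not part of
# the codebase) making that convention explicit:
def hmm_result_key(seq_record, gene_id):
    "Build the options.hmm_results lookup key used in the code above"
    return "%s:%s" % (seq_record.id.replace(":", "_"), gene_id)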
def detect_signature_genes(seq_record, enabled_clustertypes, options):
    "Function to be executed by module"
    feature_by_id = utils.get_feature_dict(seq_record)
    full_fasta = utils.get_multifasta(seq_record)
    rulesdict = create_rules_dict(enabled_clustertypes)
    results = []
    sig_by_name = {}
    results_by_id = {}
    for sig in _signature_profiles:
        sig_by_name[sig.name] = sig

    runresults = utils.run_hmmsearch(utils.get_full_path(__file__, 'bgc_seeds.hmm'),
                                     full_fasta, use_tempfile=True)
    for runresult in runresults:
        acc = runresult.accession.split('.')[0]
        # Store result if it is above cut-off
        for hsp in runresult.hsps:
            if hsp.query_id in sig_by_name:
                sig = sig_by_name[hsp.query_id]
            elif acc in sig_by_name:
                sig = sig_by_name[acc]
            else:
                logging.error('BUG: Failed to find signature for ID %s / ACC %s',
                              hsp.query_id, acc)
                continue
            if hsp.bitscore > sig.cutoff:
                results.append(hsp)
                if hsp.hit_id not in results_by_id:
                    results_by_id[hsp.hit_id] = [hsp]
                else:
                    results_by_id[hsp.hit_id].append(hsp)

    # Get overlap tables (for overlap filtering etc.)
    overlaps = utils.get_overlaps_table(seq_record)

    # Filter results by comparing scores of different models (for PKS systems)
    results, results_by_id = filter_results(results, results_by_id)

    # Filter results of overlapping genes (only for plants)
    if options.taxon == 'plants':
        results, results_by_id = filter_result_overlapping_genes(
            results, results_by_id, overlaps, feature_by_id)

    # Filter multiple results of the same model in one gene
    results, results_by_id = filter_result_multiple(results, results_by_id)

    # Use rules to determine gene clusters
    typedict = apply_cluster_rules(results_by_id, feature_by_id, enabled_clustertypes,
                                   rulesdict, overlaps)

    # Find number of sequences on which each pHMM is based
    nseqdict = get_nseq()

    # Save final results to seq_record
    for cds in results_by_id.keys():
        feature = feature_by_id[cds]
        _update_sec_met_entry(feature, results_by_id[cds], typedict[cds], nseqdict)
    find_clusters(seq_record, rulesdict)

    # Find additional NRPS/PKS genes in gene clusters
    add_additional_nrpspks_genes(typedict, results_by_id, seq_record, nseqdict)

    # Add details of gene cluster detection to cluster features
    store_detection_details(results_by_id, rulesdict, seq_record)

    # If the all-orfs option is on, remove irrelevant short ORFs
    if options.all_orfs:
        remove_irrelevant_allorfs(seq_record)
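# The second detect_signature_genes() variant above runs hmmsearch itself and only
# reads options.taxon and options.all_orfs directly (the helpers it calls may expect
# more). Below is a hypothetical, minimal driver sketch; the GenBank path and cluster
# type names are placeholders, not values taken from this codebase.
from argparse import Namespace
from Bio import SeqIO

seq_record = SeqIO.read('example.gbk', 'genbank')    # placeholder input file
options = Namespace(taxon='plants', all_orfs=False)  # only attributes read directly above
enabled_clustertypes = ['t1pks', 'terpene']          # placeholder cluster type names
detect_signature_genes(seq_record, enabled_clustertypes, options)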