def main():
    """Print bait totals per ontology class for a single FSL file.

    Expects exactly one command-line argument (the FSL file); otherwise a
    usage line is printed and the process exits with status -1.

    The per-gene totals returned by ``read_fsl`` are added to every class
    that ``ontology_common.get_class`` yields for that gene.  Classes whose
    name contains 'resistance gene' are reported first, then every other
    class (labelled "mechanism" in the output), each section followed by
    its own total.
    """
    if len(sys.argv) != 2:
        print("Usage: bait_topmatch.py file.fsl")
        sys.exit(-1)

    totals_by_gene = read_fsl(sys.argv[1])
    terms = ontology_common.parse_obo('new_combined.obo')

    # Accumulate each gene's total under every class it belongs to.
    baits_for_class = collections.defaultdict(int)
    for gene, gene_total in totals_by_gene.items():
        for class_id in ontology_common.get_class(gene, terms):
            baits_for_class[class_id] += gene_total

    # Section 1: classes named as resistance genes.
    gene_class_total = 0
    for class_id, count in baits_for_class.items():
        class_name = terms[class_id]['name'][0]
        if 'resistance gene' in class_name:
            print(class_name, count)
            gene_class_total += count
    print("Total counts for gene class ", gene_class_total)
    print()

    # Section 2: everything else.
    mechanism_total = 0
    for class_id, count in baits_for_class.items():
        class_name = terms[class_id]['name'][0]
        if 'resistance gene' not in class_name:
            print(class_name, count)
            mechanism_total += count
    print("Total counts for mechanism ", mechanism_total)
def main():
    """Count distinct baits per ontology class for the genes in a FASTA file.

    Expects two command-line arguments (an FSL file and a FASTA file);
    otherwise a usage line is printed and the process exits with status -1.

    For every FASTA gene present in the ``read_fsl`` mapping, element 1 of
    each result is added to the set kept for every class that
    ``ontology_common.get_class`` yields for element 0 of that result.
    Genes absent from the mapping are printed with a count of '0'.
    Classes whose name contains 'resistance gene' are then reported with
    their set sizes, followed by every other class (labelled "mechanism").
    """
    if len(sys.argv) != 3:
        print("Usage: bait_frequency.py file.fsl genes.fa")
        sys.exit(-1)

    target_to_id = read_fsl(sys.argv[1])
    # Only the first element of the returned pair is used here.
    genes, _ = readers.read_fasta(sys.argv[2],
                                  shorten=True,
                                  max_length_shorten=False)
    terms = ontology_common.parse_obo('new_combined.obo')

    baits_for_class = collections.defaultdict(set)
    for gene in genes.keys():
        if gene not in target_to_id:
            print(gene, '0')
            continue
        for hit in target_to_id[gene]:
            for class_id in ontology_common.get_class(hit[0], terms):
                baits_for_class[class_id].add(hit[1])
    print()

    # Section 1: classes named as resistance genes.
    gene_class_total = 0
    for class_id, members in baits_for_class.items():
        class_name = terms[class_id]['name'][0]
        if 'resistance gene' in class_name:
            print(class_name, len(members))
            gene_class_total += len(members)
    print("Total counts for gene class ", gene_class_total)
    print()

    # Section 2: everything else.
    mechanism_total = 0
    for class_id, members in baits_for_class.items():
        class_name = terms[class_id]['name'][0]
        if 'resistance gene' not in class_name:
            print(class_name, len(members))
            mechanism_total += len(members)
    print("Total counts for mechanism ", mechanism_total)
def main():
    """Classify each row of ``mismatch.csv`` by the ancestor its two IDs share.

    The first line of the file is skipped.  Every later line is split on
    the first two commas into an expected ID, a found ID and a gene name
    (the name may therefore contain commas).  ``determine_ancestor`` is
    asked for the common ancestor of the two IDs and the row is handled as
    follows:

    * ancestor equals the found ID      -> counted as "more general";
    * ancestor is None                  -> row printed with a trailing "none";
    * ancestor contains "ARO" and is
      not 'ARO:3000000'                 -> counted as "shared grouping";
    * anything else                     -> row printed.

    Both counters are printed at the end.
    """
    terms = ontology_common.parse_obo('new_combined.obo')
    more_general = 0
    shared_ancestor = 0

    with open('mismatch.csv') as mismatch_file:
        mismatch_file.readline()  # discard the header row
        print(
            "Gene,Expected ID,Found ID,Common Class,Common Class Description")
        for row in mismatch_file:
            expected_id, found_id, gene_name = row.strip().split(',', 2)
            ancestor = determine_ancestor(expected_id, found_id, terms)

            if ancestor == found_id:
                more_general += 1
                continue
            if ancestor is None:
                print('%s,%s,%s,%s,%s' %
                      (gene_name, expected_id, found_id, ancestor, "none"))
                continue
            if "ARO" in ancestor and ancestor != 'ARO:3000000':
                shared_ancestor += 1
                continue
            # NOTE(review): the header announces five columns but this row
            # only carries four (no description column).
            print('%s,%s,%s,%s' %
                  (gene_name, expected_id, found_id, ancestor))

    print("More general:%d, Shared grouping:%d" %
          (more_general, shared_ancestor))
def hyperbolic_tree():
    """Print the JSON hierarchy consumed by the hyperbolic tree view.

    Parses '../new_combined.obo', reads IDs from the file named by the
    first command-line argument, then prints a ``code_hierarchy_data = ``
    header followed by ``root`` serialised as indented JSON.
    """
    # Both results are computed but not referenced again in this function.
    terms = ontology_common.parse_obo('../new_combined.obo')
    ids = get_ids(sys.argv[1])

    print("code_hierarchy_data = ")
    # NOTE(review): `root` is never assigned inside this function, so it
    # must exist at module level or this line raises NameError -- confirm.
    hierarchy_json = json.dumps(root, indent=4, separators=(',', ': '))
    print(hierarchy_json)
def main():
    """Print bait counts per ontology class from a parsed count file.

    Expects exactly one command-line argument (the count file); otherwise
    a usage line is printed and the process exits with status -1.

    Each name/count pair from ``read_count`` is mapped to its group via
    'grouping.csv', and the count is added to every class that
    ``ontology_common.get_class`` yields for that group.  Classes whose
    name contains 'resistance gene' are reported first, then every other
    class (labelled "mechanism"), each section followed by its total.

    Raises:
        KeyError: if a name from the count file has no entry in the
            grouping (the lookup is a plain subscript).
    """
    if len(sys.argv) != 2:
        print("Usage: bait_count.py master_amr_parsed.tsv")
        sys.exit(-1)

    name_to_count = read_count(sys.argv[1])
    groups = readers.read_grouping('grouping.csv',
                                   short=True,
                                   map_name=True,
                                   strip_colon=False)
    terms = ontology_common.parse_obo('new_combined.obo')

    baits_for_class = collections.defaultdict(int)
    for gene, count in name_to_count.items():
        for class_id in ontology_common.get_class(groups[gene], terms):
            baits_for_class[class_id] += count
    print()

    # Section 1: classes named as resistance genes.
    gene_class_total = 0
    for class_id, class_count in baits_for_class.items():
        class_name = terms[class_id]['name'][0]
        if 'resistance gene' in class_name:
            print(class_name, class_count)
            gene_class_total += class_count
    print("Total counts for gene class ", gene_class_total)
    print()

    # Section 2: everything else.
    mechanism_total = 0
    for class_id, class_count in baits_for_class.items():
        class_name = terms[class_id]['name'][0]
        if 'resistance gene' not in class_name:
            print(class_name, class_count)
            mechanism_total += class_count
    print("Total counts for mechanism ", mechanism_total)
def main():
    """Classify each row of ``mismatch.csv`` by the ancestor its two IDs share.

    The first line of the file is skipped.  Every later line is split on
    the first two commas into an expected ID, a found ID and a gene name
    (the name may therefore contain commas).  ``determine_ancestor`` is
    asked for the common ancestor of the two IDs and the row is handled as
    follows:

    * ancestor equals the found ID      -> counted as "more general";
    * ancestor is None                  -> row printed with a trailing "none";
    * ancestor contains "ARO" and is
      not 'ARO:3000000'                 -> counted as "shared grouping";
    * anything else                     -> row printed.

    Both counters are printed at the end.
    """
    terms = ontology_common.parse_obo('new_combined.obo')
    more_general = 0
    shared_ancestor = 0

    with open('mismatch.csv') as mismatch_file:
        mismatch_file.readline()  # discard the header row
        print("Gene,Expected ID,Found ID,Common Class,Common Class Description")
        for row in mismatch_file:
            expected_id, found_id, gene_name = row.strip().split(',', 2)
            ancestor = determine_ancestor(expected_id, found_id, terms)

            if ancestor == found_id:
                more_general += 1
                continue
            if ancestor is None:
                print('%s,%s,%s,%s,%s' %
                      (gene_name, expected_id, found_id, ancestor, "none"))
                continue
            if "ARO" in ancestor and ancestor != 'ARO:3000000':
                shared_ancestor += 1
                continue
            # NOTE(review): the header announces five columns but this row
            # only carries four (no description column).
            print('%s,%s,%s,%s' %
                  (gene_name, expected_id, found_id, ancestor))

    print("More general:%d, Shared grouping:%d" %
          (more_general, shared_ancestor))
def doughnut():
    """Print the JSON hierarchy (with counts) consumed by the doughnut graph.

    Parses '../new_combined.obo', reads counts from the file named by the
    first command-line argument, builds a tree from them, fills in its
    children, and prints a ``code_hierarchy_data = `` header followed by
    the tree root serialised as indented JSON.
    """
    ontology_terms = ontology_common.parse_obo('../new_combined.obo')
    term_counts = get_counts(sys.argv[1])

    tree_root, tree_nodes = build_tree(term_counts, ontology_terms)
    fill_children(tree_root, tree_nodes, ontology_terms)

    print("code_hierarchy_data = ")
    hierarchy_json = json.dumps(tree_root, indent=4, separators=(',', ': '))
    print(hierarchy_json)
def main():
    """Score scan/FSL hits against the antibiotic each FASTA gene is labelled with.

    Command line: an optional ``-fsl`` or ``-protein`` flag, then a results
    file and a FASTA file.  With the wrong number of arguments a usage line
    is printed and the process exits with status -1.

    For every FASTA gene:

    * no hits recorded for it          -> counted as a false negative;
    * some hit shares a lineage term
      with the gene's antibiotic       -> counted as a true positive;
    * hits exist but none match        -> printed and counted as a false
                                          positive.

    The gene's antibiotic is looked up in ``antibiotic_code`` using the
    second '_'-separated field of the gene name.  Hits are examined in
    descending order of their second element, stopping at the first match.
    The three counters are printed at the end.
    """
    arg_count = len(sys.argv)
    if arg_count < 3 or arg_count > 4:
        print(
            "Usage: functional_compare.py [-fsl] [-protein] results.scan seq.fasta"
        )
        sys.exit(-1)

    # NOTE(review): a flag with only one following path passes the length
    # check above and then fails on sys.argv[3] -- confirm intended usage.
    mode_flag = sys.argv[1]
    fsl = mode_flag == '-fsl'
    protein = mode_flag == '-protein'
    if fsl:
        fsl_file, fasta_file = sys.argv[2], sys.argv[3]
    elif protein:
        scan_file, fasta_file = sys.argv[2], sys.argv[3]
    else:
        scan_file, fasta_file = sys.argv[1], sys.argv[2]

    genes = readers.read_fasta(fasta_file)
    if fsl:
        target_to_id = read_fsl(fsl_file)
    else:
        _, target_to_id = readers.read_scan_results(0,
                                                    scan_file,
                                                    protein=protein)
        if protein:
            readers.change_RF_to_ARO(target_to_id)

    # We don't want to double count if we have seen the same gene
    seen_proteins = set()
    terms = ontology_common.parse_obo('new_combined.obo')

    # Lineage terms that never count as a shared drug class.
    ignored_terms = ['ARO:1000001', 'ARO:1000003', 'Unknown']

    false_positive = 0
    true_positive = 0
    false_negative = 0

    for gene in genes.keys():
        if protein:
            header_parts = gene.split('>')
            gene = header_parts[1].strip()
            name = header_parts[0].strip()
            if gene in seen_proteins:
                continue
            seen_proteins.add(gene)
        else:
            name = gene

        if name not in target_to_id:
            false_negative += 1
            continue

        antibiotic = gene.split('_')[1]
        functional_antibiotic = antibiotic_code[antibiotic]
        hits = target_to_id[name]
        hits.sort(key=lambda hit: hit[1], reverse=True)

        # NOTE(review): if `hits` is empty the loop body never runs, so
        # `identified`, `term_id`, `classes` and `drugs` keep whatever the
        # previous gene left behind (or are unbound on the first gene).
        for hit in hits:
            term_id = hit[0]
            # remove formatting used by hmm
            if 's' in term_id:
                term_id = term_id.replace('ARO', 'ARO:').split('s')[0]

            if ';' in term_id:
                # resfams can have a list of ids associated with a gene
                sub_ids = term_id.split(';')
                classes = [
                    terms[class_id]['name']
                    for sub_id in sub_ids
                    for class_id in ontology_common.get_class(sub_id, terms)
                ]
                drugs = set()
                for sub_id in sub_ids:
                    drugs |= ontology_common.get_resistance(
                        ontology_common.get_lineage(sub_id, terms), terms)
            else:
                classes = [
                    terms[class_id]['name']
                    for class_id in ontology_common.get_class(term_id, terms)
                ]
                drugs = ontology_common.get_resistance(
                    ontology_common.get_lineage(term_id, terms), terms)

            # A hit matches when any drug lineage term also appears in the
            # lineage of the gene's labelled antibiotic.
            identified = False
            for drug in drugs:
                for drug_term in ontology_common.get_lineage(drug, terms):
                    for expected_term in ontology_common.get_lineage(
                            functional_antibiotic[1], terms):
                        if (drug_term == expected_term
                                and drug_term not in ignored_terms):
                            identified = True

            if identified:
                true_positive += 1
                break

        if not identified:
            print(gene, functional_antibiotic, term_id, classes, drugs)
            false_positive += 1

    print('False negative: %d; False Positive:%d; True Positive:%d' %
          (false_negative, false_positive, true_positive))
def main():
    """Print, one per line, the lineage of the ID given as first argument.

    The ontology is parsed from 'new_combined.obo' and the lineage is
    obtained from ``ontology_common.get_lineage``.
    """
    ontology_terms = ontology_common.parse_obo('new_combined.obo')
    lineage = ontology_common.get_lineage(sys.argv[1], ontology_terms)
    for entry in lineage:
        print(entry)
# This file prints out all of the CARD entries that are children of the root entry
import ontology_common


def find_children(parent):
    """Print every term that lists ``parent`` as one of its ``is_a`` targets.

    Reads the module-level ``terms`` mapping.  For each ``is_a`` value, the
    first whitespace-separated token is compared with ``parent``; on a
    match the term is printed as ``'<id>': <name>,``.
    """
    for term_id, term in terms.items():
        if 'is_a' not in term:
            continue
        for relation in term['is_a']:
            target_id = relation.split()[0]
            if target_id == parent:
                print("'%s': %s," % (term_id, term['name']))


terms = ontology_common.parse_obo('../aro.obo')
find_children('ARO:3000000')
def main():
    """Parse 'new_combined.obo' and hand the result to ``categories``."""
    categories(ontology_common.parse_obo('new_combined.obo'))
def main():
    """Score scan/FSL hits against the antibiotic each FASTA gene is labelled with.

    Command line: an optional ``-fsl`` or ``-protein`` flag, then a results
    file and a FASTA file.  With the wrong number of arguments a usage line
    is printed and the process exits with status -1.

    For every FASTA gene:

    * no hits recorded for it          -> counted as a false negative;
    * some hit shares a lineage term
      with the gene's antibiotic       -> counted as a true positive;
    * hits exist but none match        -> printed and counted as a false
                                          positive.

    The gene's antibiotic is looked up in ``antibiotic_code`` using the
    second '_'-separated field of the gene name.  Hits are examined in
    descending order of their second element, stopping at the first match.
    The three counters are printed at the end.
    """
    arg_count = len(sys.argv)
    if arg_count < 3 or arg_count > 4:
        print("Usage: functional_compare.py [-fsl] [-protein] results.scan seq.fasta")
        sys.exit(-1)

    # NOTE(review): a flag with only one following path passes the length
    # check above and then fails on sys.argv[3] -- confirm intended usage.
    mode_flag = sys.argv[1]
    fsl = mode_flag == '-fsl'
    protein = mode_flag == '-protein'
    if fsl:
        fsl_file, fasta_file = sys.argv[2], sys.argv[3]
    elif protein:
        scan_file, fasta_file = sys.argv[2], sys.argv[3]
    else:
        scan_file, fasta_file = sys.argv[1], sys.argv[2]

    genes = readers.read_fasta(fasta_file)
    if fsl:
        target_to_id = read_fsl(fsl_file)
    else:
        _, target_to_id = readers.read_scan_results(0,
                                                    scan_file,
                                                    protein=protein)
        if protein:
            readers.change_RF_to_ARO(target_to_id)

    # We don't want to double count if we have seen the same gene
    seen_proteins = set()
    terms = ontology_common.parse_obo('new_combined.obo')

    # Lineage terms that never count as a shared drug class.
    ignored_terms = ['ARO:1000001', 'ARO:1000003', 'Unknown']

    false_positive = 0
    true_positive = 0
    false_negative = 0

    for gene in genes.keys():
        if protein:
            header_parts = gene.split('>')
            gene = header_parts[1].strip()
            name = header_parts[0].strip()
            if gene in seen_proteins:
                continue
            seen_proteins.add(gene)
        else:
            name = gene

        if name not in target_to_id:
            false_negative += 1
            continue

        antibiotic = gene.split('_')[1]
        functional_antibiotic = antibiotic_code[antibiotic]
        hits = target_to_id[name]
        hits.sort(key=lambda hit: hit[1], reverse=True)

        # NOTE(review): if `hits` is empty the loop body never runs, so
        # `identified`, `term_id`, `classes` and `drugs` keep whatever the
        # previous gene left behind (or are unbound on the first gene).
        for hit in hits:
            term_id = hit[0]
            # remove formatting used by hmm
            if 's' in term_id:
                term_id = term_id.replace('ARO', 'ARO:').split('s')[0]

            if ';' in term_id:
                # resfams can have a list of ids associated with a gene
                sub_ids = term_id.split(';')
                classes = [
                    terms[class_id]['name']
                    for sub_id in sub_ids
                    for class_id in ontology_common.get_class(sub_id, terms)
                ]
                drugs = set()
                for sub_id in sub_ids:
                    drugs |= ontology_common.get_resistance(
                        ontology_common.get_lineage(sub_id, terms), terms)
            else:
                classes = [
                    terms[class_id]['name']
                    for class_id in ontology_common.get_class(term_id, terms)
                ]
                drugs = ontology_common.get_resistance(
                    ontology_common.get_lineage(term_id, terms), terms)

            # A hit matches when any drug lineage term also appears in the
            # lineage of the gene's labelled antibiotic.
            identified = False
            for drug in drugs:
                for drug_term in ontology_common.get_lineage(drug, terms):
                    for expected_term in ontology_common.get_lineage(
                            functional_antibiotic[1], terms):
                        if (drug_term == expected_term
                                and drug_term not in ignored_terms):
                            identified = True

            if identified:
                true_positive += 1
                break

        if not identified:
            print(gene, functional_antibiotic, term_id, classes, drugs)
            false_positive += 1

    print('False negative: %d; False Positive:%d; True Positive:%d' %
          (false_negative, false_positive, true_positive))