import collections

import readers


def read_fsl(filename):
    """Count, per group, the baits whose best-scoring match lands in that group."""
    groups = readers.read_grouping('grouping.csv', short=True, map_name=True,
                                   strip_colon=False)
    bait_to_match = {}
    with open(filename) as f:
        lines = f.readlines()
    # Skip the 5-line header, then keep the best-scoring (score, gene) per bait.
    for line in lines[5:]:
        tokens = line.strip('\n').split('\t')
        score = int(tokens[0])
        gene = tokens[9]
        bait = tokens[13]
        if bait not in bait_to_match or score > bait_to_match[bait][0]:
            bait_to_match[bait] = (score, gene)
    total = collections.defaultdict(int)
    for s, g in bait_to_match.values():
        if g in groups:
            total[groups[g]] += 1
        else:
            total['unknown'] += 1
    return total
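# Example usage (the path is hypothetical): tally baits by the group of their
# best-scoring match.
#   totals = read_fsl('baits.fsl')
#   for group, count in sorted(totals.items()):
#       print(group, count)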
import collections
import sys

import ontology_common
import readers


def main():
    if len(sys.argv) != 2:
        print("Usage: bait_count.py master_amr_parsed.tsv")
        sys.exit(-1)
    count_file = sys.argv[1]
    # read_count is assumed to be defined elsewhere in this module.
    name_to_count = read_count(count_file)
    groups = readers.read_grouping('grouping.csv', short=True, map_name=True,
                                   strip_colon=False)
    terms = ontology_common.parse_obo('new_combined.obo')
    # Roll the per-gene counts up to their ontology classes.
    baits_for_class = collections.defaultdict(int)
    for gene, count in name_to_count.items():
        group = groups[gene]
        for cl in ontology_common.get_class(group, terms):
            baits_for_class[cl] += count
    print()
    total_baits = 0
    for k, v in baits_for_class.items():
        if 'resistance gene' in terms[k]['name'][0]:
            print(terms[k]['name'][0], v)
            total_baits += v
    print("Total counts for gene class ", total_baits)
    print()
    total_baits = 0
    for k, v in baits_for_class.items():
        if 'resistance gene' not in terms[k]['name'][0]:
            print(terms[k]['name'][0], v)
            total_baits += v
    print("Total counts for mechanism ", total_baits)
import sys

import readers


def main():
    if len(sys.argv) != 4:
        print("Usage: recluster.py grouping.csv cdhit output.csv")
        sys.exit(0)
    id_to_name = readers.read_grouping(sys.argv[1], short=True)
    new_name_to_id = {}
    for id, names in id_to_name.items():
        try:
            clstr_id_to_name = readers.read_cluster("%s/%s.clstr" % (sys.argv[2], id))
        except IOError:
            # Work around a cdhit bug: it writes no .clstr file for a
            # single-sequence input, so fake one cluster with one member.
            clstr_id_to_name = {0: ['ENA|pgpA/ltpgpA|CAA']}
        for clstr_id, clstr_names in clstr_id_to_name.items():
            new_id = 'ARO:%ss%s' % (id[3:], clstr_id)
            for name in clstr_names:
                new_name_to_id[name] = new_id
    with open(sys.argv[1]) as f:
        lines = f.readlines()
    with open(sys.argv[3], 'w+') as new_grouping:
        for line in lines:
            line = line.rstrip('\n')
            name, id = line.rsplit(',', 1)
            # Shorten the name the same way read_grouping(short=True) does.
            short_name = name.strip('"').strip()
            short_name = short_name.split()[0] if ' ' in short_name else short_name
            if len(short_name) > 19:
                short_name = short_name[:19]
            new_grouping.write('%s,%s\n' % (name, new_name_to_id[short_name]))
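# Example of the remapping (assuming ids of the form 'ARO3000015', as produced
# by read_grouping with short=True): members of cluster 0 in ARO3000015.clstr
# get the new id 'ARO:3000015s0', members of cluster 1 get 'ARO:3000015s1'.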
import readers


def read_fsl(filename):
    """Map each query to the list of (group, target) pairs it matched."""
    groups = readers.read_grouping('grouping.csv', short=True, map_name=True,
                                   strip_colon=False)
    query_to_target = {}
    with open(filename) as f:
        lines = f.readlines()
    # Skip the single header line.
    for line in lines[1:]:
        tokens = line.strip('\n').split('\t')
        group = groups[tokens[1]]
        query_to_target.setdefault(tokens[0], []).append((group, tokens[2]))
    return query_to_target
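# Example usage (the path is hypothetical):
#   query_to_target = read_fsl('matches.fsl')
#   # query_to_target['some_query'] -> [(group, target), ...]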
import readers


def read_fsl(filename):
    groups = readers.read_grouping('grouping.csv', short=True, map_name=True,
                                   strip_colon=False)
    query_to_target = {}
    with open(filename) as f:
        lines = f.readlines()
    skipped_matches = 0
    for line in lines[5:]:
        tokens = line.strip('\n').split('\t')
        # Skip matches scoring below 48, i.e. 40% of an (assumed) 120 bp read.
        if int(tokens[0]) < 120 * .4:
            skipped_matches += 1
            continue
        group = groups.get(tokens[9], "Unknown")
        query_to_target.setdefault(tokens[9], []).append((group, tokens[13]))
    print("matches lower than 40%:", skipped_matches)
    return query_to_target
import sys

import numpy as np

import readers
import test


def main():
    if len(sys.argv) != 6:
        print("Usage: art_test.py grouping.csv scanresults.scan art_out.maf "
              "scores.csv genemark.f")
        sys.exit(0)
    # read_genemark, substitute_read_name and search are assumed to be defined
    # elsewhere in this module.
    genemark_to_name = read_genemark(sys.argv[5])
    id_to_name = readers.read_grouping(sys.argv[1])
    thresh = []
    fp = []
    tp = []
    nf = []
    contig_to_read = readers.read_maf(sys.argv[3])
    # Note: with a step of 100 this loop only evaluates threshold 0.
    for threshold in range(0, 80, 100):
        mismatch = []
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(
            threshold, sys.argv[2], protein=True)
        readers.change_RF_to_ARO(scan_name_to_id)
        scan_name_to_id = substitute_read_name(scan_name_to_id, contig_to_read,
                                               genemark_to_name)
        found_score = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        total = 0
        for id in sorted(id_to_name.keys()):
            # 'ARO3000015s2' -> canonical 'ARO:3000015'
            canonical_id = id.split('s')[0]
            canonical_id = 'ARO:' + canonical_id.split('O')[1]
            for name in id_to_name[id]:
                total += 1
                ids = search(name, id, scan_name_to_id)
                if ids:
                    # A hit is true if the best match equals the canonical id,
                    # or lists it among a ';'-separated set of ids.
                    if ids[0][0] == canonical_id or (
                            ';' in ids[0][0] and
                            canonical_id in ids[0][0].split(';')):
                        true_positive += 1
                        found_score.append((ids[0][1], id, name))
                    else:
                        mismatch.append((name, canonical_id, ids[0][0], ids[0][2]))
                        for i in range(len(ids)):
                            if ids[i][0] == canonical_id:
                                found_score.append((ids[i][1], id, name))
                                break
                        false_positive += 1
                else:
                    not_found += 1
        thresh.append(threshold)
        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)
        print("misclassified %d notfound %d true positive %d of total %d" %
              (false_positive, not_found, true_positive, total))
        print(threshold, true_positive, false_positive, not_found, total)
    x = np.array(fp)  # false positive counts per threshold
    y = np.array(tp)  # true positive counts per threshold
    t = np.array(thresh)
    n = np.array(nf)
    found_score.sort()
    with open(sys.argv[4], 'w') as f:
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        # Write columns in the order the header promises:
        # gene name, expected id, found id.
        f.write('Gene,Expected ID,Found ID\n')
        for a, b, c, d in mismatch:
            f.write('%s,%s,%s\n' % (a, b, c))
    test.graph(x, y, t, n)
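# The gen_test code below calls gen_random_sequence, which is not included in
# this excerpt. A minimal sketch of it, assuming uniformly random nucleotides:
import random


def gen_random_sequence(length):
    # Return a string of `length` random bases.
    return ''.join(random.choice('ACGT') for _ in range(length))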
import readers


def gen_test(seq, target, name):
    """Embed seq between random flanks; record its span in the FASTA header."""
    # Assumption: the flank lengths are random (the start of the original
    # function is not shown); gen_random_sequence(n) returns n random bases.
    prefix_len = random.randint(0, 100)
    suffix_len = random.randint(0, 100)
    output = ''.join([gen_random_sequence(prefix_len), seq,
                      gen_random_sequence(suffix_len)])
    return "> %d:%d?%s?%s\n%s" % (prefix_len, len(seq) + prefix_len, target,
                                  name, output)


def unit_test():
    # Check that the span encoded in the header recovers the embedded pattern.
    pattern = 'AAAAAA'
    value = gen_test(pattern, 'adr001', 'test')
    lines = value.split('\n')
    offset, target, name = lines[0][1:].split('?', 2)
    start, end = offset.split(':')
    start = int(start)
    end = int(end)
    if lines[1][start:end] == pattern:
        print("Success")
    else:
        print(lines[1][start:end])


#unit_test()

# Write one test FASTA file per group id, embedding each member sequence.
id_to_name = readers.read_grouping()
name_to_seq = readers.read_fasta()
for id, names in id_to_name.items():
    with open('../test/%s.fa' % id, 'w+') as fasta:
        for name in names:
            seq = name_to_seq[name]
            value = gen_test(seq, id, name)
            fasta.write(value + '\n')
import sys

import numpy as np

import readers
import test


def main():
    if len(sys.argv) != 5:
        print("Usage: art_test.py grouping.csv scanresults.scan art_out.maf scores.csv")
        sys.exit(0)
    id_to_name = readers.read_grouping(sys.argv[1])
    thresh = []
    fp = []
    tp = []
    nf = []
    contig_to_read = readers.read_maf(sys.argv[3])
    # Note: with a step of 100 this loop only evaluates threshold 0.
    for threshold in range(0, 81, 100):
        mismatch = []
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(
            threshold, sys.argv[2])
        # substitute_read_name and search are assumed to be defined elsewhere
        # in this module.
        scan_name_to_id = substitute_read_name(scan_name_to_id, contig_to_read)
        found_score = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        total = 0
        for id in sorted(id_to_name.keys()):
            for name in id_to_name[id]:
                total += 1
                ids = search(name, id, scan_name_to_id)
                if ids:
                    # A hit counts as true if the best match agrees at the
                    # canonical (pre-'s' subcluster) level.
                    if ids[0][0] == id or ids[0][0].split('s')[0] == id.split('s')[0]:
                        true_positive += 1
                        found_score.append((ids[0][1], id, name))
                    else:
                        canonical_id = 'ARO:' + id.split('s')[0].split('O')[1]
                        canonical_id2 = 'ARO:' + ids[0][0].split('s')[0].split('O')[1]
                        mismatch.append((name, canonical_id, canonical_id2, ids[0][2]))
                        for i in range(len(ids)):
                            if ids[i][0] == id:
                                found_score.append((ids[i][1], id, name))
                                break
                        false_positive += 1
                else:
                    for k, v in contig_to_read.items():
                        if name in v:
                            print("Found read %s in %s" % (name, k))
                    print("Not Found: %s,%s" % (name, id))
                    not_found += 1
        thresh.append(threshold)
        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)
        print("misclassified %d notfound %d true positive %d of total %d" %
              (false_positive, not_found, true_positive, total))
        print(threshold, true_positive, false_positive, not_found, total)
    x = np.array(fp)  # false positive counts per threshold
    y = np.array(tp)  # true positive counts per threshold
    t = np.array(thresh)
    n = np.array(nf)
    found_score.sort()
    with open(sys.argv[4], 'w') as f:
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        # Write columns in the order the header promises:
        # gene name, expected id, found id.
        f.write('Gene,Expected ID,Found ID\n')
        for a, b, c, d in mismatch:
            f.write('%s,%s,%s\n' % (a, b, c))
    test.graph(x, y, t, n)
import sys

import numpy as np

import readers


def main():
    if len(sys.argv) != 4:
        print("Usage: test.py grouping.csv scanresults.scan threshold.csv")
        sys.exit(0)
    id_to_name = readers.read_grouping(sys.argv[1], short=True)
    thresh = []
    fp = []
    tp = []
    nf = []
    # Note: with a step of 100 this loop only evaluates threshold 0.
    for threshold in range(0, 10, 100):
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(
            threshold, sys.argv[2])
        found_score = []
        mismatch = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        total = 0
        for id in id_to_name.keys():
            for name in id_to_name[id]:
                total += 1
                if name not in scan_name_to_id:
                    print("Not found %s,%s" % (name, id))
                    not_found += 1
                    continue
                # Rank this gene's hits by score, best first.
                ids = scan_name_to_id[name]
                ids.sort(key=lambda l: l[1], reverse=True)
                if ids[0][0] == id or ids[0][0].split('s')[0] == id.split('s')[0]:
                    true_positive += 1
                    found_score.append((ids[0][1], id, name))
                else:
                    print("False Positive: %s, %s" % (name, id))
                    if len(ids) > 1 and ids[1][0] == id:
                        mismatch.append((name, id, ids[0][0]))
                    else:
                        for i in range(len(ids)):
                            print("Attempt %d: %s %f" % (i, ids[i][0], ids[i][1]))
                            if ids[i][0] == id:
                                found_score.append((ids[i][1], id, name))
                                break
                            else:
                                mismatch.append((name, id, ids[i][0]))
                    false_positive += 1
        thresh.append(threshold)
        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)
    x = np.array(fp)  # false positive counts per threshold
    y = np.array(tp)  # true positive counts per threshold
    t = np.array(thresh)
    n = np.array(nf)
    total = x + y + n
    found_score.sort()
    with open(sys.argv[3], 'w') as f:
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        f.write('Gene,Expected ID,Found ID\n')
        for a, b, c in mismatch:
            f.write('%s,%s,%s\n' % (a, b, c))
    # Report the counts for the last threshold evaluated (indexing avoids
    # passing numpy arrays to %d).
    print("misclassified %d notfound %d true positive %d of total %d" %
          (x[-1], n[-1], y[-1], total[-1]))
import sys

import readers

# read input files
name_to_sequence = readers.read_fasta(sys.argv[1])
id_to_name = readers.read_grouping(sys.argv[2])

# create fasta files for each id
readers.create_fasta_file_for_each_id(name_to_sequence, id_to_name, sys.argv[3])
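# Example invocation (the script name is an assumption; argv[3] is taken to be
# the output destination for the per-id FASTA files):
#   python split_fasta.py sequences.fa grouping.csv test_dir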