Esempio n. 1
0
def read_fsl(filename):
    """Read an FSL match file and count baits per group of their best gene.

    For each bait (column 13) keep only the highest-scoring match
    (score = column 0, gene = column 9), then tally baits by the group of
    that gene; genes missing from grouping.csv count under 'unknown'.
    Returns a defaultdict(int) of group -> bait count.
    """
    groups = readers.read_grouping('grouping.csv',
                                   short=True,
                                   map_name=True,
                                   strip_colon=False)
    bait_to_match = {}
    with open(filename) as f:
        lines = f.readlines()
    # First 5 lines are header material from the FSL tool; skip them.
    # (Removed an unused `skipped_matches` counter left over from a sibling
    # variant of this function.)
    for line in lines[5:]:
        tokens = line.strip('\n').split('\t')
        score = int(tokens[0])
        gene = tokens[9]
        bait = tokens[13]
        # Keep only the best-scoring (score, gene) pair seen for this bait.
        if bait not in bait_to_match or score > bait_to_match[bait][0]:
            bait_to_match[bait] = (score, gene)
    total = collections.defaultdict(int)
    for _score, gene in bait_to_match.values():
        total[groups[gene] if gene in groups else 'unknown'] += 1
    return total
Esempio n. 2
0
def main():
    """Tally bait counts per ontology class and print resistance-gene
    classes and mechanism classes as two separate reports."""
    if len(sys.argv) != 2:
        print("Usage: bait_count.py master_amr_parsed.tsv")
        sys.exit(-1)
    name_to_count = read_count(sys.argv[1])
    groups = readers.read_grouping('grouping.csv', short=True, map_name=True, strip_colon=False)
    terms = ontology_common.parse_obo('new_combined.obo')
    baits_for_class = collections.defaultdict(int)
    # Roll each gene's count up into every ontology class its group belongs to.
    for gene, count in name_to_count.items():
        for cls in ontology_common.get_class(groups[gene], terms):
            baits_for_class[cls] += count

    def report(keep, label):
        # Print every class whose name passes `keep`, then the subtotal.
        subtotal = 0
        for term_id, count in baits_for_class.items():
            term_name = terms[term_id]['name'][0]
            if keep(term_name):
                print(term_name, count)
                subtotal += count
        print(label, subtotal)

    print()
    report(lambda term_name: 'resistance gene' in term_name,
           "Total counts for gene class ")
    print()
    report(lambda term_name: 'resistance gene' not in term_name,
           "Total counts for mechanism ")
Esempio n. 3
0
def main():
    """Recluster grouping.csv using cd-hit cluster output.

    Usage: recluster.py grouping.csv cdhit output.csv
    Reads each group's .clstr file from the cdhit directory, assigns every
    member a new 'ARO:<id>s<cluster>' id, and rewrites the grouping CSV
    (output.csv) with the new ids.
    """
    if len(sys.argv) != 4:
        print("Usage: recluster.py grouping.csv cdhit output.csv")
        # Exit nonzero on a usage error (was sys.exit(0)), consistent with
        # the other command-line tools in this project.
        sys.exit(1)

    # NOTE(review): presumably matches cd-hit's sequence-name truncation
    # length — confirm against the cd-hit run that produced the .clstr files.
    max_short_name = 19
    id_to_name = readers.read_grouping(sys.argv[1], short=True)
    new_name_to_id = {}
    for group_id in id_to_name:  # renamed from `id` to avoid shadowing the builtin
        try:
            clstr_id_to_name = readers.read_cluster("%s/%s.clstr" %
                                                    (sys.argv[2], group_id))
        except IOError:
            # work around bug in cdhit where it cannot deal with one sequence
            clstr_id_to_name = {0: ['ENA|pgpA/ltpgpA|CAA']}
        for clstr_id, names in clstr_id_to_name.items():
            new_id = 'ARO:%ss%s' % (group_id[3:], clstr_id)
            for name in names:
                new_name_to_id[name] = new_id

    with open(sys.argv[1]) as f:
        lines = f.readlines()
    with open(sys.argv[3], 'w+') as new_grouping:
        for line in lines:
            line = line.rstrip('\n')
            name, _old_id = line.rsplit(',', 1)
            short_name = name.strip('"').strip()
            # Names may carry a description after the first space; keep only
            # the first token, then truncate to the cd-hit name length.
            if ' ' in short_name:
                short_name = short_name.split()[0]
            short_name = short_name[:max_short_name]
            new_grouping.write('%s,%s\n' % (name, new_name_to_id[short_name]))
Esempio n. 4
0
def main():
    """Rewrite grouping.csv ids using cd-hit sub-cluster assignments.

    Usage: recluster.py grouping.csv cdhit output.csv
    Every sequence name is mapped to a new 'ARO:<id>s<cluster>' id taken
    from its group's .clstr file, and the grouping CSV is rewritten.
    """
    if len(sys.argv) != 4:
        print("Usage: recluster.py grouping.csv cdhit output.csv")
        # Bug fix: a usage error should exit nonzero (was sys.exit(0)).
        sys.exit(1)

    id_to_name = readers.read_grouping(sys.argv[1], short=True)
    new_name_to_id = {}
    for group_id in id_to_name:  # avoid shadowing the builtin `id`
        try:
            clstr_id_to_name = readers.read_cluster("%s/%s.clstr" % (sys.argv[2], group_id))
        except IOError:
            # work around bug in cdhit where it cannot deal with one sequence
            clstr_id_to_name = {0: ['ENA|pgpA/ltpgpA|CAA']}
        for clstr_id, names in clstr_id_to_name.items():
            new_id = 'ARO:%ss%s' % (group_id[3:], clstr_id)
            for name in names:
                new_name_to_id[name] = new_id

    with open(sys.argv[1]) as f:
        lines = f.readlines()
    with open(sys.argv[3], 'w+') as new_grouping:
        for line in lines:
            line = line.rstrip('\n')
            name, _old_id = line.rsplit(',', 1)
            short_name = name.strip('"').strip()
            # Keep only the first whitespace-separated token of the name.
            short_name = short_name.split()[0] if ' ' in short_name else short_name
            if len(short_name) > 19:
                # NOTE(review): 19 presumably mirrors cd-hit's name
                # truncation — confirm.
                short_name = short_name[:19]
            new_grouping.write('%s,%s\n' % (name, new_name_to_id[short_name]))
def read_fsl(filename):
    """Parse an FSL match file into {query: [(group, target), ...]}.

    Skips the single header line; each tab-separated row maps the query
    (column 0) to the group of column 1 (via grouping.csv) paired with the
    target in column 2.  Raises KeyError if column 1 is not in grouping.csv.
    """
    groups = readers.read_grouping('grouping.csv', short=True, map_name=True, strip_colon=False)
    query_to_target = {}
    with open(filename) as f:
        lines = f.readlines()
    for line in lines[1:]:  # first line is a header
        tokens = line.strip('\n').split('\t')
        group = groups[tokens[1]]
        # setdefault replaces the manual "if key present, append, else create"
        # dance with the idiomatic one-liner.
        query_to_target.setdefault(tokens[0], []).append((group, tokens[2]))
    return query_to_target
def read_fsl(filename):
    """Load an FSL match file as a mapping from query name to the list of
    (group, target) pairs recorded for it.  The first line is a header."""
    groups = readers.read_grouping('grouping.csv',
                                   short=True,
                                   map_name=True,
                                   strip_colon=False)
    matches = {}
    with open(filename) as handle:
        rows = handle.readlines()
    for row in rows[1:]:
        fields = row.strip('\n').split('\t')
        query, gene, target = fields[0], fields[1], fields[2]
        matches.setdefault(query, []).append((groups[gene], target))
    return matches
def read_fsl(filename):
    """Parse an FSL match file, dropping matches scoring below 40% of the
    full score (120), and return {gene: [(group, bait), ...]}.

    Genes absent from grouping.csv fall into the "Unknown" group.  Prints
    how many matches the score filter skipped.
    """
    groups = readers.read_grouping('grouping.csv', short=True, map_name=True, strip_colon=False)
    # Minimum acceptable score: 40% of the maximum score of 120.
    min_score = 120 * .4
    query_to_target = {}
    with open(filename) as f:
        lines = f.readlines()
    skipped_matches = 0
    for line in lines[5:]:  # first 5 lines are header material
        tokens = line.strip('\n').split('\t')
        if int(tokens[0]) < min_score:
            skipped_matches += 1
            continue
        gene = tokens[9]
        group = groups[gene] if gene in groups else "Unknown"
        query_to_target.setdefault(gene, []).append((group, tokens[13]))
    # Bug fix: message read "lower that 40%".
    print("matches lower than 40%", skipped_matches)
    return query_to_target
def read_fsl(filename):
    """Return counts of baits per group, using each bait's best-scoring gene.

    For every bait (column 13) keep only the match with the highest score
    (column 0); then count baits by the group of that gene (column 9), with
    genes missing from grouping.csv tallied under 'unknown'.
    """
    groups = readers.read_grouping('grouping.csv', short=True, map_name=True, strip_colon=False)
    best_match = {}
    with open(filename) as handle:
        rows = handle.readlines()
    # First 5 rows are header material.  (Removed an unused
    # `skipped_matches` counter copied from a filtering variant.)
    for row in rows[5:]:
        fields = row.strip('\n').split('\t')
        score, gene, bait = int(fields[0]), fields[9], fields[13]
        current = best_match.get(bait)
        if current is None or score > current[0]:
            best_match[bait] = (score, gene)
    total = collections.defaultdict(int)
    for _score, gene in best_match.values():
        total[groups[gene] if gene in groups else 'unknown'] += 1
    return total
def read_fsl(filename):
    """Read an FSL match file, filter out low-scoring matches, and return a
    mapping {gene: [(group, bait), ...]}.

    Matches scoring under 40% of the maximum (120) are skipped and counted;
    genes not present in grouping.csv go into the "Unknown" group.
    """
    groups = readers.read_grouping('grouping.csv',
                                   short=True,
                                   map_name=True,
                                   strip_colon=False)
    score_cutoff = 120 * .4  # 40% of the full score of 120
    query_to_target = {}
    with open(filename) as f:
        lines = f.readlines()
    skipped_matches = 0
    for line in lines[5:]:  # 5-line header
        tokens = line.strip('\n').split('\t')
        if int(tokens[0]) < score_cutoff:
            skipped_matches += 1
            continue
        gene, bait = tokens[9], tokens[13]
        group = groups[gene] if gene in groups else "Unknown"
        query_to_target.setdefault(gene, []).append((group, bait))
    # Bug fix: message read "lower that 40%".
    print("matches lower than 40%", skipped_matches)
    return query_to_target
def main():
    """Score HMM scan results for simulated reads against the expected
    grouping and report true positives, false positives, and misses.

    Usage: art_test.py grouping.csv scanresults.scan art_out.maf scores.csv genemark.f
    Writes per-HMM score thresholds to scores.csv and misclassifications to
    mismatch.csv, then graphs the counts.
    """
    if len(sys.argv) != 6:
        print("Usage: art_test.py grouping.csv scanresults.scan art_out.maf scores.csv genemark.f")
        sys.exit(0)

    genemark_to_name = read_genemark(sys.argv[5])

    id_to_name = readers.read_grouping(sys.argv[1])
    thresh = []
    fp = []
    tp = []
    nf = []
    contig_to_read = readers.read_maf(sys.argv[3])
    # range(0, 80, 100) currently yields the single threshold 0; the step
    # leaves room for sweeping thresholds later.
    for threshold in range(0, 80, 100):
        mismatch = []
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(threshold, sys.argv[2], protein=True)
        readers.change_RF_to_ARO(scan_name_to_id)
        scan_name_to_id = substitute_read_name(scan_name_to_id, contig_to_read, genemark_to_name)
        found_score = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        total = 0
        for id in sorted(id_to_name.keys()):
            # Reduce a sub-cluster id like 'ARO:1234s5' to canonical 'ARO:1234'.
            canonical_id = id.split('s')[0]
            canonical_id = 'ARO:' + canonical_id.split('O')[1]
            for name in id_to_name[id]:
                total += 1
                ids = search(name, id, scan_name_to_id)
                if ids:
                    if ids[0][0] == canonical_id:
                        true_positive += 1
                        found_score.append((ids[0][1], id, name))
                    else:
                        # A ';'-joined hit containing the canonical id counts too.
                        if ';' in ids[0][0] and canonical_id in ids[0][0].split(';'):
                            true_positive += 1
                            found_score.append((ids[0][1], id, name))
                        else:
                            mismatch.append((name, canonical_id, ids[0][0], ids[0][2]))
                            # Record the first lower-ranked correct hit, if any.
                            for i in range(len(ids)):
                                if ids[i][0] == canonical_id:
                                    found_score.append((ids[i][1], id, name))
                                    break
                            false_positive += 1
                else:
                    not_found += 1

        thresh.append(threshold)
        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)
        print("misclassified %d notfound %d true positive %d of total %d" % (false_positive,
                                                                             not_found,
                                                                             true_positive,
                                                                             total))

        print(threshold, true_positive, false_positive, not_found, total)
    x = np.array(fp)  # false_positive_rate
    y = np.array(tp)  # true_positive_rate
    t = np.array(thresh)
    n = np.array(nf)

    found_score.sort()
    with open(sys.argv[4], 'w') as f:
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        f.write('Gene,Expected ID,Found ID\n')
        # Bug fix: rows were written as (expected, found, gene), which did
        # not match the header order; write gene, expected, found.
        for gene_name, expected_id, found_id, _score in mismatch:
            f.write('%s,%s,%s\n' % (gene_name, expected_id, found_id))

    test.graph(x, y, t, n)
Esempio n. 11
0
    output = ''.join([gen_random_sequence(prefix_len),
                      seq,
                      gen_random_sequence(suffix_len)])
    return "> %d:%d?%s?%s\n%s" %(prefix_len, len(seq) + prefix_len, target, name, output)


def unit_test():
    """Smoke test for gen_test: the embedded pattern must appear exactly at
    the start:end offsets encoded in the generated record's header."""
    pattern = 'AAAAAA'
    record = gen_test(pattern, 'adr001', 'test')
    header, sequence = record.split('\n')[0], record.split('\n')[1]
    offset, _target, _name = header[1:].split('?', 2)
    begin, finish = (int(part) for part in offset.split(':'))
    extracted = sequence[begin:finish]
    # Report success, or show what actually sat at the claimed offsets.
    print("Success" if extracted == pattern else extracted)

#unit_test()
id_to_name = readers.read_grouping()
name_to_seq = readers.read_fasta()
# Emit one test FASTA file per group id, one generated record per member.
for group_id, member_names in id_to_name.items():
    with open('../test/%s.fa' % group_id, 'w+') as fasta:
        for member in member_names:
            record = gen_test(name_to_seq[member], group_id, member)
            fasta.write(record + '\n')
Esempio n. 12
0
def main():
    """Evaluate HMM scan results for simulated reads against the expected
    grouping ids and report true positives, false positives, and misses.

    Usage: art_test.py grouping.csv scanresults.scan art_out.maf scores.csv
    Writes per-HMM thresholds to scores.csv, misclassifications to
    mismatch.csv, and graphs the totals.
    """
    if len(sys.argv) != 5:
        print(
            "Usage: art_test.py grouping.csv scanresults.scan art_out.maf scores.csv"
        )
        sys.exit(0)

    id_to_name = readers.read_grouping(sys.argv[1])
    thresh = []
    fp = []
    tp = []
    nf = []
    contig_to_read = readers.read_maf(sys.argv[3])
    # range(0, 81, 100) currently yields the single threshold 0.
    for threshold in range(0, 81, 100):
        mismatch = []
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(
            threshold, sys.argv[2])
        scan_name_to_id = substitute_read_name(scan_name_to_id, contig_to_read)
        found_score = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        total = 0
        for id in sorted(id_to_name.keys()):
            for name in id_to_name[id]:
                total += 1
                ids = search(name, id, scan_name_to_id)
                if ids:
                    # Hit counts if ids match exactly or share the same
                    # pre-'s' (canonical) prefix.
                    if ids[0][0] == id or ids[0][0].split('s')[0] == id.split(
                            's')[0]:
                        true_positive += 1
                        found_score.append((ids[0][1], id, name))
                    else:
                        canonical_id = 'ARO:' + id.split('s')[0].split('O')[1]
                        canonical_id2 = 'ARO:' + ids[0][0].split('s')[0].split(
                            'O')[1]
                        mismatch.append(
                            (name, canonical_id, canonical_id2, ids[0][2]))
                        # Record the first lower-ranked hit with the right id.
                        for i in range(len(ids)):
                            if ids[i][0] == id:
                                found_score.append((ids[i][1], id, name))
                                break
                        false_positive += 1
                else:
                    for k, v in contig_to_read.items():
                        if name in v:
                            print("Found read %s in %s" % (name, k))
                    print("Not Found: %s,%s" % (name, id))
                    not_found += 1

        thresh.append(threshold)
        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)
        print("misclassified %d notfound %d true positive %d of total %d" %
              (false_positive, not_found, true_positive, total))

        print(threshold, true_positive, false_positive, not_found, total)
    x = np.array(fp)  # false_positive_rate
    y = np.array(tp)  # true_positive_rate
    t = np.array(thresh)
    n = np.array(nf)

    found_score.sort()
    with open(sys.argv[4], 'w') as f:
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        f.write('Gene,Expected ID,Found ID\n')
        # Bug fix: columns were written as (expected, found, gene); write in
        # header order gene, expected, found.
        for gene_name, expected_id, found_id, _score in mismatch:
            f.write('%s,%s,%s\n' % (gene_name, expected_id, found_id))

    test.graph(x, y, t, n)
Esempio n. 13
0
def main():
    """Score HMM scan results against the expected grouping ids and report
    true positives, false positives, and misses.

    Usage: test.py grouping.csv scanresults.scan threshold.csv
    Writes per-HMM thresholds to threshold.csv (sys.argv[3]) and
    misclassifications to mismatch.csv.
    """
    if len(sys.argv) != 4:
        print("Usage: test.py grouping.csv scanresults.scan threshold.csv")
        sys.exit(0)

    id_to_name = readers.read_grouping(sys.argv[1], short=True)
    thresh = []
    fp = []
    tp = []
    nf = []
    # NOTE(review): range(0, 10, 100) yields only threshold 0.  found_score
    # and mismatch are re-created each iteration, so if the sweep were ever
    # widened only the last iteration's data would be written below.
    for threshold in range(0, 10, 100):
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(threshold, sys.argv[2])
        found_score = []
        mismatch = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        total = 0
        for id in id_to_name.keys():
            for name in id_to_name[id]:
                total += 1
                if name not in scan_name_to_id:
                    print("Not found %s,%s" % (name, id))
                    not_found += 1
                    continue
                ids = scan_name_to_id[name]
                # Sort hits best-scoring first (score is element 1).
                ids.sort(key=lambda l: l[1], reverse=True)
                # A hit counts if ids match exactly or share the same pre-'s'
                # (canonical) prefix.
                if ids[0][0] == id or ids[0][0].split('s')[0] == id.split('s')[0]:
                    true_positive += 1
                    found_score.append((ids[0][1], id, name))
                else:
                    print("False Positive: %s, %s" % (name, id))
                    if len(ids) > 1 and ids[1][0] == id:
                        mismatch.append((name, id, ids[0][0]))
                    else:
                        # Walk lower-ranked hits; record the first correct one,
                        # logging each wrong id along the way as a mismatch.
                        for i in range(len(ids)):
                            print("Attempt %d: %s %f" % (i, ids[i][0], ids[i][1]))
                            if ids[i][0] == id:
                                found_score.append((ids[i][1], id, name))
                                break
                            else:
                                mismatch.append((name, id, ids[i][0]))

                    false_positive += 1


        thresh.append(threshold)
        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)

    x = np.array(fp)# false_positive_rate
    y = np.array(tp)# true_positive_rate
    t = np.array(thresh)
    n = np.array(nf)
    total = x + y + n

    found_score.sort()
    with open(sys.argv[3], 'w') as f:
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        f.write('Gene,Expected ID,Found ID\n')
        for a, b, c in mismatch:
            f.write('%s,%s,%s\n' % (a, b, c))
    # NOTE(review): x, n, y, total are length-1 numpy arrays here; '%d' on a
    # size-1 array works but is deprecated in newer numpy — confirm intent.
    print("misclassified %d notfound %d true positive %d of total %d" % (x, n, y, total))
Esempio n. 14
0
import readers
import sys

# Load the inputs named on the command line: a FASTA file and a grouping CSV.
name_to_sequence = readers.read_fasta(sys.argv[1])
id_to_name = readers.read_grouping(sys.argv[2])

# Write one FASTA file per group id into the output location (argv[3]).
readers.create_fasta_file_for_each_id(
    name_to_sequence, id_to_name, sys.argv[3])