def main():
    """Print one ``key,count`` line per entry of the scan ID mapping.

    The scan results file named by ``sys.argv[1]`` is read through
    ``readers.read_scan_results`` with a fixed threshold of 150.  For every
    key of the first returned mapping the length of its value is printed,
    ordered by descending length (ties broken by descending key).
    """
    threshold = 150
    scan_id_to_name, _ = readers.read_scan_results(threshold, sys.argv[1])
    ranked = sorted(
        ((len(members), scan_id) for scan_id, members in scan_id_to_name.items()),
        reverse=True)
    for count, scan_id in ranked:
        print('%s,%d' % (scan_id, count))
Beispiel #2
0
def main():
    """Summarise how many names each scan ID collected.

    Reads the scan results given as the first command-line argument (fixed
    threshold 150) and writes ``<id>,<number of names>`` to stdout, largest
    group first; equal sizes are ordered by descending ID.
    """
    scan_id_to_name, _unused_name_to_id = readers.read_scan_results(
        150, sys.argv[1])

    sized = []
    for ident, names in scan_id_to_name.items():
        sized.append((len(names), ident))

    # Tuples compare by size first, then by ID, exactly as an in-place
    # reverse sort of the same list would.
    for size, ident in sorted(sized, reverse=True):
        print('%s,%d' % (ident, size))
def main():
    """Report true/false positive counts of scan hits against a labelled FASTA.

    Command line: ``functional_test.py [-protein] results.scan seq.fasta``.

    FASTA entry names containing ``True`` are counted as positives and names
    containing ``False`` as negatives.  Every target returned by
    ``readers.read_scan_results`` (threshold 0) is then classified by whether
    its name starts with ``True`` or ``False``.

    With ``-protein`` each FASTA name is split on ``>`` into a protein name
    and a DNA name; scan targets are translated to the DNA name and each DNA
    name is counted at most once.

    Prints the usage line and exits with status -1 when the number of
    arguments does not fit the selected mode.
    """
    usage = "Usage: functional_test.py [-protein] results.scan seq.fasta"
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print(usage)
        sys.exit(-1)
    protein = sys.argv[1] == '-protein'
    if protein and len(sys.argv) != 4:
        # The flag shifts both file arguments one slot to the right; without
        # this check the sys.argv[3] lookup below raised IndexError.
        print(usage)
        sys.exit(-1)

    if protein:
        scan_file = sys.argv[2]
        fasta_file = sys.argv[3]
        # place to store mapping of protein name to dna name for genes
        protein_map = {}
    else:
        scan_file = sys.argv[1]
        fasta_file = sys.argv[2]

    genes = readers.read_fasta(fasta_file)
    positive_count = 0
    negative_count = 0

    for gene in genes.keys():
        if 'True' in gene:
            positive_count += 1
        elif 'False' in gene:
            negative_count += 1
        if protein:
            names = gene.split('>')
            protein_map[names[0].strip()] = names[1].strip()

    id_to_target, target_to_id = readers.read_scan_results(0,
                                                           scan_file,
                                                           protein=protein)
    false_positive = 0
    true_positive = 0
    # We don't want to double count if we have seen the same gene
    already_seen_protein = set()
    for key in target_to_id.keys():
        if protein:
            key = protein_map[key]
            if key in already_seen_protein:
                continue
            already_seen_protein.add(key)
        if key.startswith('False'):
            false_positive += 1
        elif key.startswith('True'):
            true_positive += 1

    def rate(hits, population):
        # A FASTA without any positive (or negative) entries used to abort
        # the report with ZeroDivisionError; report a rate of 0.0 instead.
        return float(hits) / population if population else 0.0

    print("True Positive: %d/%d(%f); False Positive: %d/%d(%f)" %
          (true_positive, positive_count,
           rate(true_positive, positive_count), false_positive,
           negative_count, rate(false_positive, negative_count)))
def main():
    """Score scan hits for reads against the expected grouping and plot them.

    Command line::

        art_test.py grouping.csv scanresults.scan art_out.maf scores.csv genemark.f

    For every name listed under a group in ``grouping.csv`` the hits returned
    by ``search`` are inspected.  The best hit counts as a true positive when
    its ID equals the group's canonical ``ARO:<n>`` ID or contains it in a
    ``;``-separated list; otherwise it is a false positive and is recorded as
    a mismatch.  Names without any hit are counted as not found.

    Outputs:
      * per-threshold counters on stdout,
      * ``scores.csv`` (``sys.argv[4]``) with ``<group id>,<score - 1>`` rows,
      * ``mismatch.csv`` in the working directory,
      * a graph via ``test.graph``.

    Prints the usage line and exits with status 0 when the argument count is
    not exactly five.
    """
    if len(sys.argv) != 6:
        print("Usage: art_test.py grouping.csv scanresults.scan art_out.maf scores.csv genemark.f")
        sys.exit(0)

    genemark_to_name = read_genemark(sys.argv[5])

    id_to_name = readers.read_grouping(sys.argv[1])
    thresh = []
    fp = []
    tp = []
    nf = []
    contig_to_read = readers.read_maf(sys.argv[3])
    # NOTE: range(0, 80, 100) yields only 0, so a single threshold is
    # evaluated; found_score and mismatch written below belong to the last
    # (here: only) threshold.
    for threshold in range(0, 80, 100):
        mismatch = []
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(threshold, sys.argv[2], protein=True)
        readers.change_RF_to_ARO(scan_name_to_id)
        scan_name_to_id = substitute_read_name(scan_name_to_id, contig_to_read, genemark_to_name)
        found_score = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        total = 0
        for group_id in sorted(id_to_name.keys()):
            # Keep the part before the first 's' and turn 'ARO<n>' into 'ARO:<n>'.
            canonical_id = 'ARO:' + group_id.split('s')[0].split('O')[1]
            for name in id_to_name[group_id]:
                total += 1
                ids = search(name, group_id, scan_name_to_id)
                if not ids:
                    not_found += 1
                    continue

                best_id = ids[0][0]
                if best_id == canonical_id or canonical_id in best_id.split(';'):
                    true_positive += 1
                    found_score.append((ids[0][1], group_id, name))
                    continue

                mismatch.append((name, canonical_id, best_id, ids[0][2]))
                # Still record the score of the first hit that carries the
                # expected ID, if any.
                for candidate in ids:
                    if candidate[0] == canonical_id:
                        found_score.append((candidate[1], group_id, name))
                        break
                false_positive += 1

        thresh.append(threshold)
        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)
        print("misclassified %d notfound %d true positive %d of total %d" % (false_positive,
                                                                             not_found,
                                                                             true_positive,
                                                                             total))

        print(threshold, true_positive, false_positive, not_found, total)
    x = np.array(fp)  # false positive counts per threshold
    y = np.array(tp)  # true positive counts per threshold
    t = np.array(thresh)
    n = np.array(nf)

    found_score.sort()
    with open(sys.argv[4], 'w') as f:
        # NOTE(review): the score is written minus one; the reason is not
        # visible in this function.
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        f.write('Gene,Expected ID,Found ID\n')
        # Rows follow the header order; they used to be written as
        # expected, found, gene, which put every value under the wrong column.
        for gene, expected_id, found_id, _ in mismatch:
            f.write('%s,%s,%s\n' % (gene, expected_id, found_id))

    test.graph(x, y, t, n)
def main():
    """Compare predicted resistance against the antibiotic encoded in gene names.

    Command line:
    ``functional_compare.py [-fsl] [-protein] results.scan seq.fasta``.
    Only ``sys.argv[1]`` is inspected for a flag, so at most one of ``-fsl``
    and ``-protein`` can be given.

    For each FASTA entry the hits of the matching target are visited from the
    highest to the lowest value of their second field.  A hit identifies the
    gene when a lineage of one of its resistance drugs shares a term (other
    than ``ARO:1000001``, ``ARO:1000003`` and ``Unknown``) with the lineage of
    the antibiotic looked up through ``antibiotic_code``.  Entries without a
    target are false negatives, entries with an identifying hit are true
    positives, all remaining ones are false positives and are printed.

    Prints the usage line and exits with status -1 when the number of
    arguments does not fit the selected mode.
    """
    usage = "Usage: functional_compare.py [-fsl] [-protein] results.scan seq.fasta"
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print(usage)
        sys.exit(-1)
    fsl = sys.argv[1] == '-fsl'
    protein = sys.argv[1] == '-protein'
    if (fsl or protein) and len(sys.argv) != 4:
        # A flag shifts both file arguments one slot to the right; without
        # this check the sys.argv[3] lookup below raised IndexError.
        print(usage)
        sys.exit(-1)

    if fsl:
        fsl_file = sys.argv[2]
        fasta_file = sys.argv[3]
    elif protein:
        scan_file = sys.argv[2]
        fasta_file = sys.argv[3]
    else:
        scan_file = sys.argv[1]
        fasta_file = sys.argv[2]
    genes = readers.read_fasta(fasta_file)
    if fsl:
        target_to_id = read_fsl(fsl_file)
    else:
        id_to_target, target_to_id = readers.read_scan_results(0,
                                                               scan_file,
                                                               protein=protein)
    # We don't want to double count if we have seen the same gene
    already_seen_protein = set()
    if protein:
        readers.change_RF_to_ARO(target_to_id)

    terms = ontology_common.parse_obo('new_combined.obo')
    # Terms that never count as a shared lineage element.
    generic_terms = ['ARO:1000001', 'ARO:1000003', 'Unknown']
    false_positive = 0
    true_positive = 0
    false_negative = 0
    for gene in genes.keys():
        if protein:
            names = gene.split('>')
            gene = names[1].strip()
            name = names[0].strip()
            if gene in already_seen_protein:
                continue
            already_seen_protein.add(gene)
        else:
            name = gene

        if name not in target_to_id:
            false_negative += 1
            continue

        antibiotic = gene.split('_')[1]
        functional_antibiotic = antibiotic_code[antibiotic]
        results = target_to_id[name]
        results.sort(key=lambda l: l[1], reverse=True)

        # Reset per gene: with an empty hit list these names were previously
        # unbound (NameError) or still held the previous gene's values.
        identified = False
        hit_id = classes = drugs = None
        for result in results:
            hit_id = result[0]
            # remove formatting used by hmm
            if 's' in hit_id:
                hit_id = hit_id.replace('ARO', 'ARO:')
                hit_id = hit_id.split('s')[0]
            if ';' in hit_id:
                # resfams can have a list of ids associated with a gene
                classes = [
                    terms[p]['name'] for i in hit_id.split(';')
                    for p in ontology_common.get_class(i, terms)
                ]
                drugs = set()
                for i in hit_id.split(';'):
                    drugs |= ontology_common.get_resistance(
                        ontology_common.get_lineage(i, terms), terms)
            else:
                classes = [
                    terms[p]['name']
                    for p in ontology_common.get_class(hit_id, terms)
                ]
                drugs = ontology_common.get_resistance(
                    ontology_common.get_lineage(hit_id, terms), terms)

            identified = any(
                d == fd and d not in generic_terms
                for drug in drugs
                for d in ontology_common.get_lineage(drug, terms)
                for fd in ontology_common.get_lineage(
                    functional_antibiotic[1], terms))
            if identified:
                true_positive += 1
                break

        if not identified:
            # Values shown are those of the last hit examined (None when the
            # target had no hits at all).
            print(gene, functional_antibiotic, hit_id, classes, drugs)
            false_positive += 1

    print('False negative: %d; False Positive:%d; True Positive:%d' %
          (false_negative, false_positive, true_positive))
def main():
    """Score scan hits for reads against the expected grouping and plot them.

    Command line::

        art_test.py grouping.csv scanresults.scan art_out.maf scores.csv

    For every name listed under a group in ``grouping.csv`` the hits returned
    by ``search`` are inspected.  The best hit is a true positive when its ID
    equals the group ID, or when both agree on the part before the first
    ``s``; otherwise it is a false positive and recorded as a mismatch.  Names
    without any hit are reported and counted as not found.

    Outputs:
      * diagnostics and per-threshold counters on stdout,
      * ``scores.csv`` (``sys.argv[4]``) with ``<group id>,<score - 1>`` rows,
      * ``mismatch.csv`` in the working directory,
      * a graph via ``test.graph``.

    Prints the usage line and exits with status 0 when the argument count is
    not exactly four.
    """
    if len(sys.argv) != 5:
        print(
            "Usage: art_test.py grouping.csv scanresults.scan art_out.maf scores.csv"
        )
        sys.exit(0)

    id_to_name = readers.read_grouping(sys.argv[1])
    thresh = []
    fp = []
    tp = []
    nf = []
    contig_to_read = readers.read_maf(sys.argv[3])
    # NOTE: range(0, 81, 100) yields only 0, so a single threshold is
    # evaluated; found_score and mismatch written below belong to the last
    # (here: only) threshold.
    for threshold in range(0, 81, 100):
        mismatch = []
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(
            threshold, sys.argv[2])
        scan_name_to_id = substitute_read_name(scan_name_to_id, contig_to_read)
        found_score = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        total = 0
        for group_id in sorted(id_to_name.keys()):
            for name in id_to_name[group_id]:
                total += 1
                ids = search(name, group_id, scan_name_to_id)
                if not ids:
                    for k, v in contig_to_read.items():
                        if name in v:
                            print("Found read %s in %s" % (name, k))
                    print("Not Found: %s,%s" % (name, group_id))
                    not_found += 1
                    continue

                best_id = ids[0][0]
                if (best_id == group_id
                        or best_id.split('s')[0] == group_id.split('s')[0]):
                    true_positive += 1
                    found_score.append((ids[0][1], group_id, name))
                    continue

                # Turn 'ARO<n>s...' style IDs into 'ARO:<n>' for the report.
                canonical_id = 'ARO:' + group_id.split('s')[0].split('O')[1]
                canonical_id2 = 'ARO:' + best_id.split('s')[0].split('O')[1]
                mismatch.append((name, canonical_id, canonical_id2, ids[0][2]))
                # Still record the score of the first hit that carries the
                # expected ID, if any.
                for candidate in ids:
                    if candidate[0] == group_id:
                        found_score.append((candidate[1], group_id, name))
                        break
                false_positive += 1

        thresh.append(threshold)
        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)
        print("misclassified %d notfound %d true positive %d of total %d" %
              (false_positive, not_found, true_positive, total))

        print(threshold, true_positive, false_positive, not_found, total)
    x = np.array(fp)  # false positive counts per threshold
    y = np.array(tp)  # true positive counts per threshold
    t = np.array(thresh)
    n = np.array(nf)

    found_score.sort()
    with open(sys.argv[4], 'w') as f:
        # NOTE(review): the score is written minus one; the reason is not
        # visible in this function.
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        f.write('Gene,Expected ID,Found ID\n')
        # Rows follow the header order; they used to be written as
        # expected, found, gene, which put every value under the wrong column.
        for gene, expected_id, found_id, _ in mismatch:
            f.write('%s,%s,%s\n' % (gene, expected_id, found_id))

    test.graph(x, y, t, n)
def main():
    """Check scan predictions against the antibiotic named in each gene.

    Usage: ``functional_compare.py [-fsl] [-protein] results.scan seq.fasta``
    (the flag is read from ``sys.argv[1]`` only, so the two flags are
    mutually exclusive).

    Counting rules, per FASTA entry:
      * no target with that name in the results -> false negative;
      * some hit whose resistance-drug lineage shares a term with the lineage
        of the expected antibiotic (ignoring ``ARO:1000001``,
        ``ARO:1000003`` and ``Unknown``) -> true positive;
      * otherwise -> false positive, and the entry is printed.

    Hits are tried in descending order of their second field.  In protein
    mode the FASTA name is split on ``>`` and every gene (second part) is
    evaluated once.

    Prints the usage line and exits with status -1 on a bad argument count.
    """
    usage = "Usage: functional_compare.py [-fsl] [-protein] results.scan seq.fasta"
    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        print(usage)
        sys.exit(-1)
    fsl = sys.argv[1] == '-fsl'
    protein = sys.argv[1] == '-protein'
    if (fsl or protein) and argc != 4:
        # With a flag present both files are required; previously this case
        # fell through and failed with IndexError on sys.argv[3].
        print(usage)
        sys.exit(-1)

    if fsl:
        fsl_file = sys.argv[2]
        fasta_file = sys.argv[3]
    elif protein:
        scan_file = sys.argv[2]
        fasta_file = sys.argv[3]
    else:
        scan_file = sys.argv[1]
        fasta_file = sys.argv[2]
    genes = readers.read_fasta(fasta_file)
    if fsl:
        target_to_id = read_fsl(fsl_file)
    else:
        id_to_target, target_to_id = readers.read_scan_results(0, scan_file, protein=protein)
    already_seen_protein = set()  # We don't want to double count if we have seen the same gene
    if protein:
        readers.change_RF_to_ARO(target_to_id)

    terms = ontology_common.parse_obo('new_combined.obo')
    ignored_terms = ['ARO:1000001', 'ARO:1000003', 'Unknown']
    false_positive = 0
    true_positive = 0
    false_negative = 0
    for gene in genes.keys():
        if protein:
            names = gene.split('>')
            gene = names[1].strip()
            name = names[0].strip()
            if gene in already_seen_protein:
                continue
            already_seen_protein.add(gene)
        else:
            name = gene

        if name not in target_to_id:
            false_negative += 1
            continue

        antibiotic = gene.split('_')[1]
        functional_antibiotic = antibiotic_code[antibiotic]
        results = target_to_id[name]
        results.sort(key=lambda l: l[1], reverse=True)

        # Start every gene from a clean slate.  Before, an empty hit list left
        # these names unbound or carrying the previous gene's outcome.
        identified = False
        hit_id = None
        classes = None
        drugs = None
        for result in results:
            hit_id = result[0]
            # remove formatting used by hmm
            if 's' in hit_id:
                hit_id = hit_id.replace('ARO', 'ARO:')
                hit_id = hit_id.split('s')[0]
            if ';' in hit_id:
                # resfams can have a list of ids associated with a gene
                classes = [terms[p]['name'] for i in hit_id.split(';') for p in ontology_common.get_class(i, terms)]
                drugs = set()
                for i in hit_id.split(';'):
                    drugs |= ontology_common.get_resistance(ontology_common.get_lineage(i, terms), terms)
            else:
                classes = [terms[p]['name'] for p in ontology_common.get_class(hit_id, terms)]
                drugs = ontology_common.get_resistance(ontology_common.get_lineage(hit_id, terms), terms)

            identified = any(
                d == fd and d not in ignored_terms
                for drug in drugs
                for d in ontology_common.get_lineage(drug, terms)
                for fd in ontology_common.get_lineage(functional_antibiotic[1], terms))
            if identified:
                true_positive += 1
                break

        if not identified:
            # Shows the last hit examined; all three are None when the target
            # had no hits.
            print(gene, functional_antibiotic, hit_id, classes, drugs)
            false_positive += 1

    print('False negative: %d; False Positive:%d; True Positive:%d' % (false_negative, false_positive, true_positive))
Beispiel #8
0
def main():
    """Score scan results against the expected grouping and write reports.

    Command line: ``test.py grouping.csv scanresults.scan threshold.csv``.

    For every name listed under a group in ``grouping.csv`` the hits stored
    for that name are sorted by descending second field.  The best hit is a
    true positive when its ID equals the group ID, or when both agree on the
    part before the first ``s``; otherwise it is a false positive.  Names
    missing from the scan results are counted as not found.

    Outputs:
      * diagnostics on stdout, followed by one summary line per threshold,
      * ``threshold.csv`` (``sys.argv[3]``) with ``<group id>,<score - 1>``,
      * ``mismatch.csv`` in the working directory.

    Prints the usage line and exits with status 0 when the argument count is
    not exactly three.
    """
    if len(sys.argv) != 4:
        print("Usage: test.py grouping.csv scanresults.scan threshold.csv")
        sys.exit(0)

    id_to_name = readers.read_grouping(sys.argv[1], short=True)
    fp = []
    tp = []
    nf = []
    # NOTE: range(0, 10, 100) yields only 0, so a single threshold is
    # evaluated; found_score and mismatch written below belong to the last
    # (here: only) threshold.
    for threshold in range(0, 10, 100):
        scan_id_to_name, scan_name_to_id = readers.read_scan_results(threshold, sys.argv[2])
        found_score = []
        mismatch = []
        true_positive = 0
        false_positive = 0
        not_found = 0
        for group_id in id_to_name.keys():
            for name in id_to_name[group_id]:
                if name not in scan_name_to_id:
                    print("Not found %s,%s" % (name, group_id))
                    not_found += 1
                    continue
                ids = scan_name_to_id[name]
                ids.sort(key=lambda l: l[1], reverse=True)
                best_id = ids[0][0]
                if best_id == group_id or best_id.split('s')[0] == group_id.split('s')[0]:
                    true_positive += 1
                    found_score.append((ids[0][1], group_id, name))
                    continue

                print("False Positive: %s, %s" % (name, group_id))
                if len(ids) > 1 and ids[1][0] == group_id:
                    mismatch.append((name, group_id, best_id))
                else:
                    # Every hit ranked above the expected ID is a mismatch;
                    # stop at the first hit that carries the expected ID.
                    for rank, candidate in enumerate(ids):
                        print("Attempt %d: %s %f" % (rank, candidate[0], candidate[1]))
                        if candidate[0] == group_id:
                            found_score.append((candidate[1], group_id, name))
                            break
                        mismatch.append((name, group_id, candidate[0]))
                false_positive += 1

        tp.append(true_positive)
        fp.append(false_positive)
        nf.append(not_found)

    found_score.sort()
    with open(sys.argv[3], 'w') as f:
        # NOTE(review): the score is written minus one; the reason is not
        # visible in this function.
        for score, hmm, name in found_score:
            f.write('%s,%d\n' % (hmm, score - 1))
    with open('mismatch.csv', 'w') as f:
        f.write('Gene,Expected ID,Found ID\n')
        for a, b, c in mismatch:
            f.write('%s,%s,%s\n' % (a, b, c))
    # One summary line per threshold, built from plain ints.  Formatting the
    # per-threshold numpy arrays with %d only worked for a single threshold
    # and raised TypeError as soon as the arrays held more than one value.
    for misclassified, missing, correct in zip(fp, nf, tp):
        print("misclassified %d notfound %d true positive %d of total %d" %
              (misclassified, missing, correct, misclassified + correct + missing))