def test_referenceset(): refstatsfile = get_testfile('cm-500pgun-ref-stats') refphylfile = get_testfile('cm-ref-phyl') r = Reference(['Acidobacterium capsulatum ATCC 51196', 'Acidobacterium capsulatum', 'Acidobacterium', 'Acidobacteriaceae', 'Acidobacteriales', 'Acidobacteriia', 'Acidobacteria', 'Fibrobacteres/Acidobacteria group', 'Bacteria', 'cellular organisms']) assert(r.phyl["life"] == "cellular organisms") refs = ReferenceSet(refphylfile, refstatsfile) # There are 59 references, all indexable by id, fasta name or strain name assert(len(refs) == 59) assert(len(refs.refs) == 177) # Every reference's full phylogeny is in the phyl dictionary attribute assert(refs.get("Acidobacterium capsulatum ATCC 51196").phyl["life"] == "cellular organisms") # The LCA of Shewanella baltica strains should be on the species level sh1 = refs.get("Shewanella_baltica_OS185") sh2 = refs.get("Shewanella_baltica_OS223,") assert(sh1.get_lca(sh1) == ('strain', 'Shewanella baltica OS185')) assert(sh1.get_lca(sh2) == ('species', 'Shewanella baltica')) # LCA of an Archaea and a Bacteria should be life ar = refs.get("Archaeoglobus_fulgidus_DSM_4304") th = refs.get("gi|222528057|ref|NC_012034.1|") assert(ar.get_lca(th) == ('life', 'cellular organisms')) # Highest LCA of Bacteria and Archaea is Life assert(ar.get_highest_lca([th, sh1, sh2]) == ('life', 'cellular organisms')) # Highest LCA of given bacteria is domain or superkingdom in NCBI naming assert(th.get_highest_lca([sh1, sh2]) == ('superkingdom', 'Bacteria')) assert(sh1.get_highest_lca([sh1]) == ('strain', 'Shewanella baltica OS185'))
def test_read_contig_mappings(): bamasm = get_testfile('cm-500pgun-asm-b2mv31-bam') contigfa = get_testfile('cm-500pgun-asm-b2mv31-fa') bamref = get_testfile('cm-500pgun-ref-bam') refstatsfile = get_testfile('cm-500pgun-ref-stats') refphylfile = get_testfile('cm-ref-phyl') refs = ReferenceSet(refphylfile, refstatsfile) reads, contigs = get_read_contig_mappings(bamref, bamasm, refs, contigfa)
def test_assemblyvalidation(): bamasm = get_testfile('cm-500pgun-asm-b2mv31-bam') contigfa = get_testfile('cm-500pgun-asm-b2mv31-fa') bamref = get_testfile('cm-500pgun-ref-bam') refstatsfile = get_testfile('cm-500pgun-ref-stats') refphylfile = get_testfile('cm-ref-phyl') nucmercoords = get_testfile('cm-500pgun-val-nucmer') val = AssemblyValidation(bamref, bamasm, refphylfile, refstatsfile, contigfa, nucmercoords) assert(len(val.contigs) == int(get_shell_output("grep -c '^>' " + contigfa)[0])) make_dir(get_outdir() + "masm") val.write_contig_purity(get_outdir() + "masm" + "/contig-purity.tsv") val.write_general_stats(get_outdir() + "masm" + "/asm-stats.tsv") val.write_genome_contig_cov(get_outdir() + "masm" + "/genome-contig-coverage.tsv")