Esempio n. 1
0
def tst(noise, target, source):
    print '##', noise
    sep = Taxonomy.getRawTaxonomy('tax/skel/', 'ott')
    t = Taxonomy.getRawTaxonomy(target, 'target')
    s = Taxonomy.getRawTaxonomy(source, 'source')
    u = combine(sep, t, s, blustery)
    u.dumpChoices('/tmp/align_tests_choices.tsv')
    subprocess.call(['cat', '/tmp/align_tests_choices.tsv'])
    if False:
        u.dumpLog('/tmp/align_tests_log.tsv')
        subprocess.call(['cat', '/tmp/align_tests_log.tsv'])
    print
def doit(tax_path, ids_path):

    ott = Taxonomy.getRawTaxonomy(tax_path, 'ott')

    all_nodes = {}

    with open(ids_path, 'r') as infile:
        reader = csv.reader(infile, delimiter='\t')
        otu_count = 0
        for row in reader:
            id = row[0]
            if otu_count % 50000 == 0: print otu_count, id
            otu_count += 1
            node = ott.lookupId(id)
            if node != None:
                all_nodes[node.id] = node

    print 'OTT taxa assigned to OTUs:', len(all_nodes)

    prefix_to_count = {}
    ott_count = 0

    for id in all_nodes:
        node = all_nodes[id]
        ott_count += 1
        for qid in node.sourceIds:
            prefix = qid.prefix
            count = prefix_to_count.get(prefix, 0)
            prefix_to_count[prefix] = count + 1

    print 'OTT ids assigned to OTUs:', otu_count
    for prefix in prefix_to_count:
        print prefix, prefix_to_count[prefix]
def report(dir, idspace):
    """Write the homonym report for the taxonomy stored under tax/<dir>/.

    dir     -- subdirectory of 'tax' holding the taxonomy; also used as the
               prefix of the report file name
    idspace -- idspace passed to Taxonomy.getRawTaxonomy

    The report goes to <report_dir>/<dir>-homonym-report.csv; report_dir is
    created first if it does not exist.
    """
    # The empty last component makes the joined path end in a separator.
    taxonomy_path = os.path.join('tax', dir, '')
    tax = Taxonomy.getRawTaxonomy(taxonomy_path, idspace)

    # Disabled steps, kept for reference:
    # tax.smush()
    # HomonymReport.homonymDensityReport(tax, dir + '-density-report.csv')
    # HomonymReport.homonymUncertaintyReport(tax, 'reports/' + dir + '-uncertainty-report.csv')

    if not os.path.isdir(report_dir):
        os.makedirs(report_dir)

    report_path = os.path.join(report_dir, dir + '-homonym-report.csv')
    HomonymReport.homonymReport(tax, report_path)
Esempio n. 4
0
def report(dir, idspace):
    """Write the homonym report for the taxonomy stored under tax/<dir>/.

    dir     -- subdirectory of 'tax' holding the taxonomy; also used as the
               prefix of the report file name
    idspace -- idspace passed to Taxonomy.getRawTaxonomy

    The report goes to <report_dir>/<dir>-homonym-report.csv; report_dir is
    created first if it does not exist.
    """
    # The empty last component makes the joined path end in a separator.
    taxonomy_path = os.path.join('tax', dir, '')
    tax = Taxonomy.getRawTaxonomy(taxonomy_path, idspace)

    # Disabled steps, kept for reference:
    # tax.smush()
    # HomonymReport.homonymDensityReport(tax, dir + '-density-report.csv')
    # HomonymReport.homonymUncertaintyReport(tax, 'reports/' + dir + '-uncertainty-report.csv')

    if not os.path.isdir(report_dir):
        os.makedirs(report_dir)

    report_path = os.path.join(report_dir, dir + '-homonym-report.csv')
    HomonymReport.homonymReport(tax, report_path)
def doit(ott, sep, outpath, conpath):
    """Produce the overall table and the per-source breakdown table.

    ott     -- taxonomy the tables are computed from
    sep     -- accepted but not used in this function
    outpath -- destination handed to overall_table
    conpath -- destination handed to source_breakdown_table
    """
    # The Ruggiero comparison is switched off.  It used to be enabled by
    # os.path.isdir('out/ruggiero').
    ruggiero_enabled = False

    if ruggiero_enabled:
        ruggiero = Taxonomy.getRawTaxonomy('out/ruggiero/', 'rug')
        # Prepare for conflict analysis.
        # Oh no, we really need a separation taxonomy to do that.
        alignment = AlignmentByName(ruggiero, ott)
        alignment.align()
        # The analysis object is built but nothing reads it afterwards.
        ConflictAnalysis(ruggiero, ott, alignment, True)

    overall_table(ott, outpath)
    source_breakdown_table(ott, conpath)
Esempio n. 6
0
def doit(ott, sep, outpath, conpath):
    """Produce the overall table and the per-source breakdown table.

    ott     -- taxonomy the tables are computed from
    sep     -- accepted but not used in this function
    outpath -- destination handed to overall_table
    conpath -- destination handed to source_breakdown_table
    """
    # The Ruggiero comparison is switched off.  It used to be enabled by
    # os.path.isdir('out/ruggiero').
    ruggiero_enabled = False

    if ruggiero_enabled:
        ruggiero = Taxonomy.getRawTaxonomy('out/ruggiero/', 'rug')
        # Prepare for conflict analysis.
        # Oh no, we really need a separation taxonomy to do that.
        alignment = AlignmentByName(ruggiero, ott)
        alignment.align()
        # The analysis object is built but nothing reads it afterwards.
        ConflictAnalysis(ruggiero, ott, alignment, True)

    overall_table(ott, outpath)
    source_breakdown_table(ott, conpath)
Esempio n. 7
0
def retain_ids(ott, prev_path, by_qid):
    """Assign ids to the taxa of `ott`, re-using previously issued ids.

    ott       -- the taxonomy being built; modified in place
    prev_path -- location of the previous taxonomy version, loaded here with
                 idspace 'ott' and used as the source of old ids
    by_qid    -- passed through unchanged to retain_ids_from_list

    Steps, in order:
      1. apply the NCBI-keyed assignments listed in
         ncbi_ott_assignments.ncbi_assignments_list;
      2. force a hard-coded set of ids on taxa found with maybeTaxon /
         taxonThatContains;
      3. load the previous taxonomy, patch it, and carry its ids over with
         ott.carryOverIds;
      4. call retain_ids_from_list(ott, by_qid).
    """

    # ad hoc assignments specifically for NCBI taxa, based on NCBI id

    for (ncbi_id, ott_id, name) in ncbi_ott_assignments.ncbi_assignments_list:
        im = ott.lookupQid(QualifiedId('ncbi', ncbi_id))
        if im == None:
            print '* ncbi:%s not found in OTT - %s' % (ncbi_id, name)
        else:
            # A name mismatch is only reported; the id is added regardless.
            if im.name != name:
                print '* ncbi:%s name is %s, but expected %s' % (ncbi_id,
                                                                 im.name, name)
            im.addId(ott_id)

    # Force some id assignments... will try to automate this in the future.
    # Most of these come from looking at the deprecated.tsv file after a
    # series of smasher runs.

    # Each triple is (inf, sup, id): the two names are handed to
    # ott.maybeTaxon to pick the taxon, which then receives `id`.
    # NOTE(review): presumably `inf` is the taxon name and `sup` the name of
    # an ancestor used to disambiguate - confirm against maybeTaxon.
    for (inf, sup, id) in [
        ('Tipuloidea', 'Diptera', '722875'),
        ('Saccharomycetes', 'Saccharomycotina', '989999'),
        ('Phaeosphaeria', 'Ascomycota', '5486272'),
        ('Synedra acus', 'Eukaryota', '992764'),
        ('Hessea', 'Archaeplastida', '600099'),
        ('Morganella', 'Arthropoda', '6400'),
        ('Rhynchonelloidea', 'Rhynchonellidae', '5316010'),
        ('Morganella', 'Fungi', '973932'),
        ('Parmeliaceae', 'Lecanorales', '305904'),
        ('Cordana', 'Ascomycota', '946160'),
        ('Pseudofusarium', 'Ascomycota', '655794'),
        ('Marssonina', 'Dermateaceae', '372158'),  # ncbi:324777
        ('Marssonia', 'Lamiales', '5512668'),  # gbif:7268388
            # ('Gloeosporium', 'Pezizomycotina', '75019'),  # synonym for Marssonina
        ('Escherichia coli', 'Enterobacteriaceae', '474506'),  # ncbi:562
            # ('Dischloridium', 'Trichocomaceae', '895423'),
        ('Exaiptasia pallida', 'Cnidaria', '135923'),
        ('Choanoflagellida', 'Holozoa', '202765'),
        ('Billardiera', 'Lamiales', '798963'),
        ('Trachelomonas grandis', 'Bacteria', '58035'),  # study ot_91 Tr46259
        ('Hypomyzostoma', 'Myzostomida',
         '552744'),  # was incorrectly in Annelida
        ('Gyromitus', 'SAR', '696946'),
        ('Pseudogymnoascus destructans', 'Pezizomycotina', '428163'),
            # ('Amycolicicoccus subflavus', 'Mycobacteriaceae', '541768'),  # ncbi:639313
            # ('Pohlia', 'Foraminifera', '5325989')  - NO
        ('Pohlia', 'Amphibia', '5325989'),  # irmng:1311321
        ('Phyllanthus', 'Pentapetalae', '452944'),  # pg_25 @josephwb = 5509975
    ]:
        tax = ott.maybeTaxon(inf, sup)
        # Entries whose taxon cannot be found are silently skipped.
        if tax != None:
            tax.setId(id)

    # Second id for the taxon that was given id 452944 in the list above
    # (Phyllanthus).
    ott.taxon('452944').addId('5509975')

    # ott.taxon('474506') ...

    # Unlike the Trichosporon case below, this result is not checked
    # against None before use.
    ott.taxonThatContains('Rhynchonelloidea',
                          'Sphenarina').setId('795939')  # NCBI

    # Trichosporon is a mess, because it occurs 3 times in NCBI.
    trich = ott.taxonThatContains('Trichosporon', 'Trichosporon cutaneum')
    if trich != None:
        trich.setId('364222')

    #ott.image(fungi.taxon('11060')).setId('4107132') #Cryptococcus - a total mess

    # --------------------
    # Assign OTT ids to taxa that don't have them, re-using old ids when possible
    ids = Taxonomy.getRawTaxonomy(prev_path, 'ott')

    # Edit the id source taxonomy to optimize id coverage

    # Kludge to undo lossage in OTT 2.9: when a taxon's first two source ids
    # are ncbi followed by silva, drop the leading ncbi one.
    for taxon in ids.taxa():
        if (len(taxon.sourceIds) >= 2 and taxon.sourceIds[0].prefix == "ncbi"
                and taxon.sourceIds[1].prefix == "silva"):
            taxon.sourceIds.remove(taxon.sourceIds[0])

    # OTT 2.9 has both Glaucophyta and Glaucophyceae...
    # this creates an ambiguity when aligning.
    # Need to review this; maybe they *should* be separate taxa.
    g1 = ids.maybeTaxon('Glaucophyta')
    g2 = ids.maybeTaxon('Glaucophyceae')
    if g1 != None and g2 != None and g1 != g2:
        g1.absorb(g2)

    # Assign old ids to nodes in the new version
    ott.carryOverIds(ids)  # Align & copy ids

    print '-- Checking id list'
    retain_ids_from_list(ott, by_qid)
def tst(target, source, want):
    """Combine two taxonomies and queue the result for later checking.

    target -- taxonomy spec loaded with idspace 'target'
    source -- taxonomy spec loaded with idspace 'source'
    want   -- expected outcome, stored alongside the results as given

    Appends the tuple (target taxonomy, source taxonomy, combined result,
    want) to the module-level list `tests`.
    """
    global tests
    target_tax = Taxonomy.getRawTaxonomy(target, 'target')
    source_tax = Taxonomy.getRawTaxonomy(source, 'source')
    union = combine(target_tax, source_tax, blustery)
    record = (target_tax, source_tax, union, want)
    tests.append(record)
import sys, codecs
from org.opentreeoflife.taxa import Taxonomy, Newick

# Command-line arguments:
#   1. directory containing the original taxonomy (must end in /)
#   2. name of the taxon to extract
#   3. where to store the result: a directory ending in /, or a file
#      ending in .tre to get Newick output
source = sys.argv[1]
name = sys.argv[2]
dest = sys.argv[3]

# Reject any destination that is neither a directory spec nor a .tre file.
if not dest.endswith(('/', '.tre')):
    print >>sys.stderr, 'Invalid taxonomy destination (need / or .tre)', dest
    sys.exit(1)

full_taxonomy = Taxonomy.getRawTaxonomy(source, 'foo')
selection = full_taxonomy.select(name)

if not dest.endswith('.tre'):
    # Directory destination: dump in the taxonomy's own format.
    selection.dump(dest)
else:
    # File destination: one line of Newick, UTF-8 encoded.
    with codecs.open(dest, 'w', 'utf-8') as outfile:
        outfile.write(Newick.toNewick(selection, Newick.USE_NAMES_AND_IDS))
        outfile.write('\n')
Esempio n. 10
0
import sys
from org.opentreeoflife.taxa import Taxonomy, Newick

# First command-line argument: directory containing the taxonomy
# (must end in /).
source = sys.argv[1]

ott = Taxonomy.getRawTaxonomy(source, 'ott')

# Running totals, updated through `global` declarations in recur().
count = grafts = non_tip_grafts = 0

# Seen = seen idspaces among ancestors.
# Returns set of seen idspaces.

def recur(taxon, seen):
    global count, grafts, non_tip_grafts
    count += 1

    # idspace (source) of taxon
    space = taxon.sourceIds.get(0).prefix

    all = empty()
    seen_child = adjoin(space, seen)
    for child in taxon.getChildren():
        under = recur(child, seen_child)
        child_space = child.sourceIds.get(0).prefix
        if child_space != space:
            # A graft or resolution.
            if intersectp(under, seen):
                # A resolution.
                print 'resolve', child, taxon, child.rank
    * copied = total number of nodes originating from this source (copied)
    * aligned = number of source nodes aligned and copied
    * absorbed = number of source nodes absorbed (not copied)
    * conflict = number of inconsistent source nodes (not copied)
    """

def dump_table_as_csv(table, outfile):
    """Write every row of `table` to `outfile` in CSV form (for Pensoft).

    table   -- iterable of rows, each row an iterable of cell values
    outfile -- open, writable file-like object
    """
    csv.writer(outfile).writerows(table)

def max_depth(node):
    """Return the number of edges on the longest path from `node` down to a
    descendant that has no children (0 when `node` itself has none).

    `node` must provide getChildren(), returning an iterable of child nodes.
    """
    deepest = 0
    for kid in node.getChildren():
        deepest = max(deepest, 1 + max_depth(kid))
    return deepest

# Script entry point.  Expects four command-line arguments, read below in
# order, and then produces the reports via doit().
if __name__ == '__main__':

    taxpath = sys.argv[1]  # taxonomy to report on
    seppath = sys.argv[2]  # second taxonomy (presumably the separation taxonomy - confirm)
    outpath = sys.argv[3]  # general report, JSON
    conpath = sys.argv[4]  # contributions, CSV
    # Both taxonomies are loaded with the same idspace name, 'ott'.
    sep = Taxonomy.getRawTaxonomy(seppath, 'ott')
    ott = Taxonomy.getRawTaxonomy(taxpath, 'ott')
    # inferFlags() is called on the main taxonomy only, before reporting.
    ott.inferFlags()

    doit(ott, sep, outpath, conpath)
Esempio n. 12
0
import sys

from org.opentreeoflife.taxa import Taxonomy, Rank

# Load the taxonomy named by the first command-line argument.
ott = Taxonomy.getRawTaxonomy(sys.argv[1], 'ott')

# Look for splitting:
#   Suppose X and Y are distinct in GBIF, but both align to X in NCBI,
#   because NCBI says Y is a synonym of X.
#   Then we have X in NCBI with GBIF X and Y aligning to it, and
#   Y is a synonym via NCBI but not via GBIF.
#   So GBIF X is a source for X, and GBIF Y is in the sources for
#   Y-synonym of X.

for X in ott.taxa():
    # Species only
    if X.rank != Rank.SPECIES_RANK: continue

    xid = X.sourceIds[0].id

    # Look for Y, a synonym of X...
    for Y in X.getSynonyms():

        yids = [qid.id for qid in Y.sourceIds]

        # that has same source as X...
        if not xid in yids:
            continue

        # but, an alignment from Y
        for yid in yids:
Esempio n. 13
0
def tst(target, source, want):
    """Combine two taxonomies and queue the result for later checking.

    target -- taxonomy spec loaded with idspace 'target'
    source -- taxonomy spec loaded with idspace 'source'
    want   -- expected outcome, stored alongside the results as given

    Appends the tuple (target taxonomy, source taxonomy, combined result,
    want) to the module-level list `tests`.
    """
    global tests
    target_tax = Taxonomy.getRawTaxonomy(target, 'target')
    source_tax = Taxonomy.getRawTaxonomy(source, 'source')
    union = combine(target_tax, source_tax, blustery)
    record = (target_tax, source_tax, union, want)
    tests.append(record)
Esempio n. 14
0
    * conflict = number of inconsistent source nodes (not copied)
    """


def dump_table_as_csv(table, outfile):
    """Write every row of `table` to `outfile` in CSV form (for Pensoft).

    table   -- iterable of rows, each row an iterable of cell values
    outfile -- open, writable file-like object
    """
    csv.writer(outfile).writerows(table)


def max_depth(node):
    """Return the number of edges on the longest path from `node` down to a
    descendant that has no children (0 when `node` itself has none).

    `node` must provide getChildren(), returning an iterable of child nodes.
    """
    child_depths = [1 + max_depth(kid) for kid in node.getChildren()]
    return max(child_depths) if child_depths else 0


# Script entry point.  Expects four command-line arguments, read below in
# order, and then produces the reports via doit().
if __name__ == '__main__':

    taxpath = sys.argv[1]  # taxonomy to report on
    seppath = sys.argv[2]  # second taxonomy (presumably the separation taxonomy - confirm)
    outpath = sys.argv[3]  # general report, JSON
    conpath = sys.argv[4]  # contributions, CSV
    # Both taxonomies are loaded with the same idspace name, 'ott'.
    sep = Taxonomy.getRawTaxonomy(seppath, 'ott')
    ott = Taxonomy.getRawTaxonomy(taxpath, 'ott')
    # inferFlags() is called on the main taxonomy only, before reporting.
    ott.inferFlags()

    doit(ott, sep, outpath, conpath)
            return qid
    return None

def same_epithet(name1, name2):
    """Return True when the two names have equal epithet stems.

    Stems come from epithet_stem(); if either stem is None the answer is
    False.  name2 is only examined when name1 yielded a stem.
    """
    first = epithet_stem(name1)
    if first is None:
        return False
    second = epithet_stem(name2)
    return second is not None and first == second

def epithet_stem(name):
    """Return the epithet part of `name` with a trailing 'us', 'um' or 'a'
    removed.

    The epithet is everything after the first space; a name without a
    space is used whole.  Only one suffix is stripped, and a string with
    none of the suffixes is returned unchanged.
    """
    epithet = name.split(' ', 1)[-1]
    for suffix in ('us', 'um', 'a'):
        if epithet.endswith(suffix):
            return epithet[:-len(suffix)]
    return epithet

# Script entry point.  Expects four command-line arguments, read below in
# order.
if __name__ == '__main__':
    taxpath = sys.argv[1]     # taxonomy to load
    inpath = sys.argv[2]      # input handed to load_eol_page_ids
    outpath = sys.argv[3]     # destination for the node-to-pages mapping
    reportpath = sys.argv[4]  # destination for the report

    tax = Taxonomy.getRawTaxonomy(taxpath, 'ott')
    # Must come before the lookupQid call below (presumably it builds the
    # index that lookupQid consults - confirm).
    tax.startQidIndex()
    # Prints the lookup result for one fixed id; looks like a sanity check.
    print 'ncbi:98683 =', tax.lookupQid(QualifiedId('ncbi', '98683'))
    (page_to_nodes, node_to_pages) = load_eol_page_ids(inpath, tax)
    dump_mapping(node_to_pages, outpath)
    report(page_to_nodes, reportpath)