def tearDown(self):
    """Delete every taxon held in ``self.apes``.

    The supertaxon links are cleared and saved first; deleting while they
    are still set would raise a CascadedDeleteError.
    """
    # First pass: detach each taxon from its supertaxon and persist that.
    for ape in self.apes:
        ape.supertaxon = None
        ape.save()

    # Second pass: with no supertaxon links left, delete the taxa.
    for ape in self.apes:
        Taxon.delete(ape)
def taxonomy_tree():
    """Render ``taxon_tree.html`` with the top entry taxa as superkingdom nodes."""
    return render_template(
        'taxon_tree.html',
        node_level="superkingdom",
        node_values=Taxon.top_entry_taxa(),
    )
def add_taxon(tsn):
    """Build a ``Taxon`` for the ITIS serial number *tsn* and return it.

    Four ITIS service methods are queried, each exactly once:

    * ``getScientificNameFromTSN`` -- the last non-empty of ``unitName1`` to
      ``unitName4`` becomes ``taxon.name`` (``None`` if all are empty).
    * ``getTaxonomicRankNameFromTSN`` -- mapped through ``Taxon.ITIS_RANKS``
      into ``taxon.rank``.
    * ``getParentTSNFromTSN`` -- if a ``Taxon`` with the parent TSN exists,
      it becomes ``taxon.supertaxon``.
    * ``getCommonNamesFromTSN`` -- the English common names, joined with
      ``", "``, become ``taxon.common_names``.

    The taxon is not saved by this function.
    """
    # Example request:
    # http://www.itis.gov/ITISWebService/services/ITISService/getScientificNameFromTSN?tsn=531894

    def fetch(method):
        """Call the ITIS service *method* for this TSN."""
        return _get_itis(method, {'tsn': tsn})

    def select(document, path):
        """Evaluate *path* on *document* using the ITIS namespaces."""
        return document.xpath(path, namespaces=itis_namespaces)

    taxon = Taxon(tsn=tsn)

    # Scientific name: keep the last unit name that is not empty.
    name_doc = fetch('getScientificNameFromTSN')
    unit_names = [
        select(name_doc, "//ns:return/ax:unitName%d" % position)[0].text
        for position in range(1, 5)
    ]
    filled_names = [unit for unit in unit_names if unit]
    taxon.name = filled_names[-1] if filled_names else None

    # Rank.
    rank_doc = fetch('getTaxonomicRankNameFromTSN')
    rank_label = select(rank_doc, "//ns:return/ax:rankName")[0].text
    taxon.rank = Taxon.ITIS_RANKS[rank_label]

    # Parent: only linked when the parent is already stored.
    parent_doc = fetch('getParentTSNFromTSN')
    parent_tsn = int(select(parent_doc, "//ns:return/ax:parentTsn")[0].text)
    known_parents = Taxon.objects.filter(tsn=parent_tsn)
    if known_parents.exists():
        taxon.supertaxon = known_parents[0]

    # Common names: skip nil entries, empty entries and non-English names.
    names_doc = fetch('getCommonNamesFromTSN')
    english_names = []
    for element in select(names_doc, "//ns:return/ax:commonNames"):
        entry = objectify.fromstring(etree.tostring(element))
        nil_key = '{%s}nil' % itis_namespaces['xsi']
        is_nil = nil_key in entry.attrib and entry.attrib[nil_key] == 'true'
        if is_nil or not len(entry) or entry.language != "English":
            continue
        english_names.append(unicode(entry.commonName))
    taxon.common_names = ", ".join(english_names)

    return taxon
def taxon_table():
    """Render ``taxon_table.html`` with the RPKM table for one taxonomy level.

    Query-string parameters (all optional):

    * ``taxon_level`` -- level to tabulate; replaced by ``'superkingdom'``
      when it is not in ``Taxon.level_order``.
    * ``parent_level`` / ``parent_values[]`` -- restrict the table to the
      given values at ``parent_level``; the values are dropped when
      ``parent_level`` is not in ``Taxon.level_order``.
    * ``row_limit`` -- ``'20'``, ``'50'``, ``'100'`` or ``'all'``; anything
      else is replaced by ``20``.

    Only the samples of the sample set named ``'lmo'`` are included.
    """
    taxonomy_levels = Taxon.level_order
    args = request.args

    taxon_level = args.get('taxon_level', 'superkingdom')
    if taxon_level not in taxonomy_levels:
        taxon_level = 'superkingdom'

    parent_values = args.getlist('parent_values[]', None)
    parent_level = args.get('parent_level', None)
    if parent_level not in taxonomy_levels:
        parent_values = None

    row_limit = args.get('row_limit', 20)
    if row_limit not in ['20', '50', '100', 'all']:
        row_limit = 20

    # Translate to model language: None means "no limit".  Accepted
    # query-string values arrive as text ('50') while the fallback is the
    # integer 20, so convert here and always hand the model an int or None.
    # ``row_limit`` itself is passed to the template unchanged.
    limit = None if row_limit == 'all' else int(row_limit)

    ### Manual limit to only lmo ###
    sample_set = SampleSet.query.filter(SampleSet.name == 'lmo')[0]
    sample_scilifelab_codes = [s.scilifelab_code for s in sample_set.samples]

    samples, table, complete_val_to_val = Taxon.rpkm_table(
        level=taxon_level,
        top_level_complete_values=parent_values,
        top_level=parent_level,
        samples=sample_scilifelab_codes,
        limit=limit)

    # One list of values per taxon, ordered like ``samples``.
    sorted_table = OrderedDict(
        (complete_taxon, [sample_d[sample] for sample in samples])
        for complete_taxon, sample_d in table.items()
    )

    return render_template(
        'taxon_table.html',
        table=table,
        samples=samples,
        sorted_table=sorted_table,
        sample_scilifelab_codes=sample_scilifelab_codes,
        complete_val_to_val=complete_val_to_val,
        taxonomy_levels=taxonomy_levels,
        current_level=taxon_level,
        row_limit=row_limit,
    )
def itis_search(request):
    """Search ITIS and return the matches as an HTML fragment.

    The request's GET parameters are passed to the ITIS
    ``searchForAnyMatch`` service.  For each match, the ``Taxon`` with the
    same TSN is used when one exists; otherwise an unsaved ``Taxon`` is
    filled in from the match (name, ITIS rank, mapped rank, English common
    names) and dropped when its mapped rank is greater than 2.

    Returns an ``HttpResponse`` containing the rendered
    ``taxons/itis_search_results.html`` template.
    """
    xml_doc = _get_itis('searchForAnyMatch', request.GET)
    match_elements = xml_doc.xpath("//ns:return/ax:anyMatchList",
                                   namespaces=itis_namespaces)

    results = []
    for element in match_elements:
        # TODO better way to transmute an etree element into a objectified
        # element
        match = objectify.fromstring(etree.tostring(element))
        if not len(match):
            # no matches were returned
            break

        try:
            taxon = Taxon.objects.get(tsn=match.tsn)
        except Taxon.DoesNotExist:
            # Only "no such taxon" selects the fallback; any other failure
            # now propagates instead of being hidden by a bare ``except:``.
            # NOTE(review): confirm the extent of this fallback block
            # against the intended indentation.
            taxon = Taxon()
            taxon.tsn = match.tsn
            taxon.name = match.sciName
            taxon.itis_rank = itis_get_rank(match.tsn)
            taxon.rank = Taxon.ITIS_RANKS[taxon.itis_rank]
            # skip taxa above Order
            if taxon.rank > 2:
                continue
            # Keep only non-empty entries whose language is English.
            taxon.common_names = ', '.join(
                unicode(entry.commonName)
                for entry in match.commonNameList.commonNames
                if len(entry) and entry.language == 'English'
            )
        results.append(taxon)

    # render an HTML fragment that can be inserted into the taxon_import page
    return HttpResponse(
        render_to_string('taxons/itis_search_results.html', {
            'results': results,
            'MEDIA_URL': settings.MEDIA_URL,
        }),
        mimetype="text/plain",
    )
def taxon_tree_nodes(parent_level, parent_value):
    """Render ``taxon_tree_nodes.html`` for the nodes below one tree node.

    ``Taxon.tree_nodes`` supplies the child level and the child values for
    the given parent level and value.
    """
    level, values = Taxon.tree_nodes(parent_level, parent_value)
    context = {'node_level': level, 'node_values': values}
    return render_template('taxon_tree_nodes.html', **context)
def detect_super(name, rank):
    '''Return the base_name of the taxon with valid name name and rank rank.

    Returns None when no such taxon exists.
    '''
    try:
        match = Taxon.get(Taxon.valid_name == name, Taxon.rank == rank)
        found = match.base_name
    except Taxon.DoesNotExist:
        found = None
    return found
def read_file(filename):
    '''Import taxa and names from the CSV file filename.

    Cell A1 of the first row may name an existing taxon; if it is not
    empty, that taxon becomes the parent of the taxa in the file.  Every
    further row is parsed with parse_row and turned into Taxon and/or Name
    records.

    A row that raises is reported with traceback.print_exc() and skipped;
    processing continues with the next row.

    Returns True if no row raised, False otherwise.
    '''
    with codecs.open(filename, mode='r') as csv_file:
        reader = csv.reader(csv_file)
        # next() works on Python 2.6+ and Python 3; reader.next() exists on
        # Python 2 only.
        first_line = next(reader)

        # maintain stack of taxa that are parents of the current taxon
        stack = []
        # name of parent of root taxon should be in cell A1
        root_name = first_line[0]
        if root_name != '':
            root_parent = Taxon.filter(Taxon.valid_name == root_name)[0]
            stack.append(root_parent)

        # current valid taxon (for synonyms)
        current_valid = None
        # whether current taxon should be marked as root of a page
        is_page_root = True
        error_occurred = False
        for row in reader:
            try:
                # ignore blank rows
                if row[14] == '' and row[8] == '':
                    continue
                data = parse_row(row)
                # deal with "nHT", which is a pain. Some of these may need to be manually
                # readded to the DB
                if data['kind'] == 'nHT':
                    if data['valid_name'] in ignored_nHT:
                        continue
                    else:
                        raise Exception("Unrecognized nHT: " + str(data))
                # nsgen is i.s., just ignore
                if data['kind'] == 'nsgen':
                    continue

                if data['status'] == STATUS_VALID:
                    # get stuff off the stack
                    rank = data['rank']
                    if rank == ROOT:
                        current_valid = Taxon.create(
                            valid_name=data['valid_name'],
                            age=data['age'],
                            rank=data['rank'],
                            is_page_root=True
                        )
                    else:
                        # TODO: make this somehow unranked-clade-aware
                        # Pop until the top of the stack outranks this taxon;
                        # that entry becomes the parent.
                        while rank >= stack[-1].rank:
                            stack.pop()
                        # create new Taxon
                        current_valid = Taxon.create(
                            valid_name=data['valid_name'],
                            age=data['age'],
                            rank=data['rank'],
                            parent=stack[-1],
                            is_page_root=is_page_root
                        )
                    # Only the first valid taxon of the file is a page root.
                    if is_page_root:
                        is_page_root = False
                    stack.append(current_valid)

                # create new Name
                data['taxon'] = current_valid
                if data['status'] == STATUS_DUBIOUS:
                    # current system is inadequate for properly marking si species
                    # assume there's only genera and species
                    if ' ' in data['valid_name']:
                        data['group'] = GROUP_SPECIES
                    else:
                        data['group'] = GROUP_GENUS
                    # si species don't have a meaningful "valid name", but preserve what's there now
                    data['data']['si_valid_name'] = data['valid_name']
                else:
                    # this will be wrong in the few cases where a high-ranked name is listed
                    # as a synonym of a family-group name. Don't see a way to correct that
                    # programmatically.
                    data['group'] = helpers.group_of_rank(current_valid.rank)

                if data['status'] == STATUS_SYNONYM:
                    # For 'synHT' rows the first space-separated word of the
                    # valid name is dropped before comparing.
                    if data['kind'] == 'synHT':
                        valid_name = data['valid_name'].split(' ', 1)[1]
                    else:
                        valid_name = data['valid_name']
                    if valid_name != current_valid.valid_name:
                        raise Exception("Valid name of synonym does not match: " + data['valid_name'] + " and " + current_valid.valid_name)

                # shorten root name for family-group names
                if data['group'] == GROUP_FAMILY:
                    data['root_name'] = helpers.strip_rank(data['root_name'], current_valid.rank)
                del data['kind']
                data['data'] = json.dumps(remove_null(data['data']))

                # Detect whether a name object is already present (Principle of Coordination)
                nm = None
                if data['status'] == STATUS_VALID:
                    root_name = data['root_name']
                    if current_valid.rank == FAMILY:
                        nm = detect_super(root_name + 'oidea', SUPERFAMILY)
                    elif current_valid.rank == SUBFAMILY:
                        nm = detect_super(root_name + 'idae', FAMILY)
                    elif current_valid.rank == TRIBE:
                        nm = detect_super(root_name + 'inae', SUBFAMILY)
                    elif current_valid.rank == SUBTRIBE:
                        nm = detect_super(root_name + 'ini', TRIBE)
                    elif current_valid.rank == SUBGENUS:
                        nm = detect_super(root_name, GENUS)
                    elif current_valid.rank == SPECIES:
                        spg_name = helpers.spg_of_species(current_valid.valid_name)
                        nm = detect_super(spg_name, SPECIES_GROUP)
                    elif current_valid.rank == SUBSPECIES and helpers.is_nominate_subspecies(current_valid.valid_name):
                        sp_name = helpers.species_of_subspecies(current_valid.valid_name)
                        nm = detect_super(sp_name, SPECIES)
                    if nm is not None:
                        del data['taxon']
                        nm.add_additional_data(data)
                        # nm's Taxon should be the lowest-ranking one
                        nm.taxon = current_valid

                # create a new Name if none was found
                if nm is None:
                    nm = Name.create(**data)

                # set base_name field
                if data['status'] == STATUS_VALID:
                    current_valid.base_name = nm
                if 'additional_synonyms' in data:
                    group = helpers.group_of_rank(current_valid.rank)
                    for synonym in data['additional_synonyms']:
                        Name.create(taxon=current_valid, root_name=synonym, group=group, status=STATUS_SYNONYM)
            except Exception:
                traceback.print_exc()
                error_occurred = True
                # ignore error and happily go on with the next
        return not error_occurred
def create_root():
    '''Create the root taxon: rank ROOT, valid name 'root', marked as a page root.'''
    root_fields = {'rank': ROOT, 'valid_name': 'root', 'is_page_root': True}
    Taxon.create(**root_fields)
def test_taxon_large_scale_rpkm_table(self):
    """Check ``Taxon.rpkm_table`` on 120 taxa counted in two samples.

    Fixture: 2 superkingdoms x 3 phyla x 20 classes.  Taxon number ``i``
    gets gene1 counts when ``i % 3`` is 0 or 1 and gene2 counts when it is
    1 or 2.  Per taxon the (sample1, sample2) sums are therefore
    (0.001, 0.01), (0.003, 0.03) or (0.002, 0.02), a third of the taxa each.
    """

    def rpkm_values(table_rows):
        """Return each row's rpkm values as a list, in row and sample order."""
        return [list(per_sample.values()) for per_sample in table_rows.values()]

    def level_values(table_rows, val_lookup):
        """Translate the complete row keys into their level values."""
        return [val_lookup[complete_val] for complete_val in table_rows.keys()]

    def assert_all_rows(row_block, expected):
        """Assert that every row in ``row_block`` equals ``expected``."""
        for row in row_block:
            assert row == expected

    sample1 = Sample("P1993_101", None, None)
    sample2 = Sample("P1993_102", None, None)
    nr_samples = 2

    taxons = []
    for euk_i in range(2):
        for ph_i in range(3):
            for tc_i in range(20):
                taxons.append(Taxon(superkingdom="sk_{}".format(euk_i),
                                    phylum="ph_{}".format(ph_i),
                                    taxclass="tc_{}".format(tc_i)))
    self.session.add_all(taxons)
    self.session.commit()
    refresh_all_mat_views()

    for i, taxon in enumerate(taxons):
        count_mode = i % 3
        gene_counts = []
        gene1 = Gene("gene1{}".format(i), None, taxon_id=taxon.id)
        gene2 = Gene("gene2{}".format(i), None, taxon_id=taxon.id)
        if count_mode in (0, 1):
            gene_counts.append(GeneCount(gene1, sample1, 0.001))
            gene_counts.append(GeneCount(gene1, sample2, 0.01))
        if count_mode in (1, 2):
            gene_counts.append(GeneCount(gene2, sample1, 0.002))
            gene_counts.append(GeneCount(gene2, sample2, 0.02))
        self.session.add_all(gene_counts)
        self.session.add(gene1)
        self.session.add(gene2)
    self.session.commit()
    refresh_all_mat_views()

    samples, rows, complete_val_to_val = Taxon.rpkm_table()
    assert len(samples) == 2
    assert len(rows) == 2  # Number of unique superkingdoms

    samples, rows, complete_val_to_val = Taxon.rpkm_table(level="phylum")
    assert len(samples) == 2
    assert len(rows) == 6  # Number of unique down to phylum

    samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass")
    assert len(samples) == 2
    assert len(rows) == 20  # Default limit

    samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass", limit=None)
    assert len(samples) == 2
    assert len(rows) == 120  # Number of unique down to taxclass

    samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass", limit=None)
    for sample_d in rows.values():
        # sample_d should be a ordered dict
        assert ["P1993_101", "P1993_102"] == [s.scilifelab_code for s in sample_d.keys()]

    rpkms = rpkm_values(rows)
    rpkms_flat = [value for rpkm_row in rpkms for value in rpkm_row]
    assert len(rpkms_flat) == 2 * 3 * 20 * nr_samples

    # Annotations sorted by total rpkm over all samples
    # and the rpkm values should be summed over all genes for that taxon
    # there should be roughly equal numbers of the three different counts
    assert_all_rows(rpkms[:40], [0.003, 0.03])
    assert_all_rows(rpkms[40:80], [0.002, 0.02])
    assert_all_rows(rpkms[80:120], [0.001, 0.01])

    # possible to filter on specific level values at superkingdom
    for level_val in ["sk_0", "sk_1"]:
        samples, rows, complete_val_to_val = Taxon.rpkm_table(
            limit=None, top_level_complete_values=[level_val],
            top_level="superkingdom", level="phylum")
        assert len(rows) == 3
        assert level_values(rows, complete_val_to_val) == ["ph_2", "ph_0", "ph_1"]

        samples, rows, complete_val_to_val = Taxon.rpkm_table(
            limit=None, top_level_complete_values=[level_val],
            top_level="superkingdom", level="taxclass")
        assert len(rows) == 3*20

    # possible to filter on specific level values at phylum
    for sk_level_val in ["sk_0", "sk_1"]:
        for ph_level_val in ["ph_0", "ph_1", "ph_2"]:
            top_level_complete_value = "{};{}".format(sk_level_val, ph_level_val)
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=[top_level_complete_value],
                top_level="phylum", level="phylum")
            assert len(rows) == 1
            assert level_values(rows, complete_val_to_val) == [ph_level_val]

            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=[top_level_complete_value],
                top_level="phylum", level="taxclass")
            assert len(rows) == 20

    # possible to filter on multiple specific level values at phylum
    for sk_level_val in ["sk_0", "sk_1"]:
        for ph_level_vals in itertools.combinations(["ph_0", "ph_1", "ph_2"], 2):
            top_level_complete_values = [
                "{};{}".format(sk_level_val, ph_level_val)
                for ph_level_val in ph_level_vals
            ]
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=top_level_complete_values,
                top_level="phylum", level="phylum")
            assert len(rows) == 2
            assert sorted(level_values(rows, complete_val_to_val)) == sorted(list(ph_level_vals))

            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                limit=None, top_level_complete_values=top_level_complete_values,
                top_level="phylum", level="taxclass")
            assert len(rows) == 40

    # possible to filter on specific level values at taxclass
    for sk_level_val in ["sk_0", "sk_1"]:
        for ph_level_val in ["ph_0", "ph_1", "ph_2"]:
            for tc_level_val in ["tc_{}".format(i) for i in range(20)]:
                top_level_complete_value = "{};{};{}".format(sk_level_val, ph_level_val, tc_level_val)
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    limit=None, top_level_complete_values=[top_level_complete_value],
                    top_level="taxclass", level="taxclass")
                assert len(rows) == 1

    # possible to filter on multiple specific level values at taxclass
    for sk_level_val in ["sk_0", "sk_1"]:
        for ph_level_val in ["ph_0", "ph_1", "ph_2"]:
            for tc_level_vals in itertools.combinations(["tc_{}".format(i) for i in range(5)], 4):
                top_level_complete_values = [
                    "{};{};{}".format(sk_level_val, ph_level_val, tc_level_val)
                    for tc_level_val in tc_level_vals
                ]
                samples, rows, complete_val_to_val = Taxon.rpkm_table(
                    limit=None, top_level_complete_values=top_level_complete_values,
                    top_level="taxclass", level="taxclass")
                assert len(rows) == 4

    # possible to filter on samples
    for sample in [sample1, sample2]:
        samples, rows, complete_val_to_val = Taxon.rpkm_table(
            samples=[sample.scilifelab_code], level="taxclass", limit=None)
        assert len(rows) == 120
        assert len(samples) == 1
        assert samples[0] == sample
        for sample_d in rows.values():
            assert list(sample_d.keys()) == [sample]

        # The helper keeps its own local names, so the loop variable
        # ``sample`` is not rebound by a comprehension.
        rpkms = rpkm_values(rows)
        if sample.scilifelab_code == "P1993_101":
            high, middle, low = [0.003], [0.002], [0.001]
        else:
            high, middle, low = [0.03], [0.02], [0.01]
        assert_all_rows(rpkms[:40], high)
        assert_all_rows(rpkms[40:80], middle)
        assert_all_rows(rpkms[80:120], low)

    # possible to filter on sample and taxon at the same time
    for sample in [sample1, sample2]:
        for sk_level_val in ["sk_0", "sk_1"]:
            top_level_complete_value = sk_level_val
            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                samples=[sample.scilifelab_code], limit=None,
                top_level_complete_values=[top_level_complete_value],
                top_level="superkingdom", level="phylum")
            assert len(samples) == 1
            assert samples[0] == sample
            for sample_d in rows.values():
                assert list(sample_d.keys()) == [sample]
            assert len(rows) == 3
            assert level_values(rows, complete_val_to_val) == ["ph_2", "ph_0", "ph_1"]

            samples, rows, complete_val_to_val = Taxon.rpkm_table(
                samples=[sample.scilifelab_code], limit=None,
                top_level_complete_values=[top_level_complete_value],
                top_level="superkingdom", level="taxclass")
            assert len(rows) == 3*20

            rpkms = rpkm_values(rows)
            if sample.scilifelab_code == "P1993_101":
                high, middle, low = [0.003], [0.002], [0.001]
            else:
                high, middle, low = [0.03], [0.02], [0.01]
            # 60 rows per superkingdom, 20 for each count pattern.
            assert_all_rows(rpkms[:20], high)
            assert_all_rows(rpkms[20:40], middle)
            assert_all_rows(rpkms[40:60], low)
def test_taxon(self):
    """Check the gene/taxon relation, ``Taxon.rpkm`` and ``Taxon.rpkm_table``.

    Genes, samples, gene counts and taxa are added step by step; after each
    step the per-sample rpkm of the affected taxa is asserted.  Finally the
    aggregated table is checked at the default level and at phylum level.
    """
    ref_assembly = ReferenceAssembly("Version 1")
    gene1 = Gene("gene1", ref_assembly)
    sample1 = Sample("P1993_101", None, None)
    gene_count1 = GeneCount(gene1, sample1, 0.001)
    taxon1 = Taxon(superkingdom="Bacteria", phylum="Proteobacteria")
    gene1.taxon = taxon1
    self.session.add(gene1)
    self.session.add(taxon1)
    self.session.add(sample1)
    self.session.add(gene_count1)
    self.session.commit()

    gene1 = Gene.query.first()
    taxon1 = Taxon.query.first()

    assert gene1.taxon == taxon1
    assert gene1 in taxon1.genes
    assert taxon1.superkingdom == 'Bacteria'
    assert taxon1.phylum == 'Proteobacteria'
    assert taxon1.taxclass == ''
    assert taxon1.full_taxonomy == 'Bacteria;Proteobacteria;;;;;;'
    refresh_all_mat_views()

    # Test sample count retrieval
    sample2 = Sample("P1993_102", None, None)
    self.session.add(sample2)
    self.session.commit()
    refresh_all_mat_views()
    assert taxon1.rpkm == {sample1: 0.001}

    gene_count2 = GeneCount(gene1, sample2, 0.2)
    self.session.add(gene_count2)
    self.session.commit()
    refresh_all_mat_views()
    assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

    gene2 = Gene("gene2", ref_assembly)
    gene_count3 = GeneCount(gene2, sample2, 0.1)
    self.session.add(gene2)
    self.session.add(gene_count3)
    self.session.commit()
    refresh_all_mat_views()
    # taxon1.rpkm should still be the same since the new gene is not connected to taxon1
    assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

    taxon2 = Taxon(superkingdom="Eukaryota", phylum="Chlorophyta")
    gene2.taxon = taxon2
    self.session.add(taxon2)
    self.session.add(gene2)
    self.session.commit()
    refresh_all_mat_views()
    # Taxon2 should have gene_count3 stats only
    assert taxon2.rpkm == {sample2: 0.1}

    gene3 = Gene("gene3", ref_assembly, taxon_id=taxon1.id)
    gene_count4 = GeneCount(gene3, sample1, 1.0)
    self.session.add(gene3)
    self.session.add(gene_count4)
    self.session.commit()
    # Taxon1 should now have the original stats plus gene_count4
    assert taxon1.rpkm == {sample1: 1.001, sample2: 0.2}

    taxon3 = Taxon(superkingdom="Eukaryota", phylum="Unnamed", taxclass="Dinophyceae")
    self.session.add(taxon3)
    self.session.commit()
    gene4 = Gene("gene4", ref_assembly, taxon_id=taxon3.id)
    gene_count5 = GeneCount(gene4, sample2, 0.003)
    self.session.add(gene4)
    self.session.add(gene_count5)
    self.session.commit()
    refresh_all_mat_views()

    # theoretical rpkm_table:
    # samples = [sample1, sample2]
    # rpkm_table = {"Bacteria": {"P1993_101": 1.001, "P1993_102": 0.2}, "Eukaryota": {"P1993_102": 0.103}}
    samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table()
    assert samples == [sample1, sample2]
    # Sorted by summed rpkm
    assert [complete_val_to_val[key] for key in rpkm_table.keys()] == ["Bacteria", "Eukaryota"]
    assert rpkm_table["Bacteria"] == {sample1: 1.001, sample2: 0.2}
    assert rpkm_table["Eukaryota"] == {sample2: 0.103}

    samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table(level='phylum')
    assert samples == [sample1, sample2]
    # Sorted by summed rpkm
    assert [complete_val_to_val[key] for key in rpkm_table.keys()] == ["Proteobacteria", "Chlorophyta", "Unnamed"]
    assert rpkm_table["Bacteria;Proteobacteria"] == {sample1: 1.001, sample2: 0.2}
    assert rpkm_table["Eukaryota;Chlorophyta"] == {sample2: 0.1}
    assert rpkm_table["Eukaryota;Unnamed"] == {sample2: 0.003}