def tearDown(self):
    """Remove the ape taxa created for the test.

    Works in two passes over ``self.apes``: every taxon is first detached
    from its supertaxon and saved, and only then are the taxa deleted.
    Deleting while supertaxon links are still set would raise a
    CascadedDeleteError.
    """
    # Pass 1: clear and persist the parent link on every ape.
    for ape in self.apes:
        ape.supertaxon = None
        ape.save()
    # Pass 2: no ape references a parent any more, so deletion is safe.
    for ape in self.apes:
        Taxon.delete(ape)
Example #2
0
def taxonomy_tree():
    """Render the top of the taxonomy tree.

    The page starts at the ``superkingdom`` level and is seeded with
    whatever ``Taxon.top_entry_taxa()`` returns.
    """
    return render_template(
        'taxon_tree.html',
        node_level="superkingdom",
        node_values=Taxon.top_entry_taxa(),
    )
def add_taxon(tsn):
    """Build a Taxon for an ITIS TSN from the ITIS web service.

    Four ITIS calls are made, in this order: scientific name, rank name,
    parent TSN and common names.  The Taxon is populated from them and
    returned; this function does not save it.

    Example endpoint:
    http://www.itis.gov/ITISWebService/services/ITISService/getScientificNameFromTSN?tsn=531894

    :param tsn: ITIS taxonomic serial number of the taxon to build.
    :returns: a ``Taxon`` with ``tsn``, ``name``, ``rank``, ``common_names``
        and, when the parent is already in the database, ``supertaxon`` set.
    """
    taxon = Taxon(tsn=tsn)

    # Scientific name: the last non-empty unitName1..unitName4 wins.
    name_doc = _get_itis('getScientificNameFromTSN', {'tsn': tsn})
    last_unit_name = None
    for unit_index in range(1, 5):
        unit_nodes = name_doc.xpath(
            "//ns:return/ax:unitName%d" % unit_index,
            namespaces=itis_namespaces)
        unit_text = unit_nodes[0].text
        if unit_text:
            last_unit_name = unit_text
    taxon.name = last_unit_name

    # Rank: ITIS rank name translated through Taxon.ITIS_RANKS.
    rank_doc = _get_itis('getTaxonomicRankNameFromTSN', {'tsn': tsn})
    rank_nodes = rank_doc.xpath(
        "//ns:return/ax:rankName", namespaces=itis_namespaces)
    taxon.rank = Taxon.ITIS_RANKS[rank_nodes[0].text]

    # Parent: linked only when a Taxon with the parent TSN already exists.
    parent_doc = _get_itis('getParentTSNFromTSN', {'tsn': tsn})
    parent_node = parent_doc.xpath(
        "//ns:return/ax:parentTsn", namespaces=itis_namespaces)[0]
    parent_tsn = int(parent_node.text)
    known_parents = Taxon.objects.filter(tsn=parent_tsn)
    if known_parents.exists():
        taxon.supertaxon = known_parents[0]

    # Common names: keep the English ones, skipping nil and empty entries.
    names_doc = _get_itis('getCommonNamesFromTSN', {'tsn': tsn})
    name_nodes = names_doc.xpath(
        "//ns:return/ax:commonNames", namespaces=itis_namespaces)
    english_names = []
    for node in name_nodes:
        entry = objectify.fromstring(etree.tostring(node))

        nil_attr = '{%s}nil' % itis_namespaces['xsi']
        if entry.attrib.get(nil_attr) == 'true':
            continue

        if not len(entry) or entry.language != "English":
            continue

        english_names.append(unicode(entry.commonName))
    taxon.common_names = ", ".join(english_names)

    return taxon
Example #4
0
def taxon_table():
    """Render the taxon RPKM table for the requested taxonomy level.

    Query-string parameters (read from ``request.args``):
        taxon_level: level to tabulate; falls back to ``'superkingdom'``
            when missing or not in ``Taxon.level_order``.
        parent_values[]: complete values of the parent taxa to restrict
            the table to; dropped when ``parent_level`` is not a known level.
        parent_level: level the parent values belong to.
        row_limit: ``'20'``, ``'50'``, ``'100'`` or ``'all'``; anything
            else falls back to 20.

    Returns:
        The rendered ``taxon_table.html`` template.
    """
    taxonomy_levels = Taxon.level_order

    taxon_level = request.args.get('taxon_level', 'superkingdom')
    parent_values = request.args.getlist('parent_values[]', None)
    parent_level = request.args.get('parent_level', None)
    row_limit = request.args.get('row_limit', 20)

    if taxon_level not in taxonomy_levels:
        taxon_level = 'superkingdom'
    if parent_level not in taxonomy_levels:
        parent_values = None
    if row_limit not in ('20', '50', '100', 'all'):
        row_limit = 20

    # Translate to model language: None means "no limit".  Query-string
    # values arrive as strings, so convert them; the model is otherwise
    # handed an int on the default path and a str on the user-supplied path.
    # row_limit itself is left untouched for the template.
    limit = None if row_limit == 'all' else int(row_limit)

    ### Manual limit to only lmo ###
    sample_set = SampleSet.query.filter(SampleSet.name == 'lmo')[0]
    sample_scilifelab_codes = [s.scilifelab_code for s in sample_set.samples]

    samples, table, complete_val_to_val = Taxon.rpkm_table(
        level=taxon_level,
        top_level_complete_values=parent_values,
        top_level=parent_level,
        samples=sample_scilifelab_codes,
        limit=limit)

    # One row per taxon, with the values laid out in the order of `samples`
    # so the template can emit the columns positionally.
    sorted_table = OrderedDict(
        (complete_taxon, [sample_d[sample] for sample in samples])
        for complete_taxon, sample_d in table.items())

    return render_template('taxon_table.html',
            table=table,
            samples=samples,
            sorted_table=sorted_table,
            sample_scilifelab_codes=sample_scilifelab_codes,
            complete_val_to_val=complete_val_to_val,
            taxonomy_levels=taxonomy_levels,
            current_level=taxon_level,
            row_limit=row_limit
        )
def itis_search(request):
    """Search ITIS with the request's GET parameters and render the matches.

    Each ITIS match is turned into a ``Taxon``: the stored one when a taxon
    with that TSN already exists, otherwise a new instance filled from the
    ITIS record.  This function does not save either kind.  Taxa whose rank
    value is greater than 2 are skipped.  English common names are attached
    to every result.

    :param request: Django request; ``request.GET`` is forwarded to ITIS.
    :returns: ``HttpResponse`` holding the rendered
        ``taxons/itis_search_results.html`` fragment.
    """
    xml_doc = _get_itis('searchForAnyMatch', request.GET)

    match_elements = xml_doc.xpath("//ns:return/ax:anyMatchList", namespaces=itis_namespaces)

    results = []
    for e in match_elements:
        # TODO better way to transmute an etree element into a objectified
        # element
        o = objectify.fromstring(etree.tostring(e))
        if not len(o):  # no matches were returned
            break

        try:
            taxon = Taxon.objects.get(tsn=o.tsn)
        except Taxon.DoesNotExist:
            # Only a missing row means "new taxon".  A bare except would also
            # hide database errors and KeyboardInterrupt.
            taxon = Taxon()

            taxon.tsn = o.tsn
            taxon.name = o.sciName
            taxon.itis_rank = itis_get_rank(o.tsn)
            taxon.rank = Taxon.ITIS_RANKS[taxon.itis_rank]

        # skip taxa above Order
        if taxon.rank > 2:
            continue

        common_names = [
            unicode(name.commonName)
            for name in o.commonNameList.commonNames
            if len(name) and name.language == 'English'
        ]
        taxon.common_names = ', '.join(common_names)

        results.append(taxon)

    # render an HTML fragment that can be inserted into the taxon_import page
    fragment = render_to_string('taxons/itis_search_results.html', {
        'results': results,
        'MEDIA_URL': settings.MEDIA_URL,
    })
    # content_type is the supported spelling; the mimetype keyword was
    # removed from HttpResponse in Django 1.7.
    return HttpResponse(fragment, content_type="text/plain")
Example #6
0
def taxon_tree_nodes(parent_level, parent_value):
    """Render the child nodes of one node in the taxonomy tree.

    ``Taxon.tree_nodes`` supplies the level and values of the children of
    the node identified by ``parent_level`` / ``parent_value``.
    """
    node_level, node_values = Taxon.tree_nodes(parent_level, parent_value)
    return render_template(
        'taxon_tree_nodes.html',
        node_level=node_level,
        node_values=node_values,
    )
Example #7
0
def detect_super(name, rank):
	"""Return the base_name of the taxon with this valid name and rank.

	Gives None when no such Taxon exists.
	"""
	try:
		match = Taxon.get(Taxon.valid_name == name, Taxon.rank == rank)
		base_name = match.base_name
	except Taxon.DoesNotExist:
		base_name = None
	return base_name
Example #8
0
def read_file(filename):
	"""Import taxa and names from a CSV file into the database.

	The first row is a header; its first cell (A1) may name the parent of
	the root taxon in the file. Each following row is parsed with parse_row.
	Rows with a valid status create a Taxon, placed under the right parent by
	means of a stack of ancestors. Rows create a Name, or are merged into an
	existing Name found through detect_super.

	A row that raises is reported with a traceback and skipped; the import
	continues with the next row.

	Returns True when no row raised, False otherwise.
	"""
	with codecs.open(filename, mode='r') as file:
		reader = csv.reader(file)
		# consume the header row (Python 2 iterator API)
		first_line = reader.next()

		# maintain stack of taxa that are parents of the current taxon
		stack = []
		# name of parent of root taxon should be in cell A1
		root_name = first_line[0]
		if root_name != '':
			root_parent = Taxon.filter(Taxon.valid_name == root_name)[0]
			stack.append(root_parent)

		# current valid taxon (for synonyms)
		current_valid = None
		# whether current taxon should be marked as root of a page
		# (True only until the first valid taxon has been created)
		is_page_root = True
		error_occurred = False
		for row in reader:
			try:
				# ignore blank rows
				if row[14] == '' and row[8] == '':
					continue
				data = parse_row(row)
				# deal with "nHT", which is a pain. Some of these may need to be manually
				# readded to the DB
				if data['kind'] == 'nHT':
					if data['valid_name'] in ignored_nHT:
						continue
					else:
						raise Exception("Unrecognized nHT: " + str(data))
				# nsgen is i.s., just ignore
				if data['kind'] == 'nsgen':
					continue

				if data['status'] == STATUS_VALID:
					# get stuff off the stack
					rank = data['rank']
					if rank == ROOT:
						current_valid = Taxon.create(
							valid_name=data['valid_name'], age=data['age'],
							rank=data['rank'], is_page_root=True
						)
					else:
						# TODO: make this somehow unranked-clade-aware
						# pop entries whose rank value is not greater than the new
						# taxon's, so that stack[-1] is the parent used below
						while rank >= stack[-1].rank:
							stack.pop()
						# create new Taxon
						current_valid = Taxon.create(
							valid_name=data['valid_name'], age=data['age'],
							rank=data['rank'], parent=stack[-1], is_page_root=is_page_root
						)
					if is_page_root:
						is_page_root = False
					stack.append(current_valid)
				# create new Name
				# (non-valid rows attach to the most recently created valid taxon)
				data['taxon'] = current_valid
				if data['status'] == STATUS_DUBIOUS:
					# current system is inadequate for properly marking si species
					# assume there's only genera and species
					if ' ' in data['valid_name']:
						data['group'] = GROUP_SPECIES
					else:
						data['group'] = GROUP_GENUS
					# si species don't have a meaningful "valid name", but preserve what's there now
					data['data']['si_valid_name'] = data['valid_name']
				else:
					# this will be wrong in the few cases where a high-ranked name is listed
					# as a synonym of a family-group name. Don't see a way to correct that
					# programmatically.
					data['group'] = helpers.group_of_rank(current_valid.rank)
					if data['status'] == STATUS_SYNONYM:
						# for synHT rows the first word of the valid name is dropped
						# before comparing it with the current valid taxon
						if data['kind'] == 'synHT':
							valid_name = data['valid_name'].split(' ', 1)[1]
						else:
							valid_name = data['valid_name']
						if valid_name != current_valid.valid_name:
							raise Exception("Valid name of synonym does not match: " + data['valid_name'] + " and " + current_valid.valid_name)
				# shorten root name for family-group names
				if data['group'] == GROUP_FAMILY:
					data['root_name'] = helpers.strip_rank(data['root_name'], current_valid.rank)
				# 'kind' is dropped and 'data' serialized before the dict is
				# handed to Name.create / add_additional_data below
				del data['kind']
				data['data'] = json.dumps(remove_null(data['data']))

				# Detect whether a name object is already present (Principle of Coordination)
				nm = None
				if data['status'] == STATUS_VALID:
					root_name = data['root_name']
					# look for the coordinate taxon one step up, built from the
					# root name plus that rank's suffix where applicable
					if current_valid.rank == FAMILY:
						nm = detect_super(root_name + 'oidea', SUPERFAMILY)
					elif current_valid.rank == SUBFAMILY:
						nm = detect_super(root_name + 'idae', FAMILY)
					elif current_valid.rank == TRIBE:
						nm = detect_super(root_name + 'inae', SUBFAMILY)
					elif current_valid.rank == SUBTRIBE:
						nm = detect_super(root_name + 'ini', TRIBE)
					elif current_valid.rank == SUBGENUS:
						nm = detect_super(root_name, GENUS)
					elif current_valid.rank == SPECIES:
						spg_name = helpers.spg_of_species(current_valid.valid_name)
						nm = detect_super(spg_name, SPECIES_GROUP)
					elif current_valid.rank == SUBSPECIES and helpers.is_nominate_subspecies(current_valid.valid_name):
						sp_name = helpers.species_of_subspecies(current_valid.valid_name)
						nm = detect_super(sp_name, SPECIES)
					if nm is not None:
						del data['taxon']
						nm.add_additional_data(data)
						# nm's Taxon should be the lowest-ranking one
						nm.taxon = current_valid

				# create a new Name if none was found
				if nm is None:
					nm = Name.create(**data)

				# set base_name field
				# NOTE(review): no explicit save() follows these attribute
				# assignments in this function - confirm they are persisted
				if data['status'] == STATUS_VALID:
					current_valid.base_name = nm
				if 'additional_synonyms' in data:
					group = helpers.group_of_rank(current_valid.rank)
					for synonym in data['additional_synonyms']:
						Name.create(taxon=current_valid, root_name=synonym, group=group, status=STATUS_SYNONYM)

			except Exception:
				traceback.print_exc()
				error_occurred = True
				# ignore error and happily go on with the next
	return not error_occurred
Example #9
0
def create_root():
	"""Create the taxon named 'root' at rank ROOT, marked as a page root."""
	Taxon.create(
		valid_name='root',
		rank=ROOT,
		is_page_root=True,
	)
Example #10
0
    def test_taxon_large_scale_rpkm_table(self):
        """Exercise Taxon.rpkm_table on 120 taxa and two samples.

        Taxa are the 2 x 3 x 20 combinations of superkingdom, phylum and
        taxclass.  Every taxon gets two genes; which of them carry counts
        cycles with the taxon index (see count_mode below).  The test checks
        row counts per level, the default and disabled row limit, row
        ordering, and filtering by parent taxon values, by sample, and by
        both at once.
        """
        sample1 = Sample("P1993_101", None, None)
        sample2 = Sample("P1993_102", None, None)
        nr_samples = 2
        taxons = []
        for euk_i in range(2):
            for ph_i in range(3):
                for tc_i in range(20):
                    taxons.append(Taxon(superkingdom="sk_{}".format(euk_i),
                        phylum="ph_{}".format(ph_i),
                        taxclass="tc_{}".format(tc_i)))

        self.session.add_all(taxons)
        self.session.commit()
        refresh_all_mat_views()

        for i,taxon in enumerate(taxons):
            # count_mode 0: only gene1 has counts; 1: both genes; 2: only gene2
            count_mode = i % 3
            gene_counts = []

            gene1 = Gene("gene1{}".format(i), None, taxon_id=taxon.id)
            gene2 = Gene("gene2{}".format(i), None, taxon_id=taxon.id)

            if count_mode in [0,1]:
                gene_counts.append(GeneCount(gene1, sample1, 0.001))
                gene_counts.append(GeneCount(gene1, sample2, 0.01))
            if count_mode in [1,2]:
                gene_counts.append(GeneCount(gene2, sample1, 0.002))
                gene_counts.append(GeneCount(gene2, sample2, 0.02))

            self.session.add_all(gene_counts)

            self.session.add(gene1)
            self.session.add(gene2)

        self.session.commit()
        refresh_all_mat_views()

        # called without arguments the table is built at superkingdom level
        samples, rows, complete_val_to_val = Taxon.rpkm_table()
        assert len(samples) == 2
        assert len(rows) == 2 # Number of unique superkingdoms

        samples, rows, complete_val_to_val = Taxon.rpkm_table(level="phylum")
        assert len(samples) == 2
        assert len(rows) == 6 # Number of unique down to phylum

        samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass")
        assert len(samples) == 2
        assert len(rows) == 20 # Default limit

        samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass", limit=None)
        assert len(samples) == 2
        assert len(rows) == 120 # Number of unique down to taxclass

        samples, rows, complete_val_to_val = Taxon.rpkm_table(level="taxclass", limit=None)

        for taxon, sample_d in rows.items():
            # sample_d should be a ordered dict
            assert ["P1993_101", "P1993_102"] == [sample.scilifelab_code for sample in sample_d.keys()]
        rpkms = [[rpkm for sample, rpkm in sample_d.items()] for taxon, sample_d in rows.items()]

        rpkms_flat = []
        for rpkm_row in rpkms:
            rpkms_flat += rpkm_row

        assert len(rpkms_flat) == 2 * 3 * 20 * nr_samples

        # Annotations sorted by total rpkm over all samples
        # and the rpkm values should be summed over all genes for that taxon
        # there should be roughly equal numbers of the three different counts
        # (expected order: both genes counted, then gene2 only, then gene1 only)
        for i, row in enumerate(rpkms[:40]):
            assert row == [0.003, 0.03]
        for row in rpkms[40:80]:
            assert row == [0.002, 0.02]
        for row in rpkms[80:120]:
            assert row == [0.001, 0.01]

        # possible to filter on specific level values at superkingdom
        for level_val in ["sk_0", "sk_1"]:
            samples, rows, complete_val_to_val = Taxon.rpkm_table(limit=None, top_level_complete_values=[level_val], top_level="superkingdom", level="phylum")
            assert len(rows) == 3
            level_vals = [complete_val_to_val[complete_val] for complete_val in rows.keys()]
            assert level_vals == ["ph_2", "ph_0", "ph_1"]
            samples, rows, complete_val_to_val = Taxon.rpkm_table(limit=None, top_level_complete_values=[level_val], top_level="superkingdom", level="taxclass")
            assert len(rows) == 3*20


        # possible to filter on specific level values at phylum
        for sk_level_val in ["sk_0", "sk_1"]:
            for ph_level_val in ["ph_0", "ph_1", "ph_2"]:
                # complete values are the level values joined with ';'
                top_level_complete_value="{};{}".format(sk_level_val, ph_level_val)
                samples, rows, complete_val_to_val = Taxon.rpkm_table(limit=None, top_level_complete_values=[top_level_complete_value], top_level="phylum", level="phylum")
                assert len(rows) == 1
                level_vals = [complete_val_to_val[complete_val] for complete_val in rows.keys()]
                assert level_vals == [ph_level_val]
                samples, rows, complete_val_to_val = Taxon.rpkm_table(limit=None, top_level_complete_values=[top_level_complete_value], top_level="phylum", level="taxclass")
                assert len(rows) == 20

        # possible to filter on multiple specific level values at phylum
        for sk_level_val in ["sk_0", "sk_1"]:
            for ph_level_vals in itertools.combinations(["ph_0", "ph_1", "ph_2"], 2):
                top_level_complete_values = []
                for ph_level_val in ph_level_vals:
                    top_level_complete_values.append("{};{}".format(sk_level_val, ph_level_val))
                samples, rows, complete_val_to_val = Taxon.rpkm_table(limit=None, top_level_complete_values=top_level_complete_values, top_level="phylum", level="phylum")
                assert len(rows) == 2
                level_vals = [complete_val_to_val[complete_val] for complete_val in rows.keys()]
                assert sorted(level_vals) == sorted(list(ph_level_vals))
                samples, rows, complete_val_to_val = Taxon.rpkm_table(limit=None, top_level_complete_values=top_level_complete_values, top_level="phylum", level="taxclass")
                assert len(rows) == 40

        # possible to filter on specific level values at taxclass
        for sk_level_val in ["sk_0", "sk_1"]:
            for ph_level_val in ["ph_0", "ph_1", "ph_2"]:
                for tc_level_val in ["tc_{}".format(i) for i in range(20)]:
                    top_level_complete_value="{};{};{}".format(sk_level_val, ph_level_val, tc_level_val)
                    samples, rows, complete_val_to_val = Taxon.rpkm_table(limit=None, top_level_complete_values=[top_level_complete_value], top_level="taxclass", level="taxclass")
                    assert len(rows) == 1

        # possible to filter on multiple specific level values at taxclass
        for sk_level_val in ["sk_0", "sk_1"]:
            for ph_level_val in ["ph_0", "ph_1", "ph_2"]:
                for tc_level_vals in itertools.combinations(["tc_{}".format(i) for i in range(5)], 4):
                    top_level_complete_values = []
                    for tc_level_val in tc_level_vals:
                        top_level_complete_values.append("{};{};{}".format(sk_level_val, ph_level_val, tc_level_val))
                    samples, rows, complete_val_to_val = Taxon.rpkm_table(limit=None, top_level_complete_values=top_level_complete_values, top_level="taxclass", level="taxclass")
                    assert len(rows) == 4

        # possible to filter on samples
        for sample in [sample1, sample2]:
            samples, rows, complete_val_to_val = Taxon.rpkm_table(samples=[sample.scilifelab_code], level="taxclass", limit=None)
            assert len(rows) == 120
            assert len(samples) == 1
            assert samples[0] == sample
            for taxon, sample_d in rows.items():
                assert list(sample_d.keys()) == [sample]

            rpkms = [[rpkm for sample, rpkm in sample_d.items()] for taxon, sample_d in rows.items()]
            if sample.scilifelab_code == "P1993_101":
                for i, row in enumerate(rpkms[:40]):
                    assert row == [0.003]
                for row in rpkms[40:80]:
                    assert row == [0.002]
                for row in rpkms[80:120]:
                    assert row == [0.001]
            else:
                for row in rpkms[:40]:
                    assert row == [0.03]
                for row in rpkms[40:80]:
                    assert row == [0.02]
                for row in rpkms[80:120]:
                    assert row == [0.01]

        # possible to filter on sample and taxon at the same time
        for sample in [sample1, sample2]:
            for sk_level_val in ["sk_0", "sk_1"]:
                top_level_complete_value = sk_level_val
                samples, rows, complete_val_to_val = Taxon.rpkm_table(samples=[sample.scilifelab_code], limit=None, top_level_complete_values=[top_level_complete_value], top_level="superkingdom", level="phylum")
                assert len(samples) == 1
                assert samples[0] == sample
                for taxon, sample_d in rows.items():
                    assert list(sample_d.keys()) == [sample]

                assert len(rows) == 3
                level_vals = [complete_val_to_val[complete_val] for complete_val in rows.keys()]
                assert level_vals == ["ph_2", "ph_0", "ph_1"]
                samples, rows, complete_val_to_val = Taxon.rpkm_table(samples=[sample.scilifelab_code], limit=None, top_level_complete_values=[top_level_complete_value], top_level="superkingdom", level="taxclass")
                assert len(rows) == 3*20


                rpkms = [[rpkm for sample, rpkm in sample_d.items()] for annotation, sample_d in rows.items()]
                if sample.scilifelab_code == "P1993_101":
                    for row in rpkms[:20]:
                        assert row == [0.003]
                    for row in rpkms[20:40]:
                        assert row == [0.002]
                    for row in rpkms[40:60]:
                        assert row == [0.001]
                else:
                    for row in rpkms[:20]:
                        assert row == [0.03]
                    for row in rpkms[20:40]:
                        assert row == [0.02]
                    # NOTE(review): only 60 rows are asserted above, so this
                    # slice effectively covers rpkms[40:60] like the branch above
                    for row in rpkms[40:80]:
                        assert row == [0.01]
Example #11
0
    def test_taxon(self):
        """Check Taxon/Gene relations, per-taxon rpkm sums and rpkm_table.

        Builds up genes, samples, gene counts and taxa step by step and,
        after each step, asserts what ``taxon.rpkm`` reports per sample.
        Finishes by checking ``Taxon.rpkm_table`` at the default level and
        at phylum level.
        """
        ref_assembly = ReferenceAssembly("Version 1")
        gene1 = Gene("gene1", ref_assembly)

        sample1 = Sample("P1993_101", None, None)
        # NOTE(review): reference_assembly is not used in the visible part of
        # this test; ref_assembly above is the one passed to the genes
        reference_assembly = ReferenceAssembly("version 1")
        gene_count1 = GeneCount(gene1, sample1, 0.001)
        taxon1 = Taxon(superkingdom="Bacteria", phylum="Proteobacteria")
        gene1.taxon = taxon1
        self.session.add(gene1)
        self.session.add(taxon1)
        self.session.add(sample1)
        self.session.add(gene_count1)
        self.session.commit()

        gene1 = Gene.query.first()
        taxon1 = Taxon.query.first()

        assert gene1.taxon == taxon1
        assert gene1 in taxon1.genes
        assert taxon1.superkingdom == 'Bacteria'
        assert taxon1.phylum == 'Proteobacteria'
        # levels that were not given come back as empty strings
        assert taxon1.taxclass == ''
        assert taxon1.full_taxonomy == 'Bacteria;Proteobacteria;;;;;;'
        refresh_all_mat_views()

        # Test sample count retrieval
        # a sample without any gene counts must not show up in taxon1.rpkm
        sample2 = Sample("P1993_102", None, None)
        self.session.add(sample2)
        self.session.commit()
        refresh_all_mat_views()
        assert taxon1.rpkm == {sample1: 0.001}

        gene_count2 = GeneCount(gene1, sample2, 0.2)
        self.session.add(gene_count2)
        self.session.commit()
        refresh_all_mat_views()
        assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

        gene2 = Gene("gene2", ref_assembly)
        gene_count3 = GeneCount(gene2, sample2, 0.1)

        self.session.add(gene2)
        self.session.add(gene_count3)
        self.session.commit()
        refresh_all_mat_views()

        # taxon1.rpkm should still be the same since the new gene is not connected to taxon1
        assert taxon1.rpkm == {sample1: 0.001, sample2: 0.2}

        taxon2 = Taxon(superkingdom="Eukaryota", phylum="Chlorophyta")
        gene2.taxon = taxon2
        self.session.add(taxon2)
        self.session.add(gene2)
        self.session.commit()
        refresh_all_mat_views()

        # Taxon2 should have gene_count3 stats only
        assert taxon2.rpkm == {sample2: 0.1}

        gene3 = Gene("gene3", ref_assembly, taxon_id=taxon1.id)
        gene_count4 = GeneCount(gene3, sample1, 1.0)

        self.session.add(gene3)
        self.session.add(gene_count4)
        self.session.commit()
        # NOTE(review): unlike the other steps, no refresh_all_mat_views()
        # call precedes the next assertion - confirm this is intended

        # Taxon1 should now have the original stats plus gene_count4
        assert taxon1.rpkm == {sample1: 1.001, sample2: 0.2}


        taxon3 = Taxon(superkingdom="Eukaryota", phylum="Unnamed", taxclass="Dinophyceae")
        self.session.add(taxon3)
        self.session.commit()
        gene4 = Gene("gene4", ref_assembly, taxon_id=taxon3.id)
        gene_count5 = GeneCount(gene4, sample2, 0.003)

        self.session.add(gene4)
        self.session.add(gene_count5)
        self.session.commit()
        refresh_all_mat_views()

        # theoretical rpkm_table:
        # samples = [sample1, sample2]
        # rpkm_table = {"Bacteria": {"P1993_101": 1.001, "P1993_102": 0.2}, "Eukaryota": {"P1993_102": 0.103}}
        samples, rpkm_table, complete_val_to_val = Taxon.rpkm_table()
        assert samples == [sample1, sample2]
        assert [complete_val_to_val[complete_level_val] for complete_level_val in rpkm_table.keys()] == ["Bacteria", "Eukaryota"] # Sorted by summed rpkm
        assert rpkm_table[("Bacteria")] == {sample1: 1.001, sample2: 0.2}
        assert rpkm_table[("Eukaryota")] == {sample2: 0.103}

        # at phylum level the table keys are "superkingdom;phylum" strings
        samples, rpkm_table, complete_val_to_val= Taxon.rpkm_table(level='phylum')
        assert samples == [sample1, sample2]
        assert [complete_val_to_val[complete_level_val] for complete_level_val in rpkm_table.keys()] == ["Proteobacteria", "Chlorophyta", "Unnamed"] # Sorted by summed rpkm

        assert rpkm_table[("Bacteria;Proteobacteria")] == {sample1: 1.001, sample2: 0.2}
        assert rpkm_table[("Eukaryota;Chlorophyta")] == {sample2: 0.1}
        assert rpkm_table[("Eukaryota;Unnamed")] == {sample2: 0.003}