def new_nexus_without_sites(nexus_obj, sites_to_remove):
    """
    Returns a new NexusWriter instance holding the data of `nexus_obj`
    with the sites in `sites_to_remove` removed.

    The sites that are kept are renumbered consecutively from 0 in the
    output, and a comment listing the removed sites is added to it.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param sites_to_remove: A list of (zero-indexed) site numbers
    :type sites_to_remove: List

    :return: A NexusWriter instance

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    # make new nexus
    nexout = NexusWriter()
    nexout.add_comment(
        "Removed %d sites: %s" %
        (len(sites_to_remove), ",".join(["%s" % s for s in sites_to_remove]))
    )

    # Build the lookup once: the membership test below runs for every site,
    # and scanning a list each time would make the loop quadratic.
    skip = frozenset(sites_to_remove)

    new_sitepos = 0
    for sitepos in range(nexus_obj.data.nchar):
        if sitepos in skip:
            continue  # skip!
        for taxon, data in nexus_obj.data:
            nexout.add(taxon, new_sitepos, data[sitepos])
        # only advance the output position for sites that were kept
        new_sitepos += 1
    return nexout
def find_constant_sites(nexus_obj):
    """
    Returns a list of the constant sites in a nexus.

    A site is constant when, ignoring the '?' and '-' values, exactly one
    distinct state is observed across all taxa. Sites that contain only
    '?' / '-' are not reported.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A list of constant site positions (zero-indexed).

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    constant_positions = []
    for position in range(nexus_obj.data.nchar):
        seen = []
        for _taxon, sequence in nexus_obj.data:
            value = sequence[position]
            # '?' and '-' never count as an observed state
            if value not in ('?', '-') and value not in seen:
                seen.append(value)
        if len(seen) == 1:
            constant_positions.append(position)
    return constant_positions
def count_binary_set_size(nexus_obj):
    """
    Counts the number of sites by their size, where the size of a site is
    the number of taxa coded as '1' for it (i.e. how many sites have two
    members, etc).

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A Counter mapping set size to the number of sites of that size

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block

    e.g. {
        1: 100,
        2: 20,
    }
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    tally = Counter()
    for char in nexus_obj.data.characters.values():
        # size of this site = number of taxa whose state is '1'
        set_size = sum(1 for v in char.values() if v == '1')
        tally[set_size] += 1
    return tally
def tally_by_site(nexus_obj):
    """
    Groups the taxa by state for every site (i.e. site 1 has three taxa
    coded as "A", and 1 taxa coded as "G").

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A Dictionary of site -> state -> list of taxa

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block

    e.g. {
        'site1': {'state1': ['taxon1', 'taxon2'],
                  'state0': ['taxon3'], }
        'site2': {'state1': ['taxon2'],
                  'state0': ['taxon1', 'taxon3'], }
    }
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    tally = {}
    for site, data in nexus_obj.data.characters.items():
        by_state = tally.setdefault(site, {})
        for taxon, state in data.items():
            by_state.setdefault(state, []).append(taxon)
    return tally
def find_unique_sites(nexus_obj):
    """
    Returns a list of the unique sites in a binary nexus
    i.e. sites with only one taxon belonging to them.
        (this only really makes sense if the data is coded as
        presence/absence)

    Values of '?' and '-' are ignored when counting states. Each site
    position appears at most once in the result.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A list of unique site positions (zero-indexed).

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    unique = []
    for i in range(0, nexus_obj.data.nchar):
        members = {}
        for _taxon, characters in nexus_obj.data:
            c = characters[i]
            if c in ('?', '-'):
                continue  # missing data is not a state
            members[c] = members.get(c, 0) + 1

        # a character is unique if there's only two states
        # AND there's a state with 1 member
        # AND the state with 1 member is NOT the 0 (absence) state
        #
        # any() records the site once, even when both states are non-zero
        # and have a single member each (a plain loop appended it twice).
        if len(members) == 2 and any(
            state != '0' and count == 1 for state, count in members.items()
        ):
            unique.append(i)
    return unique
def count_site_values(nexus_obj, characters=('-', '?')):
    """
    Counts, for each taxon, the number of sites whose value is in
    `characters`.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param characters: An iterable of the characters to count
    :type characters: tuple

    :return: A dictionary of taxon -> number of matching sites

    :raises TypeError: if `characters` is not iterable
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    if not isinstance(characters, Iterable):
        raise TypeError("characters should be iterable")

    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    tally = {}
    for taxon, sites in nexus_obj.data:
        matches = sum(1 for site in sites if site in characters)
        # accumulate, so a taxon seen more than once keeps its earlier count
        tally[taxon] = tally.get(taxon, 0) + matches
    return tally
def count_site_values(nexus_obj, characters=('-', '?')):
    """
    Counts, for each taxon, the number of sites whose value is in
    `characters`.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param characters: An iterable of the characters to count
    :type characters: tuple

    :return: A dictionary of taxon -> number of matching sites

    :raises TypeError: if `characters` is not iterable
    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    if not isinstance(characters, Iterable):
        raise TypeError("characters should be iterable")

    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    tally = {}
    for taxon, sites in nexus_obj.data:
        matches = sum(1 for site in sites if site in characters)
        # accumulate, so a taxon seen more than once keeps its earlier count
        tally[taxon] = tally.get(taxon, 0) + matches
    return tally
def find_unique_sites(nexus_obj):
    """
    Returns a list of the unique sites in a binary nexus
    i.e. sites with only one taxon belonging to them.
        (this only really makes sense if the data is coded as
        presence/absence)

    Values of '?' and '-' are ignored when counting states. Each site
    position appears at most once in the result.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A list of unique site positions (zero-indexed).

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    unique = []
    for i in range(0, nexus_obj.data.nchar):
        members = {}
        for _taxon, characters in nexus_obj.data:
            c = characters[i]
            if c in ('?', '-'):
                continue  # missing data is not a state
            members[c] = members.get(c, 0) + 1

        # a character is unique if there's only two states
        # AND there's a state with 1 member
        # AND the state with 1 member is NOT the 0 (absence) state
        #
        # any() records the site once, even when both states are non-zero
        # and have a single member each (a plain loop appended it twice).
        if len(members) == 2 and any(
            state != '0' and count == 1 for state, count in members.items()
        ):
            unique.append(i)
    return unique
def find_constant_sites(nexus_obj):
    """
    Returns a list of the constant sites in a nexus.

    A site is constant when, ignoring the '?' and '-' values, exactly one
    distinct state is observed across all taxa. Sites that contain only
    '?' / '-' are not reported.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A list of constant site positions (zero-indexed).

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    constant_positions = []
    for position in range(nexus_obj.data.nchar):
        seen = []
        for _taxon, sequence in nexus_obj.data:
            value = sequence[position]
            # '?' and '-' never count as an observed state
            if value not in ('?', '-') and value not in seen:
                seen.append(value)
        if len(seen) == 1:
            constant_positions.append(position)
    return constant_positions
def count_binary_set_size(nexus_obj):
    """
    Counts the number of sites by their size, where the size of a site is
    the number of taxa coded as '1' for it (i.e. how many sites have two
    members, etc).

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A Counter mapping set size to the number of sites of that size

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block

    e.g. {
        1: 100,
        2: 20,
    }
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    tally = Counter()
    for char in nexus_obj.data.characters.values():
        # size of this site = number of taxa whose state is '1'
        set_size = sum(1 for v in char.values() if v == '1')
        tally[set_size] += 1
    return tally
def tally_by_taxon(nexus_obj):
    """
    Groups the sites by state for every taxon (i.e. taxon 1 has three
    sites coded as "A" and 1 coded as "G").

    Sites are identified by their character label when one exists for
    the position, otherwise by the position itself.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A Dictionary of taxon -> state -> list of sites

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block

    e.g. {
        'taxon1': {'state1': ['site1', 'site2'],
                   'state0': ['site3'], }
        'taxon2': {'state1': ['site2'],
                   'state0': ['site1', 'site3'], }
    }
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    tally = {}
    for taxon, characters in nexus_obj.data:
        by_state = {}
        tally[taxon] = by_state
        for pos, char in enumerate(characters):
            # fall back to the numeric position when there is no label
            label = nexus_obj.data.charlabels.get(pos, pos)
            by_state.setdefault(char, []).append(label)
    return tally
def tally_by_site(nexus_obj):
    """
    Groups the taxa by state for every site (i.e. site 1 has three taxa
    coded as "A", and 1 taxa coded as "G").

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A Dictionary of site -> state -> list of taxa

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block

    e.g. {
        'site1': {'state1': ['taxon1', 'taxon2'],
                  'state0': ['taxon3'], }
        'site2': {'state1': ['taxon2'],
                  'state0': ['taxon1', 'taxon3'], }
    }
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    tally = {}
    for site, data in nexus_obj.data.characters.items():
        by_state = tally.setdefault(site, {})
        for taxon, state in data.items():
            by_state.setdefault(state, []).append(taxon)
    return tally
def tally_by_taxon(nexus_obj):
    """
    Groups the sites by state for every taxon (i.e. taxon 1 has three
    sites coded as "A" and 1 coded as "G").

    Sites are identified by their character label when one exists for
    the position, otherwise by the position itself.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :return: A Dictionary of taxon -> state -> list of sites

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block

    e.g. {
        'taxon1': {'state1': ['site1', 'site2'],
                   'state0': ['site3'], }
        'taxon2': {'state1': ['site2'],
                   'state0': ['site1', 'site3'], }
    }
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    tally = {}
    for taxon, characters in nexus_obj.data:
        by_state = {}
        tally[taxon] = by_state
        for pos, char in enumerate(characters):
            # fall back to the numeric position when there is no label
            label = nexus_obj.data.charlabels.get(pos, pos)
            by_state.setdefault(char, []).append(label)
    return tally
def new_nexus_without_sites(nexus_obj, sites_to_remove):
    """
    Returns a new NexusWriter instance holding the data of `nexus_obj`
    with the sites in `sites_to_remove` removed.

    The sites that are kept are renumbered consecutively from 0 in the
    output, and a comment listing the removed sites is added to it.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param sites_to_remove: A list of (zero-indexed) site numbers
    :type sites_to_remove: List

    :return: A NexusWriter instance

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    # make new nexus
    nexout = NexusWriter()
    nexout.add_comment(
        "Removed %d sites: %s" %
        (len(sites_to_remove), ",".join(["%s" % s for s in sites_to_remove]))
    )

    # Build the lookup once: the membership test below runs for every site,
    # and scanning a list each time would make the loop quadratic.
    skip = frozenset(sites_to_remove)

    new_sitepos = 0
    for sitepos in range(nexus_obj.data.nchar):
        if sitepos in skip:
            continue  # skip!
        for taxon, data in nexus_obj.data:
            nexout.add(taxon, new_sitepos, data[sitepos])
        # only advance the output position for sites that were kept
        new_sitepos += 1
    return nexout
def binarise(nexus_obj, one_nexus_per_block=False, keep_zero=False):
    """
    Returns a binary variant of the given `nexus_obj`.

    Every labelled character is recoded with `_recode_to_binary` and each
    resulting binary column is written under the label
    "<original label>_<column index>".

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param one_nexus_per_block: Whether to return a single NexusWriter, or a
        list of NexusWriter's (one per character)
    :type one_nexus_per_block: Boolean

    :param keep_zero: A boolean flag denoting whether to
        treat '0' as a missing state or not. The default
        (False) is to ignore '0' as a trait absence.

        Setting this to True will treat '0' as a unique state.
        The flag is passed straight through to `_recode_to_binary`.
    :type keep_zero: Boolean

    :return: A NexusWriter instance or a list of NexusWriter instances.

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    writers = []
    writer = NexusWriter()
    data = nexus_obj.data

    for index in sorted(data.charlabels):
        label = data.charlabels[index]  # character label
        recoded = _recode_to_binary(data.characters[label], keep_zero)

        # every taxon is expected to have the same number of recoded
        # columns, so the first one is used to get the width
        first_taxon = list(recoded.keys())[0]
        for column in range(len(recoded[first_taxon])):
            new_label = "%s_%d" % (str(label), column)
            for taxon, state in recoded.items():
                writer.add(taxon, new_label, state[column])

        if one_nexus_per_block:
            # close off this character's writer and start a fresh one
            writers.append(writer)
            writer = NexusWriter()

    return writers if one_nexus_per_block else writer
def shufflenexus(nexus_obj, resample=False):
    """
    Shuffles the characters between each taxon to create a new nexus

    Each output site is built by picking one of the original sites at
    random (so the same site may be picked more than once) and shuffling
    its values across the taxa.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param resample: The number of characters to resample. If set to False,
        then the number of characters will equal the number of characters
        in the original data file.
    :type resample: Integer

    :return: A shuffled NexusWriter instance

    :raises AssertionError: if nexus_obj is not a nexus
    :raises ValueError: if resample is not False or a positive Integer
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if resample is False:
        resample = nexus_obj.data.nchar

    try:
        resample = int(resample)
    except (TypeError, ValueError):
        # int() raises TypeError for values such as None or a list; report
        # those with the documented ValueError as well.
        raise ValueError('resample must be a positive integer or False!')

    if resample < 1:
        raise ValueError('resample must be a positive integer or False!')

    newnexus = NexusWriter()
    newnexus.add_comment(
        "Randomised Nexus generated from %s" % nexus_obj.filename
    )

    for i in range(resample):
        # pick existing character
        # NOTE(review): this indexes `characters` by integer position --
        # confirm that holds for files that define character labels.
        character = randrange(0, nexus_obj.data.nchar)
        chars = nexus_obj.data.characters[character]
        site_values = [chars[taxon] for taxon in nexus_obj.data.taxa]
        shuffle(site_values)
        # hand the shuffled values out to the taxa in order
        for taxon, value in zip(nexus_obj.data.taxa, site_values):
            newnexus.add(taxon, i, value)
    return newnexus
def shufflenexus(nexus_obj, resample=False):
    """
    Shuffles the characters between each taxon to create a new nexus

    Each output site is built by picking one of the original sites at
    random (so the same site may be picked more than once) and shuffling
    its values across the taxa.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param resample: The number of characters to resample. If set to False,
        then the number of characters will equal the number of characters
        in the original data file.
    :type resample: Integer

    :return: A shuffled NexusWriter instance

    :raises AssertionError: if nexus_obj is not a nexus
    :raises ValueError: if resample is not False or a positive Integer
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if resample is False:
        resample = nexus_obj.data.nchar

    try:
        resample = int(resample)
    except (TypeError, ValueError):
        # int() raises TypeError for values such as None or a list; report
        # those with the documented ValueError as well.
        raise ValueError('resample must be a positive integer or False!')

    if resample < 1:
        raise ValueError('resample must be a positive integer or False!')

    newnexus = NexusWriter()
    newnexus.add_comment(
        "Randomised Nexus generated from %s" % nexus_obj.filename
    )

    for i in range(resample):
        # pick existing character
        # NOTE(review): this indexes `characters` by integer position --
        # confirm that holds for files that define character labels.
        character = randrange(0, nexus_obj.data.nchar)
        chars = nexus_obj.data.characters[character]
        site_values = [chars[taxon] for taxon in nexus_obj.data.taxa]
        shuffle(site_values)
        # hand the shuffled values out to the taxa in order
        for taxon, value in zip(nexus_obj.data.taxa, site_values):
            newnexus.add(taxon, i, value)
    return newnexus
def combine_nexuses(nexuslist):
    """
    Combines a list of NexusReader instances into a single nexus

    Every character is written under the label "<nexus label>.<character
    label>". The nexus label is the reader's `short_filename` without its
    extension, else its `label` attribute, else its 1-based position in
    `nexuslist`. A comment recording the character range of each input
    is added to the output.

    :param nexuslist: A list of NexusReader instances
    :type nexuslist: List

    :return: A NexusWriter instance

    :raises NexusFormatException: if a nexus file does not have a `data` block

    Other errors raised by `check_for_valid_NexusReader` for an invalid
    entry propagate unchanged.
    """
    combined = NexusWriter()
    charpos = 0
    for nex_id, nex in enumerate(nexuslist, 1):
        check_for_valid_NexusReader(nex, required_blocks=['data'])

        # pick the most specific name available for this nexus
        nexus_label = str(nex_id)
        if hasattr(nex, 'short_filename'):
            nexus_label = os.path.splitext(nex.short_filename)[0]
        elif hasattr(nex, 'label'):
            nexus_label = nex.label

        last_pos = charpos + nex.data.nchar - 1
        combined.add_comment(
            "%d - %d: %s" % (charpos, last_pos, nexus_label)
        )

        characters = nex.data.characters
        for site_idx, site in enumerate(sorted(characters)):
            # work out character label
            charlabel = nex.data.charlabels.get(site_idx, site_idx + 1)
            label = '%s.%s' % (nexus_label, charlabel)

            for taxon, value in characters.get(site).items():
                combined.add(taxon, label, value)
            charpos += 1
    return combined
def check_zeros(nexus_obj, absences=None, missing=None):
    """
    Checks for sites in the nexus that are coded as all empty, i.e. every
    taxon's value at the site is an absence or a missing value.

    Returns a list of sites that are completely empty. Note that this is
    zero-indexed (i.e. the first site is site 0 not 1) to enable indexing of
    nexus.data.matrix/.character lists

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param absences: A list of values to be marked as absent.
        Default = ["0"]; the default is also used when an empty value is
        given.
    :type absences: list

    :param missing: A list of values to be marked as missing.
        Default = ["-", "?"]; the default is also used when an empty value
        is given.
    :type missing: list

    :return: A list of site indexes
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if not absences:
        absences = ['0']
    if not missing:
        missing = ['-', '?']

    matrix = nexus_obj.data.matrix
    empty_sites = []
    for site_idx in range(nexus_obj.data.nchar):
        # read the whole column first, then test it
        column = [matrix[taxon][site_idx] for taxon in matrix]
        if all(state in absences or state in missing for state in column):
            empty_sites.append(site_idx)
    return empty_sites
def check_zeros(nexus_obj, absences=None, missing=None):
    """
    Checks for sites in the nexus that are coded as all empty, i.e. every
    taxon's value at the site is an absence or a missing value.

    Returns a list of sites that are completely empty. Note that this is
    zero-indexed (i.e. the first site is site 0 not 1) to enable indexing of
    nexus.data.matrix/.character lists

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param absences: A list of values to be marked as absent.
        Default = ["0"]; the default is also used when an empty value is
        given.
    :type absences: list

    :param missing: A list of values to be marked as missing.
        Default = ["-", "?"]; the default is also used when an empty value
        is given.
    :type missing: list

    :return: A list of site indexes
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if not absences:
        absences = ['0']
    if not missing:
        missing = ['-', '?']

    matrix = nexus_obj.data.matrix
    empty_sites = []
    for site_idx in range(nexus_obj.data.nchar):
        # read the whole column first, then test it
        column = [matrix[taxon][site_idx] for taxon in matrix]
        if all(state in absences or state in missing for state in column):
            empty_sites.append(site_idx)
    return empty_sites
def binarise(nexus_obj, keep_zero=False):
    """
    Returns a binary variant of the given `nexus_obj`.

    Every labelled character is recoded with `_recode_to_binary` and each
    resulting binary column is written under the label
    "<original label>_<column index>".

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param keep_zero: A boolean flag denoting whether to
        treat '0' as a missing state or not. The default
        (False) is to ignore '0' as a trait absence.

        Setting this to True will treat '0' as a unique state.
        The flag is passed straight through to `_recode_to_binary`.
    :type keep_zero: Boolean

    :return: A NexusWriter instance.

    :raises AssertionError: if nexus_obj is not a nexus
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    writer = NexusWriter()
    data = nexus_obj.data

    for index in sorted(data.charlabels):
        label = data.charlabels[index]  # character label
        recoded = _recode_to_binary(data.characters[label], keep_zero)

        # every taxon is expected to have the same number of recoded
        # columns, so the first one is used to get the width
        first_taxon = list(recoded.keys())[0]
        for column in range(len(recoded[first_taxon])):
            new_label = "%s_%d" % (str(label), column)
            for taxon, state in recoded.items():
                writer.add(taxon, new_label, state[column])
    return writer
def multistatise(nexus_obj, charlabel=None):
    """
    Returns a multistate variant of the given `nexus_obj`.

    Every site coded '1' for a taxon becomes one state letter for that
    taxon: site 0 -> 'A', site 1 -> 'B', ... site 25 -> 'Z'. Taxa with no
    '1' at any site are given the state '?'.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param charlabel: The label of the generated character. Defaults to
        `nexus_obj.short_filename` when that attribute exists, otherwise 1.
    :type charlabel: String

    :return: A NexusReader instance

    :raises AssertionError: if nexus_obj is not a nexus, or if a value in
        the matrix is not a string
    :raises ValueError: if there are more than 26 sites (A-Z is exhausted)
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if not charlabel:
        charlabel = getattr(nexus_obj, 'short_filename', 1)

    states = {}
    for taxon in nexus_obj.data.matrix:
        states[taxon] = []
        sequence = nexus_obj.data.matrix[taxon]
        for site_idx, value in enumerate(sequence):
            # A-Z covers site indexes 0..25; chr(65 + 26) is '[', which is
            # not a letter, so index 26 must already be rejected.
            if site_idx > 25:
                raise ValueError(
                    "Too many characters to handle! - run out of A-Z"
                )
            assert value == str(value), "%r is not a string" % value
            if value == '1':
                states[taxon].append(chr(65 + site_idx))

    nexout = NexusWriter()
    for taxon in states:
        if not states[taxon]:
            # no site is present for this taxon
            nexout.add(taxon, charlabel, '?')
        else:
            for s in states[taxon]:
                nexout.add(taxon, charlabel, s)
    return nexout._convert_to_reader()
def multistatise(nexus_obj, charlabel=None):
    """
    Returns a multistate variant of the given `nexus_obj`.

    Every site coded '1' for a taxon becomes one state letter for that
    taxon: site 0 -> 'A', site 1 -> 'B', ... site 25 -> 'Z'. Taxa with no
    '1' at any site are given the state '?'.

    :param nexus_obj: A `NexusReader` instance
    :type nexus_obj: NexusReader

    :param charlabel: The label of the generated character. Defaults to
        `nexus_obj.short_filename` when that attribute exists, otherwise 1.
    :type charlabel: String

    :return: A NexusReader instance

    :raises AssertionError: if nexus_obj is not a nexus, or if a value in
        the matrix is not a string
    :raises ValueError: if there are more than 26 sites (A-Z is exhausted)
    :raises NexusFormatException: if nexus_obj does not have a `data` block
    """
    check_for_valid_NexusReader(nexus_obj, required_blocks=['data'])

    if not charlabel:
        charlabel = getattr(nexus_obj, 'short_filename', 1)

    states = {}
    for taxon in nexus_obj.data.matrix:
        states[taxon] = []
        sequence = nexus_obj.data.matrix[taxon]
        for site_idx, value in enumerate(sequence):
            # A-Z covers site indexes 0..25; chr(65 + 26) is '[', which is
            # not a letter, so index 26 must already be rejected.
            if site_idx > 25:
                raise ValueError(
                    "Too many characters to handle! - run out of A-Z")
            assert value == str(value), "%r is not a string" % value
            if value == '1':
                states[taxon].append(chr(65 + site_idx))

    nexout = NexusWriter()
    for taxon in states:
        if not states[taxon]:
            # no site is present for this taxon
            nexout.add(taxon, charlabel, '?')
        else:
            for s in states[taxon]:
                nexout.add(taxon, charlabel, s)
    return nexout._convert_to_reader()
def combine_nexuses(nexuslist):
    """
    Combines a list of NexusReader instances into a single nexus

    Every character is written under the label "<nexus label>.<character
    label>". The nexus label is the reader's `short_filename` without its
    extension, else its 1-based position in `nexuslist`. A comment
    recording the character range of each input is added to the output.

    :param nexuslist: A list of NexusReader instances
    :type nexuslist: List

    :return: A NexusWriter instance

    :raises NexusFormatException: if a nexus file does not have a `data` block

    Other errors raised by `check_for_valid_NexusReader` for an invalid
    entry propagate unchanged.
    """
    combined = NexusWriter()
    charpos = 0
    for nex_id, nex in enumerate(nexuslist, 1):
        check_for_valid_NexusReader(nex, required_blocks=['data'])

        # name the nexus after its file when possible
        nexus_label = str(nex_id)
        if hasattr(nex, 'short_filename'):
            nexus_label = os.path.splitext(nex.short_filename)[0]

        last_pos = charpos + nex.data.nchar - 1
        combined.add_comment(
            "%d - %d: %s" % (charpos, last_pos, nexus_label)
        )

        characters = nex.data.characters
        for site_idx, site in enumerate(sorted(characters)):
            # work out character label
            charlabel = nex.data.charlabels.get(site_idx, site_idx + 1)
            label = '%s.%s' % (nexus_label, charlabel)

            for taxon, value in characters.get(site).items():
                combined.add(taxon, label, value)
            charpos += 1
    return combined
def test_failure_on_required_block_two(self):
    """Requiring an 'r8s' block of example2.nex raises NexusFormatException."""
    path = os.path.join(EXAMPLE_DIR, 'example2.nex')
    reader = NexusReader(path)
    self.assertRaises(
        NexusFormatException,
        check_for_valid_NexusReader, reader, ['r8s']
    )
def test_valid_with_required_block_two(self):
    """example2.nex passes validation when 'data' and 'taxa' are required."""
    path = os.path.join(EXAMPLE_DIR, 'example2.nex')
    reader = NexusReader(path)
    # passes as long as no exception is raised
    check_for_valid_NexusReader(reader, ['data', 'taxa'])
def test_valid_with_required_block_two(self):
    """example2.nex passes validation when 'data' and 'taxa' are required."""
    path = os.path.join(EXAMPLE_DIR, 'example2.nex')
    reader = NexusReader(path)
    # passes as long as no exception is raised
    check_for_valid_NexusReader(reader, ['data', 'taxa'])
def test_failure_on_required_block_two(self):
    """Requiring an 'r8s' block of example2.nex raises NexusFormatException."""
    path = os.path.join(EXAMPLE_DIR, 'example2.nex')
    reader = NexusReader(path)
    self.assertRaises(
        NexusFormatException,
        check_for_valid_NexusReader, reader, ['r8s']
    )
def test_valid_NexusReader(self):
    """A freshly constructed NexusReader passes validation."""
    reader = NexusReader()
    # passes as long as no exception is raised
    check_for_valid_NexusReader(reader)
def test_valid_NexusReader(self):
    """A freshly constructed NexusReader passes validation."""
    reader = NexusReader()
    # passes as long as no exception is raised
    check_for_valid_NexusReader(reader)