def main(self, tree_filename, tree_format='newick', ids=None):
    """Build a tree from the NCBI taxonomy dump and write it to a file.

    Downloads ``taxdump.tar.gz`` through ``self.download_file``, extracts
    ``nodes.dmp`` and ``names.dmp`` into ``self.data_dir`` (files already
    present there are reused), links every node to its parent and writes
    the resulting tree with ``bp.write``.

    Args:
        tree_filename: output path, passed straight to ``bp.write``.
        tree_format: output format name, passed straight to ``bp.write``.
            When it is ``'cdao'`` every non-scientific name of a taxon is
            attached to its clade as a skos:altLabel entry in
            ``tu_attributes``.
        ids: if truthy, clades are named by tax_id instead of by
            scientific name.

    Raises:
        KeyError: if a node has no scientific name (only when ``ids`` is
            falsy) or names a parent id that is absent from nodes.dmp.
        ValueError: if no node is its own parent, i.e. no root was found.
    """
    col_delimiter = '\t|\t'
    row_terminator = '\t|'
    alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

    def split_row(line):
        # str.rstrip(chars) treats its argument as a *set* of characters,
        # so the line ending and the '\t|' row terminator are removed
        # explicitly instead of via rstrip('\t|\n').
        line = line.rstrip('\r\n')
        if line.endswith(row_terminator):
            line = line[:-len(row_terminator)]
        return line.split(col_delimiter)

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the text dump; the archive is opened at most once and is
    # always closed, even if extraction fails
    archive = None
    try:
        for extract in ('nodes.dmp', 'names.dmp'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
                continue
            print('Extracting %s from %s...' % (extract, filename))
            if archive is None:
                archive = tarfile.open(name=filename, mode='r:gz')
            archive.extract(extract, path=self.data_dir)
    finally:
        if archive is not None:
            archive.close()

    # get names for all tax_ids from names.dmp
    print('Getting names...')
    scientific_names = {}
    other_names = defaultdict(set)
    with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
        for line in names_file:
            tax_id, name_txt, _, name_type = split_row(line)[:4]
            if name_type == 'scientific name':
                scientific_names[tax_id] = name_txt
            else:
                other_names[tax_id].add(name_txt)

    # read all node info from nodes.dmp
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
        for line in nodes_file:
            tax_id, parent_id = split_row(line)[:2]
            if ids:
                this_node = BaseTree.Clade(name=tax_id)
            else:
                this_node = BaseTree.Clade(name=scientific_names[tax_id])
            nodes[tax_id] = this_node
            # temporary attribute, removed again once the tree is linked
            this_node.parent_id = parent_id

            if tree_format == 'cdao':
                # add common names, synonyms, misspellings, etc. as
                # skos:altLabels
                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                for alt_name in other_names.get(tax_id, ()):
                    this_node.tu_attributes.append(
                        (alt_label, Taxonomy.format_rdf_string(alt_name)))

    print('Found %s OTUs.' % len(nodes))

    # create tree from nodes dictionary
    print('Building tree...')
    root_node = None
    for node_id in nodes:
        this_node = nodes[node_id]
        if node_id == this_node.parent_id:
            # the root is the node that lists itself as its parent
            root_node = this_node
            print('Found root.')
        else:
            parent_node = nodes[this_node.parent_id]
            parent_node.clades.append(this_node)
        del this_node.parent_id

    if root_node is None:
        raise ValueError('no root node (a node that is its own parent) '
                         'found in nodes.dmp')

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)

    print('Done!')
def main(self, tree_filename, tree_format='newick'):
    """Build a tree from the ITIS MySQL table dump and write it to a file.

    Downloads ``itisMySQLTables.tar.gz`` through ``self.download_file``,
    extracts the ``taxonomic_units``, ``longnames``, ``synonym_links`` and
    ``vernaculars`` tables into ``self.data_dir`` (files already present
    there are reused), links accepted/valid taxa to their parents under an
    unnamed root clade stored under id ``'0'`` and writes the tree with
    ``bp.write``.

    Args:
        tree_filename: output path, passed straight to ``bp.write``.
        tree_format: output format name, passed straight to ``bp.write``.
            When it is ``'cdao'`` synonyms and vernacular names are
            attached to clades as skos:altLabel entries in
            ``tu_attributes``.

    Raises:
        IndexError: if one of the required tables is not in the archive.
        KeyError: if a taxon or synonym id has no entry in ``longnames``.
    """
    col_delimiter = '|'
    alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
    url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the tables; the archive is opened at most once and is always
    # closed, even if extraction fails
    archive = None
    try:
        for extract in ('taxonomic_units', 'longnames', 'synonym_links',
                        'vernaculars'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
                continue
            print('Extracting %s from %s...' % (extract, filename))
            if archive is None:
                archive = tarfile.open(name=filename, mode='r:gz')
            matches = [x for x in archive.getnames()
                       if x.split('/')[-1] == extract]
            if not matches:
                raise IndexError('%s not found in %s' % (extract, filename))
            member = archive.getmember(matches[0])
            # drop the directory prefix so the file lands directly in
            # data_dir
            member.name = extract
            archive.extract(member, path=self.data_dir)
    finally:
        if archive is not None:
            archive.close()

    # get names for all ITIS TSNs from longnames table
    print('Getting names...')
    names = {}
    with open(os.path.join(self.data_dir, 'longnames')) as names_file:
        for line in names_file:
            tax_id, name = line.strip().split(col_delimiter)
            names[tax_id] = name

    # read all node info from taxonomic_units
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'taxonomic_units')) as nodes_file:
        for line in nodes_file:
            values = line.strip().split(col_delimiter)
            # columns 0, 10 and 17 are read as id, usage and parent id
            tax_id, usage, parent_id = [values[n] for n in (0, 10, 17)]
            if usage not in ('accepted', 'valid'):
                continue
            this_node = BaseTree.Clade(name=names[tax_id])
            # temporary attribute, removed once the node is attached
            this_node.parent_id = parent_id
            nodes[tax_id] = this_node

    other_names = defaultdict(set)
    if tree_format == 'cdao':
        # get synonym definitions
        print('Getting synonyms...')
        with open(os.path.join(self.data_dir,
                               'synonym_links')) as synonym_file:
            for line in synonym_file:
                node_id, syn_id, _ = line.strip().split(col_delimiter)
                # synonyms are stored as marker tuples, not clades
                nodes[node_id] = ('synonym', names[node_id], syn_id)
        with open(os.path.join(self.data_dir,
                               'vernaculars')) as synonym_file:
            for line in synonym_file:
                tax_id, name = line.strip().split(col_delimiter)[:2]
                other_names[tax_id].add(name)

    print('Found %s OTUs.' % len(nodes))
    nodes['0'] = root_node = BaseTree.Clade()

    # create tree from nodes dictionary
    print('Building tree...')
    for node_id in nodes:
        if node_id == '0':
            continue
        this_node = nodes[node_id]

        if isinstance(this_node, BaseTree.Clade):
            parent_node = nodes.get(this_node.parent_id)
            if not isinstance(parent_node, BaseTree.Clade):
                # parent is unknown or is a synonym entry: leave the
                # node out of the tree
                continue
            parent_node.clades.append(this_node)
            del this_node.parent_id

            if not hasattr(this_node, 'tu_attributes'):
                this_node.tu_attributes = []
            # .get avoids inserting an empty set per node into the
            # defaultdict
            for name in other_names.get(node_id, ()):
                this_node.tu_attributes.append(
                    (alt_label, Taxonomy.format_rdf_string(name)))

        elif this_node[0] == 'synonym':
            _, name, syn_id = this_node
            accepted_node = nodes.get(syn_id)
            if not isinstance(accepted_node, BaseTree.Clade):
                continue
            if not hasattr(accepted_node, 'tu_attributes'):
                accepted_node.tu_attributes = []
            accepted_node.tu_attributes.append(
                (alt_label, Taxonomy.format_rdf_string(name)))

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)

    print('Done!')
def main(self, tree_filename, tree_format='newick'):
    """Build a tree from the GBIF checklist export and write it to a file.

    Downloads ``checklist1.zip`` through ``self.download_file``, extracts
    ``taxon.txt`` into ``self.data_dir`` (an existing copy is reused),
    links every accepted taxon to its parent under an unnamed root clade
    stored under the empty id and writes the tree with ``bp.write``.

    The first seven tab-separated columns of ``taxon.txt`` are read as:
    id, parent id, accepted-name id, (ignored), name, (ignored), status.

    Args:
        tree_filename: output path, passed straight to ``bp.write``.
        tree_format: output format name, passed straight to ``bp.write``.
            When it is ``'cdao'`` synonyms are attached to their accepted
            clade as skos:altLabel entries in ``tu_attributes``.
    """
    col_delimiter = '\t'
    alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
    url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the tables
    extract = 'taxon.txt'
    if os.path.exists(os.path.join(self.data_dir, extract)):
        print('Using existing copy of %s' % extract)
    else:
        print('Extracting %s from %s...' % (extract, filename))
        archive = zipfile.ZipFile(filename, mode='r')
        try:
            archive.extract(extract, path=self.data_dir)
        finally:
            # close the archive even if extraction fails
            archive.close()

    # build BioPython clades
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'taxon.txt')) as taxonomy_file:
        for line in taxonomy_file:
            # Only the line ending is removed: a plain strip() would also
            # eat trailing tabs and so drop empty trailing columns.
            values = line.rstrip('\r\n').split(col_delimiter)
            if len(values) < 7:
                # blank or truncated row: cannot be classified
                continue
            tax_id, parent_id, syn_id, _, name, _, status = values[:7]

            # skip incertae sedis taxa
            if tax_id == '0':
                continue

            if syn_id:
                # rows pointing at an accepted name are only kept when
                # they are synonyms and the output format can carry them
                if 'synonym' in status and tree_format == 'cdao':
                    nodes[tax_id] = ('synonym', name, syn_id)
                continue

            this_node = BaseTree.Clade(name=name)
            # temporary attribute, removed once the node is attached
            this_node.parent_id = parent_id
            nodes[tax_id] = this_node

    print('Found %s OTUs.' % len(nodes))
    nodes[''] = root_node = BaseTree.Clade()

    # create tree from nodes dictionary
    print('Building tree...')
    for node_id in nodes:
        if not node_id:
            continue
        this_node = nodes[node_id]

        if isinstance(this_node, BaseTree.Clade):
            parent_node = nodes.get(this_node.parent_id)
            # nodes whose parent is unknown or is a synonym entry are
            # left unattached
            if isinstance(parent_node, BaseTree.Clade):
                parent_node.clades.append(this_node)
                del this_node.parent_id

        elif this_node[0] == 'synonym':
            _, name, syn_id = this_node
            accepted_node = nodes.get(syn_id)
            if not isinstance(accepted_node, BaseTree.Clade):
                continue
            if not hasattr(accepted_node, 'tu_attributes'):
                accepted_node.tu_attributes = []
            accepted_node.tu_attributes.append(
                (alt_label, Taxonomy.format_rdf_string(name)))

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)

    print('Done!')
def main(self, tree_filename, tree_format='newick'):
    """Build a tree from the ITIS MySQL table dump and write it to a file.

    Downloads ``itisMySQLTables.tar.gz`` through ``self.download_file``,
    extracts the ``taxonomic_units``, ``longnames``, ``synonym_links`` and
    ``vernaculars`` tables into ``self.data_dir`` (files already present
    there are reused), links accepted/valid taxa to their parents under an
    unnamed root clade stored under id ``'0'`` and writes the tree with
    ``bp.write``.

    Args:
        tree_filename: output path, passed straight to ``bp.write``.
        tree_format: output format name, passed straight to ``bp.write``.
            When it is ``'cdao'`` synonyms and vernacular names are
            attached to clades as skos:altLabel entries in
            ``tu_attributes``.

    Raises:
        IndexError: if one of the required tables is not in the archive.
        KeyError: if a taxon or synonym id has no entry in ``longnames``.
    """
    col_delimiter = '|'
    alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
    url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the tables; the archive is opened at most once and is always
    # closed, even if extraction fails
    archive = None
    try:
        for extract in ('taxonomic_units', 'longnames', 'synonym_links',
                        'vernaculars'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
                continue
            print('Extracting %s from %s...' % (extract, filename))
            if archive is None:
                archive = tarfile.open(name=filename, mode='r:gz')
            matches = [x for x in archive.getnames()
                       if x.split('/')[-1] == extract]
            if not matches:
                raise IndexError('%s not found in %s' % (extract, filename))
            member = archive.getmember(matches[0])
            # drop the directory prefix so the file lands directly in
            # data_dir
            member.name = extract
            archive.extract(member, path=self.data_dir)
    finally:
        if archive is not None:
            archive.close()

    # get names for all ITIS TSNs from longnames table
    print('Getting names...')
    names = {}
    with open(os.path.join(self.data_dir, 'longnames')) as names_file:
        for line in names_file:
            tax_id, name = line.strip().split(col_delimiter)
            names[tax_id] = name

    # read all node info from taxonomic_units
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'taxonomic_units')) as nodes_file:
        for line in nodes_file:
            values = line.strip().split(col_delimiter)
            # columns 0, 10 and 17 are read as id, usage and parent id
            tax_id, usage, parent_id = [values[n] for n in (0, 10, 17)]
            if usage not in ('accepted', 'valid'):
                continue
            this_node = BaseTree.Clade(name=names[tax_id])
            # temporary attribute, removed once the node is attached
            this_node.parent_id = parent_id
            nodes[tax_id] = this_node

    other_names = defaultdict(set)
    if tree_format == 'cdao':
        # get synonym definitions
        print('Getting synonyms...')
        with open(os.path.join(self.data_dir,
                               'synonym_links')) as synonym_file:
            for line in synonym_file:
                node_id, syn_id, _ = line.strip().split(col_delimiter)
                # synonyms are stored as marker tuples, not clades
                nodes[node_id] = ('synonym', names[node_id], syn_id)
        with open(os.path.join(self.data_dir,
                               'vernaculars')) as synonym_file:
            for line in synonym_file:
                tax_id, name = line.strip().split(col_delimiter)[:2]
                other_names[tax_id].add(name)

    print('Found %s OTUs.' % len(nodes))
    nodes['0'] = root_node = BaseTree.Clade()

    # create tree from nodes dictionary
    print('Building tree...')
    for node_id in nodes:
        if node_id == '0':
            continue
        this_node = nodes[node_id]

        if isinstance(this_node, BaseTree.Clade):
            parent_node = nodes.get(this_node.parent_id)
            if not isinstance(parent_node, BaseTree.Clade):
                # parent is unknown or is a synonym entry: leave the
                # node out of the tree
                continue
            parent_node.clades.append(this_node)
            del this_node.parent_id

            if not hasattr(this_node, 'tu_attributes'):
                this_node.tu_attributes = []
            # .get avoids inserting an empty set per node into the
            # defaultdict
            for name in other_names.get(node_id, ()):
                this_node.tu_attributes.append(
                    (alt_label, Taxonomy.format_rdf_string(name)))

        elif this_node[0] == 'synonym':
            _, name, syn_id = this_node
            accepted_node = nodes.get(syn_id)
            if not isinstance(accepted_node, BaseTree.Clade):
                continue
            if not hasattr(accepted_node, 'tu_attributes'):
                accepted_node.tu_attributes = []
            accepted_node.tu_attributes.append(
                (alt_label, Taxonomy.format_rdf_string(name)))

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)

    print('Done!')
def main(self, tree_filename, tree_format='newick'):
    """Build a tree from the GBIF checklist export and write it to a file.

    Downloads ``checklist1.zip`` through ``self.download_file``, extracts
    ``taxon.txt`` into ``self.data_dir`` (an existing copy is reused),
    links every accepted taxon to its parent under an unnamed root clade
    stored under the empty id and writes the tree with ``bp.write``.

    The first seven tab-separated columns of ``taxon.txt`` are read as:
    id, parent id, accepted-name id, (ignored), name, (ignored), status.

    Args:
        tree_filename: output path, passed straight to ``bp.write``.
        tree_format: output format name, passed straight to ``bp.write``.
            When it is ``'cdao'`` synonyms are attached to their accepted
            clade as skos:altLabel entries in ``tu_attributes``.
    """
    col_delimiter = '\t'
    alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
    url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the tables
    extract = 'taxon.txt'
    if os.path.exists(os.path.join(self.data_dir, extract)):
        print('Using existing copy of %s' % extract)
    else:
        print('Extracting %s from %s...' % (extract, filename))
        archive = zipfile.ZipFile(filename, mode='r')
        try:
            archive.extract(extract, path=self.data_dir)
        finally:
            # close the archive even if extraction fails
            archive.close()

    # build BioPython clades
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'taxon.txt')) as taxonomy_file:
        for line in taxonomy_file:
            # Only the line ending is removed: a plain strip() would also
            # eat trailing tabs and so drop empty trailing columns.
            values = line.rstrip('\r\n').split(col_delimiter)
            if len(values) < 7:
                # blank or truncated row: cannot be classified
                continue
            tax_id, parent_id, syn_id, _, name, _, status = values[:7]

            # skip incertae sedis taxa
            if tax_id == '0':
                continue

            if syn_id:
                # rows pointing at an accepted name are only kept when
                # they are synonyms and the output format can carry them
                if 'synonym' in status and tree_format == 'cdao':
                    nodes[tax_id] = ('synonym', name, syn_id)
                continue

            this_node = BaseTree.Clade(name=name)
            # temporary attribute, removed once the node is attached
            this_node.parent_id = parent_id
            nodes[tax_id] = this_node

    print('Found %s OTUs.' % len(nodes))
    nodes[''] = root_node = BaseTree.Clade()

    # create tree from nodes dictionary
    print('Building tree...')
    for node_id in nodes:
        if not node_id:
            continue
        this_node = nodes[node_id]

        if isinstance(this_node, BaseTree.Clade):
            parent_node = nodes.get(this_node.parent_id)
            # nodes whose parent is unknown or is a synonym entry are
            # left unattached
            if isinstance(parent_node, BaseTree.Clade):
                parent_node.clades.append(this_node)
                del this_node.parent_id

        elif this_node[0] == 'synonym':
            _, name, syn_id = this_node
            accepted_node = nodes.get(syn_id)
            if not isinstance(accepted_node, BaseTree.Clade):
                continue
            if not hasattr(accepted_node, 'tu_attributes'):
                accepted_node.tu_attributes = []
            accepted_node.tu_attributes.append(
                (alt_label, Taxonomy.format_rdf_string(name)))

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)

    print('Done!')
def main(self, tree_filename, tree_format='newick', ids=None):
    """Build a tree from the NCBI taxonomy dump and write it to a file.

    Downloads ``taxdump.tar.gz`` through ``self.download_file``, extracts
    ``nodes.dmp`` and ``names.dmp`` into ``self.data_dir`` (files already
    present there are reused), links every node to its parent and writes
    the resulting tree with ``bp.write``.

    Args:
        tree_filename: output path, passed straight to ``bp.write``.
        tree_format: output format name, passed straight to ``bp.write``.
            When it is ``'cdao'`` every non-scientific name of a taxon is
            attached to its clade as a skos:altLabel entry in
            ``tu_attributes``.
        ids: if truthy, clades are named by tax_id instead of by
            scientific name.

    Raises:
        KeyError: if a node has no scientific name (only when ``ids`` is
            falsy) or names a parent id that is absent from nodes.dmp.
        ValueError: if no node is its own parent, i.e. no root was found.
    """
    col_delimiter = '\t|\t'
    row_terminator = '\t|'
    alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
    url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

    def split_row(line):
        # str.rstrip(chars) treats its argument as a *set* of characters,
        # so the line ending and the '\t|' row terminator are removed
        # explicitly instead of via rstrip('\t|\n').
        line = line.rstrip('\r\n')
        if line.endswith(row_terminator):
            line = line[:-len(row_terminator)]
        return line.split(col_delimiter)

    # download the taxonomy archive
    filename = self.download_file(url)

    # extract the text dump; the archive is opened at most once and is
    # always closed, even if extraction fails
    archive = None
    try:
        for extract in ('nodes.dmp', 'names.dmp'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
                continue
            print('Extracting %s from %s...' % (extract, filename))
            if archive is None:
                archive = tarfile.open(name=filename, mode='r:gz')
            archive.extract(extract, path=self.data_dir)
    finally:
        if archive is not None:
            archive.close()

    # get names for all tax_ids from names.dmp
    print('Getting names...')
    scientific_names = {}
    other_names = defaultdict(set)
    with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
        for line in names_file:
            tax_id, name_txt, _, name_type = split_row(line)[:4]
            if name_type == 'scientific name':
                scientific_names[tax_id] = name_txt
            else:
                other_names[tax_id].add(name_txt)

    # read all node info from nodes.dmp
    print('Reading taxonomy...')
    nodes = {}
    with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
        for line in nodes_file:
            tax_id, parent_id = split_row(line)[:2]
            if ids:
                this_node = BaseTree.Clade(name=tax_id)
            else:
                this_node = BaseTree.Clade(name=scientific_names[tax_id])
            nodes[tax_id] = this_node
            # temporary attribute, removed again once the tree is linked
            this_node.parent_id = parent_id

            if tree_format == 'cdao':
                # add common names, synonyms, misspellings, etc. as
                # skos:altLabels
                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                for alt_name in other_names.get(tax_id, ()):
                    this_node.tu_attributes.append(
                        (alt_label, Taxonomy.format_rdf_string(alt_name)))

    print('Found %s OTUs.' % len(nodes))

    # create tree from nodes dictionary
    print('Building tree...')
    root_node = None
    for node_id in nodes:
        this_node = nodes[node_id]
        if node_id == this_node.parent_id:
            # the root is the node that lists itself as its parent
            root_node = this_node
            print('Found root.')
        else:
            parent_node = nodes[this_node.parent_id]
            parent_node.clades.append(this_node)
        del this_node.parent_id

    if root_node is None:
        raise ValueError('no root node (a node that is its own parent) '
                         'found in nodes.dmp')

    tree = BaseTree.Tree(root=root_node)

    # write tree to file
    print('Writing %s tree to %s...' % (tree_format, tree_filename))
    bp.write([tree], tree_filename, tree_format)

    print('Done!')