Example #1
0
    def main(self, tree_filename, tree_format='newick', ids=None):
        """Build a tree from the NCBI taxonomy dump and write it to a file.

        The archive is obtained with ``self.download_file``; ``nodes.dmp``
        and ``names.dmp`` are extracted into ``self.data_dir`` unless a copy
        is already there.  Every row of nodes.dmp becomes a clade, clades are
        attached to their parents, and the tree is written with ``bp.write``.

        Args:
            tree_filename: Path of the tree file to write.
            tree_format: Format name handed to ``bp.write``.  For ``'cdao'``
                every non-scientific name of a taxon is attached to its
                clade as a skos:altLabel entry in ``tu_attributes``.
            ids: If truthy, clades are named by tax_id instead of by
                scientific name.

        Raises:
            ValueError: If no node in nodes.dmp is its own parent (no root).
            KeyError: If a node has no scientific name (with ``ids`` falsy)
                or names a parent that is not present in nodes.dmp.
        """
        col_delimiter = '\t|\t'
        row_terminator = '\t|'
        alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
        url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

        def split_row(line):
            # str.rstrip(chars) removes a *set of characters*, not a suffix,
            # so it would also eat the delimiters of empty trailing columns.
            # Remove exactly one line ending and one row terminator instead.
            line = line.rstrip('\r\n')
            if line.endswith(row_terminator):
                line = line[:-len(row_terminator)]
            return line.split(col_delimiter)

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the text dump
        for extract in ('nodes.dmp', 'names.dmp'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
                continue

            print('Extracting %s from %s...' % (extract, filename))
            archive = tarfile.open(name=filename, mode='r:gz')
            try:
                archive.extract(extract, path=self.data_dir)
            finally:
                # release the file handle even if extraction fails
                archive.close()

        # get names for all tax_ids from names.dmp
        print('Getting names...')
        scientific_names = {}
        other_names = defaultdict(set)
        with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
            for line in names_file:
                tax_id, name_txt, _, name_type = split_row(line)[:4]
                if name_type == 'scientific name':
                    scientific_names[tax_id] = name_txt
                else:
                    other_names[tax_id].add(name_txt)

        # read all node info from nodes.dmp
        print('Reading taxonomy...')
        nodes = {}
        with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
            for line in nodes_file:
                tax_id, parent_id = split_row(line)[:2]
                if ids:
                    this_node = BaseTree.Clade(name=tax_id)
                else:
                    this_node = BaseTree.Clade(name=scientific_names[tax_id])

                nodes[tax_id] = this_node
                # temporary attribute, removed again once the node is linked
                this_node.parent_id = parent_id

                if tree_format == 'cdao':
                    # add common names, synonyms, misspellings, etc. as
                    # skos:altLabels
                    if not hasattr(this_node, 'tu_attributes'):
                        this_node.tu_attributes = []
                    # .get() avoids inserting an empty set into the
                    # defaultdict for every taxon without alternative names
                    for alt_name in other_names.get(tax_id, ()):
                        this_node.tu_attributes.append(
                            (alt_label, Taxonomy.format_rdf_string(alt_name)))

        print('Found %s OTUs.' % len(nodes))

        # create tree from nodes dictionary
        print('Building tree...')
        root_node = None
        for node_id in nodes:
            this_node = nodes[node_id]
            if node_id == this_node.parent_id:
                # the root is the node that lists itself as its parent
                root_node = this_node
                print('Found root.')
            else:
                nodes[this_node.parent_id].clades.append(this_node)

            del this_node.parent_id

        if root_node is None:
            raise ValueError('No root node (a node that is its own parent) '
                             'found in nodes.dmp')

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')
Example #2
0
    def main(self, tree_filename, tree_format='newick'):
        """Build a tree from the ITIS table dump and write it to a file.

        The archive is obtained with ``self.download_file``; the tables
        ``taxonomic_units``, ``longnames``, ``synonym_links`` and
        ``vernaculars`` are extracted into ``self.data_dir`` unless a copy is
        already there.  Taxa whose usage is ``accepted`` or ``valid`` become
        clades below an unnamed root stored under TSN ``'0'``; clades whose
        parent cannot be resolved are left out of the tree.

        Args:
            tree_filename: Path of the tree file to write.
            tree_format: Format name handed to ``bp.write``.  For ``'cdao'``
                synonyms and vernacular names are attached to the accepted
                clade as skos:altLabel entries in ``tu_attributes``.

        Raises:
            IndexError: If a required table is not in the archive, or a
                taxonomic_units row has too few columns.
            KeyError: If a taxon used as a clade or synonym has no entry in
                the longnames table.
            ValueError: If a longnames or synonym_links row does not have
                the expected number of columns.
        """
        col_delimiter = '|'
        alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
        url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the tables
        for extract in ('taxonomic_units', 'longnames', 'synonym_links',
                        'vernaculars'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
                continue

            print('Extracting %s from %s...' % (extract, filename))
            archive = tarfile.open(name=filename, mode='r:gz')
            try:
                # the table may sit in a subdirectory of the archive;
                # match on the last path component
                matches = [x for x in archive.getnames()
                           if x.split('/')[-1] == extract]
                if not matches:
                    raise IndexError('%s not found in %s'
                                     % (extract, filename))
                member = archive.getmember(matches[0])
                # rename the member so it lands directly in data_dir
                member.name = extract
                archive.extract(member, path=self.data_dir)
            finally:
                # release the file handle even if extraction fails
                archive.close()

        # get names for all ITIS TSNs from longnames table
        print('Getting names...')
        names = {}
        with open(os.path.join(self.data_dir, 'longnames')) as names_file:
            for line in names_file:
                tax_id, name = line.strip().split(col_delimiter)
                names[tax_id] = name

        # read all node info from taxonomic_units
        print('Reading taxonomy...')
        nodes = {}
        with open(os.path.join(self.data_dir,
                               'taxonomic_units')) as nodes_file:
            for line in nodes_file:
                values = line.strip().split(col_delimiter)
                tax_id, usage, parent_id = [values[n] for n in (0, 10, 17)]

                if usage not in ('accepted', 'valid'):
                    continue

                this_node = BaseTree.Clade(name=names[tax_id])
                # temporary attribute, removed again once the node is linked
                this_node.parent_id = parent_id
                nodes[tax_id] = this_node

        other_names = defaultdict(set)
        if tree_format == 'cdao':
            # get synonym definitions
            print('Getting synonyms...')
            with open(os.path.join(self.data_dir,
                                   'synonym_links')) as synonym_file:
                for line in synonym_file:
                    node_id, syn_id, _ = line.strip().split(col_delimiter)
                    # synonyms are stored as tuples, not clades, and are
                    # resolved to their accepted clade while building
                    nodes[node_id] = ('synonym', names[node_id], syn_id)
            with open(os.path.join(self.data_dir,
                                   'vernaculars')) as vernacular_file:
                for line in vernacular_file:
                    values = line.strip().split(col_delimiter)
                    tax_id, name = values[:2]
                    other_names[tax_id].add(name)

        print('Found %s OTUs.' % len(nodes))
        nodes['0'] = root_node = BaseTree.Clade()

        # create tree from nodes dictionary
        print('Building tree...')
        for node_id in nodes:
            if node_id == '0':
                continue
            this_node = nodes[node_id]

            if isinstance(this_node, BaseTree.Clade):
                parent_node = nodes.get(this_node.parent_id)
                if not isinstance(parent_node, BaseTree.Clade):
                    # parent is unknown or only a synonym record:
                    # leave this clade out of the tree
                    continue
                parent_node.clades.append(this_node)
                del this_node.parent_id

                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                # .get() avoids inserting an empty set into the defaultdict
                # for every taxon without vernacular names
                for name in other_names.get(node_id, ()):
                    this_node.tu_attributes.append(
                        (alt_label, Taxonomy.format_rdf_string(name)))

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                accepted_node = nodes.get(syn_id)
                if not isinstance(accepted_node, BaseTree.Clade):
                    # accepted taxon missing or itself a synonym: skip
                    continue

                if not hasattr(accepted_node, 'tu_attributes'):
                    accepted_node.tu_attributes = []
                accepted_node.tu_attributes.append(
                    (alt_label, Taxonomy.format_rdf_string(name)))

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')
Example #3
0
    def main(self, tree_filename, tree_format='newick'):
        """Build a tree from the GBIF checklist export and write it to a file.

        The archive is obtained with ``self.download_file``; ``taxon.txt`` is
        extracted into ``self.data_dir`` unless a copy is already there.
        Rows without a synonym target become clades below an unnamed root
        stored under the empty id; clades whose parent cannot be resolved
        stay detached from the tree.

        Args:
            tree_filename: Path of the tree file to write.
            tree_format: Format name handed to ``bp.write``.  For ``'cdao'``
                synonym rows are attached to their accepted clade as
                skos:altLabel entries in ``tu_attributes``; for other
                formats synonym rows are ignored.

        Raises:
            ValueError: If a row of taxon.txt has fewer than seven columns.
        """
        col_delimiter = '\t'
        alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
        url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the tables
        extract = 'taxon.txt'
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print('Using existing copy of %s' % extract)
        else:
            print('Extracting %s from %s...' % (extract, filename))
            archive = zipfile.ZipFile(filename, mode='r')
            try:
                archive.extract(extract, path=self.data_dir)
            finally:
                # release the file handle even if extraction fails
                archive.close()

        # build BioPython clades
        print('Reading taxonomy...')
        nodes = {}
        with open(os.path.join(self.data_dir, extract)) as taxonomy_file:
            for line in taxonomy_file:
                # Remove only the line ending: a bare strip() would also eat
                # trailing tabs, i.e. drop empty trailing columns and make
                # the unpacking below fail on otherwise valid rows.
                values = line.rstrip('\r\n').split(col_delimiter)
                (taxon_id, parent_id, syn_id, _, name, _,
                 status) = values[:7]

                # skip incertae sedis taxa
                if taxon_id == '0':
                    continue

                if not syn_id:
                    this_node = BaseTree.Clade(name=name)
                    # temporary attribute, removed once the node is linked
                    this_node.parent_id = parent_id
                    nodes[taxon_id] = this_node
                elif 'synonym' in status and tree_format == 'cdao':
                    # synonyms are stored as tuples, not clades, and are
                    # resolved to their accepted clade while building
                    nodes[taxon_id] = ('synonym', name, syn_id)
                # rows with a synonym target but a non-synonym status are
                # ignored

        print('Found %s OTUs.' % len(nodes))
        nodes[''] = root_node = BaseTree.Clade()

        # create tree from nodes dictionary
        print('Building tree...')
        for node_id in nodes:
            if not node_id:
                continue
            this_node = nodes[node_id]

            if isinstance(this_node, BaseTree.Clade):
                parent_node = nodes.get(this_node.parent_id)
                # a parent that is unknown or only a synonym record leaves
                # this clade detached
                if isinstance(parent_node, BaseTree.Clade):
                    parent_node.clades.append(this_node)
                    del this_node.parent_id

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                accepted_node = nodes.get(syn_id)
                if not isinstance(accepted_node, BaseTree.Clade):
                    # accepted taxon missing or itself a synonym: skip
                    continue

                if not hasattr(accepted_node, 'tu_attributes'):
                    accepted_node.tu_attributes = []
                accepted_node.tu_attributes.append(
                    (alt_label, Taxonomy.format_rdf_string(name)))

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')
Example #4
0
    def main(self, tree_filename, tree_format='newick'):
        """Build a tree from the ITIS table dump and write it to a file.

        The archive is obtained with ``self.download_file``; the tables
        ``taxonomic_units``, ``longnames``, ``synonym_links`` and
        ``vernaculars`` are extracted into ``self.data_dir`` unless a copy is
        already there.  Taxa whose usage is ``accepted`` or ``valid`` become
        clades below an unnamed root stored under TSN ``'0'``; clades whose
        parent cannot be resolved are left out of the tree.

        Args:
            tree_filename: Path of the tree file to write.
            tree_format: Format name handed to ``bp.write``.  For ``'cdao'``
                synonyms and vernacular names are attached to the accepted
                clade as skos:altLabel entries in ``tu_attributes``.

        Raises:
            IndexError: If a required table is not in the archive, or a
                taxonomic_units row has too few columns.
            KeyError: If a taxon used as a clade or synonym has no entry in
                the longnames table.
            ValueError: If a longnames or synonym_links row does not have
                the expected number of columns.
        """
        col_delimiter = '|'
        alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
        url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the tables
        for extract in ('taxonomic_units', 'longnames', 'synonym_links',
                        'vernaculars'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
                continue

            print('Extracting %s from %s...' % (extract, filename))
            archive = tarfile.open(name=filename, mode='r:gz')
            try:
                # the table may sit in a subdirectory of the archive;
                # match on the last path component
                matches = [x for x in archive.getnames()
                           if x.split('/')[-1] == extract]
                if not matches:
                    raise IndexError('%s not found in %s'
                                     % (extract, filename))
                member = archive.getmember(matches[0])
                # rename the member so it lands directly in data_dir
                member.name = extract
                archive.extract(member, path=self.data_dir)
            finally:
                # release the file handle even if extraction fails
                archive.close()

        # get names for all ITIS TSNs from longnames table
        print('Getting names...')
        names = {}
        with open(os.path.join(self.data_dir, 'longnames')) as names_file:
            for line in names_file:
                tax_id, name = line.strip().split(col_delimiter)
                names[tax_id] = name

        # read all node info from taxonomic_units
        print('Reading taxonomy...')
        nodes = {}
        with open(os.path.join(self.data_dir,
                               'taxonomic_units')) as nodes_file:
            for line in nodes_file:
                values = line.strip().split(col_delimiter)
                tax_id, usage, parent_id = [values[n] for n in (0, 10, 17)]

                if usage not in ('accepted', 'valid'):
                    continue

                this_node = BaseTree.Clade(name=names[tax_id])
                # temporary attribute, removed again once the node is linked
                this_node.parent_id = parent_id
                nodes[tax_id] = this_node

        other_names = defaultdict(set)
        if tree_format == 'cdao':
            # get synonym definitions
            print('Getting synonyms...')
            with open(os.path.join(self.data_dir,
                                   'synonym_links')) as synonym_file:
                for line in synonym_file:
                    node_id, syn_id, _ = line.strip().split(col_delimiter)
                    # synonyms are stored as tuples, not clades, and are
                    # resolved to their accepted clade while building
                    nodes[node_id] = ('synonym', names[node_id], syn_id)
            with open(os.path.join(self.data_dir,
                                   'vernaculars')) as vernacular_file:
                for line in vernacular_file:
                    values = line.strip().split(col_delimiter)
                    tax_id, name = values[:2]
                    other_names[tax_id].add(name)

        print('Found %s OTUs.' % len(nodes))
        nodes['0'] = root_node = BaseTree.Clade()

        # create tree from nodes dictionary
        print('Building tree...')
        for node_id in nodes:
            if node_id == '0':
                continue
            this_node = nodes[node_id]

            if isinstance(this_node, BaseTree.Clade):
                parent_node = nodes.get(this_node.parent_id)
                if not isinstance(parent_node, BaseTree.Clade):
                    # parent is unknown or only a synonym record:
                    # leave this clade out of the tree
                    continue
                parent_node.clades.append(this_node)
                del this_node.parent_id

                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                # .get() avoids inserting an empty set into the defaultdict
                # for every taxon without vernacular names
                for name in other_names.get(node_id, ()):
                    this_node.tu_attributes.append(
                        (alt_label, Taxonomy.format_rdf_string(name)))

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                accepted_node = nodes.get(syn_id)
                if not isinstance(accepted_node, BaseTree.Clade):
                    # accepted taxon missing or itself a synonym: skip
                    continue

                if not hasattr(accepted_node, 'tu_attributes'):
                    accepted_node.tu_attributes = []
                accepted_node.tu_attributes.append(
                    (alt_label, Taxonomy.format_rdf_string(name)))

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')
Example #5
0
    def main(self, tree_filename, tree_format='newick'):
        """Build a tree from the GBIF checklist export and write it to a file.

        The archive is obtained with ``self.download_file``; ``taxon.txt`` is
        extracted into ``self.data_dir`` unless a copy is already there.
        Rows without a synonym target become clades below an unnamed root
        stored under the empty id; clades whose parent cannot be resolved
        stay detached from the tree.

        Args:
            tree_filename: Path of the tree file to write.
            tree_format: Format name handed to ``bp.write``.  For ``'cdao'``
                synonym rows are attached to their accepted clade as
                skos:altLabel entries in ``tu_attributes``; for other
                formats synonym rows are ignored.

        Raises:
            ValueError: If a row of taxon.txt has fewer than seven columns.
        """
        col_delimiter = '\t'
        alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
        url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the tables
        extract = 'taxon.txt'
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print('Using existing copy of %s' % extract)
        else:
            print('Extracting %s from %s...' % (extract, filename))
            archive = zipfile.ZipFile(filename, mode='r')
            try:
                archive.extract(extract, path=self.data_dir)
            finally:
                # release the file handle even if extraction fails
                archive.close()

        # build BioPython clades
        print('Reading taxonomy...')
        nodes = {}
        with open(os.path.join(self.data_dir, extract)) as taxonomy_file:
            for line in taxonomy_file:
                # Remove only the line ending: a bare strip() would also eat
                # trailing tabs, i.e. drop empty trailing columns and make
                # the unpacking below fail on otherwise valid rows.
                values = line.rstrip('\r\n').split(col_delimiter)
                (taxon_id, parent_id, syn_id, _, name, _,
                 status) = values[:7]

                # skip incertae sedis taxa
                if taxon_id == '0':
                    continue

                if not syn_id:
                    this_node = BaseTree.Clade(name=name)
                    # temporary attribute, removed once the node is linked
                    this_node.parent_id = parent_id
                    nodes[taxon_id] = this_node
                elif 'synonym' in status and tree_format == 'cdao':
                    # synonyms are stored as tuples, not clades, and are
                    # resolved to their accepted clade while building
                    nodes[taxon_id] = ('synonym', name, syn_id)
                # rows with a synonym target but a non-synonym status are
                # ignored

        print('Found %s OTUs.' % len(nodes))
        nodes[''] = root_node = BaseTree.Clade()

        # create tree from nodes dictionary
        print('Building tree...')
        for node_id in nodes:
            if not node_id:
                continue
            this_node = nodes[node_id]

            if isinstance(this_node, BaseTree.Clade):
                parent_node = nodes.get(this_node.parent_id)
                # a parent that is unknown or only a synonym record leaves
                # this clade detached
                if isinstance(parent_node, BaseTree.Clade):
                    parent_node.clades.append(this_node)
                    del this_node.parent_id

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                accepted_node = nodes.get(syn_id)
                if not isinstance(accepted_node, BaseTree.Clade):
                    # accepted taxon missing or itself a synonym: skip
                    continue

                if not hasattr(accepted_node, 'tu_attributes'):
                    accepted_node.tu_attributes = []
                accepted_node.tu_attributes.append(
                    (alt_label, Taxonomy.format_rdf_string(name)))

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')
Example #6
0
    def main(self, tree_filename, tree_format='newick', ids=None):
        """Build a tree from the NCBI taxonomy dump and write it to a file.

        The archive is obtained with ``self.download_file``; ``nodes.dmp``
        and ``names.dmp`` are extracted into ``self.data_dir`` unless a copy
        is already there.  Every row of nodes.dmp becomes a clade, clades are
        attached to their parents, and the tree is written with ``bp.write``.

        Args:
            tree_filename: Path of the tree file to write.
            tree_format: Format name handed to ``bp.write``.  For ``'cdao'``
                every non-scientific name of a taxon is attached to its
                clade as a skos:altLabel entry in ``tu_attributes``.
            ids: If truthy, clades are named by tax_id instead of by
                scientific name.

        Raises:
            ValueError: If no node in nodes.dmp is its own parent (no root).
            KeyError: If a node has no scientific name (with ``ids`` falsy)
                or names a parent that is not present in nodes.dmp.
        """
        col_delimiter = '\t|\t'
        row_terminator = '\t|'
        alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
        url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

        def split_row(line):
            # str.rstrip(chars) removes a *set of characters*, not a suffix,
            # so it would also eat the delimiters of empty trailing columns.
            # Remove exactly one line ending and one row terminator instead.
            line = line.rstrip('\r\n')
            if line.endswith(row_terminator):
                line = line[:-len(row_terminator)]
            return line.split(col_delimiter)

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the text dump
        for extract in ('nodes.dmp', 'names.dmp'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
                continue

            print('Extracting %s from %s...' % (extract, filename))
            archive = tarfile.open(name=filename, mode='r:gz')
            try:
                archive.extract(extract, path=self.data_dir)
            finally:
                # release the file handle even if extraction fails
                archive.close()

        # get names for all tax_ids from names.dmp
        print('Getting names...')
        scientific_names = {}
        other_names = defaultdict(set)
        with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
            for line in names_file:
                tax_id, name_txt, _, name_type = split_row(line)[:4]
                if name_type == 'scientific name':
                    scientific_names[tax_id] = name_txt
                else:
                    other_names[tax_id].add(name_txt)

        # read all node info from nodes.dmp
        print('Reading taxonomy...')
        nodes = {}
        with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
            for line in nodes_file:
                tax_id, parent_id = split_row(line)[:2]
                if ids:
                    this_node = BaseTree.Clade(name=tax_id)
                else:
                    this_node = BaseTree.Clade(name=scientific_names[tax_id])

                nodes[tax_id] = this_node
                # temporary attribute, removed again once the node is linked
                this_node.parent_id = parent_id

                if tree_format == 'cdao':
                    # add common names, synonyms, misspellings, etc. as
                    # skos:altLabels
                    if not hasattr(this_node, 'tu_attributes'):
                        this_node.tu_attributes = []
                    # .get() avoids inserting an empty set into the
                    # defaultdict for every taxon without alternative names
                    for alt_name in other_names.get(tax_id, ()):
                        this_node.tu_attributes.append(
                            (alt_label, Taxonomy.format_rdf_string(alt_name)))

        print('Found %s OTUs.' % len(nodes))

        # create tree from nodes dictionary
        print('Building tree...')
        root_node = None
        for node_id in nodes:
            this_node = nodes[node_id]
            if node_id == this_node.parent_id:
                # the root is the node that lists itself as its parent
                root_node = this_node
                print('Found root.')
            else:
                nodes[this_node.parent_id].clades.append(this_node)

            del this_node.parent_id

        if root_node is None:
            raise ValueError('No root node (a node that is its own parent) '
                             'found in nodes.dmp')

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')