Ejemplo n.º 1
0
 def test_input_file(self):
     # read_file() should parse a one-entry taxonomy that lives on disk.
     expected = {'seq1': ['bacteria', 'cyanobacteria']}
     payload = 'seq1\tbacteria;cyanobacteria'.encode()
     with tempfile.NamedTemporaryFile(
             prefix='graftm_greengenes_tax_testing') as handle:
         handle.write(payload)
         # Flush so the bytes are visible when the file is re-opened by name.
         handle.flush()
         observed = GreenGenesTaxonomy.read_file(handle.name).taxonomy
         self.assertEqual(expected, observed)
Ejemplo n.º 2
0
    def __init__(self, tree, taxonomy, seqinfo=None):
        '''
        Store the tree and load the taxonomy that accompanies it.

        Parameters
        ----------
        tree        : dendropy.Tree
            dendropy.Tree object
        taxonomy    : string
            Path to a file containing taxonomy information about the tree,
            either in Greengenes or taxtastic format (seqinfo file must also
            be provided if taxonomy is in taxtastic format).
        seqinfo     : string
            Path to a seqinfo file. This is a .csv file with the first column
            denoting the sequence name, and the second column, its most resolved
            taxonomic rank.

        Raises
        ------
        Exception
            If no seqinfo file was given and the taxonomy file could not be
            parsed as a Greengenes formatted file.
        '''

        self.encountered_nodes = {}
        self.encountered_taxonomies = set()
        self.tree = tree

        # Read in taxonomy
        logging.info("Reading in taxonomy")
        if seqinfo:
            logging.info("Importing taxtastic taxonomy from files: %s and %s" %
                         (taxonomy, seqinfo))
            gtns = Getaxnseq()
            # Close both handles once parsing is done instead of leaking them.
            # NOTE(review): assumes the reader consumes both streams before
            # returning - confirm against Getaxnseq.
            with open(taxonomy) as taxonomy_io:
                with open(seqinfo) as seqinfo_io:
                    self.taxonomy = gtns.read_taxtastic_taxonomy_and_seqinfo(
                        taxonomy_io, seqinfo_io)

        else:
            try:
                logging.info("Reading Greengenes style taxonomy")
                self.taxonomy = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
            except MalformedGreenGenesTaxonomyException:
                # Adjacent literals (rather than backslash continuations inside
                # one literal) keep source indentation out of the message.
                raise Exception("Failed to read taxonomy as a Greengenes "
                                "formatted file. Was a taxtastic style "
                                "taxonomy provided with no seqinfo file?")
Ejemplo n.º 3
0
    def __init__(self, tree, taxonomy, seqinfo=None):
        '''
        Store the tree and load the taxonomy that accompanies it.

        Parameters
        ----------
        tree        : dendropy.Tree
            dendropy.Tree object
        taxonomy    : string
            Path to a file containing taxonomy information about the tree,
            either in Greengenes or taxtastic format (seqinfo file must also
            be provided if taxonomy is in taxtastic format).
        seqinfo     : string
            Path to a seqinfo file. This is a .csv file with the first column
            denoting the sequence name, and the second column, its most resolved
            taxonomic rank.

        Raises
        ------
        Exception
            If no seqinfo file was given and the taxonomy file could not be
            parsed as a Greengenes formatted file.
        '''

        self.encountered_nodes = {}
        self.encountered_taxonomies = set()
        self.tree = tree

        # Read in taxonomy
        logging.info("Reading in taxonomy")
        if seqinfo:
            logging.info("Importing taxtastic taxonomy from files: %s and %s" % (taxonomy, seqinfo))
            gtns = Getaxnseq()
            # Close both handles once parsing is done instead of leaking them.
            # NOTE(review): assumes the reader consumes both streams before
            # returning - confirm against Getaxnseq.
            with open(taxonomy) as taxonomy_io:
                with open(seqinfo) as seqinfo_io:
                    self.taxonomy = gtns.read_taxtastic_taxonomy_and_seqinfo(taxonomy_io, seqinfo_io)

        else:
            try:
                logging.info("Reading Greengenes style taxonomy")
                self.taxonomy = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
            except MalformedGreenGenesTaxonomyException:
                # Adjacent literals (rather than backslash continuations inside
                # one literal) keep source indentation out of the message.
                raise Exception("Failed to read taxonomy as a Greengenes "
                                "formatted file. Was a taxtastic style "
                                "taxonomy provided with no seqinfo file?")
Ejemplo n.º 4
0
 def test_read_semicolon_no_space(self):
     # Ranks separated by a bare ';' (no trailing space) are split apart.
     stream = StringIO('seq1\tbacteria;cyanobacteria')
     expected = {'seq1': ['bacteria', 'cyanobacteria']}
     self.assertEqual(expected, GreenGenesTaxonomy.read(stream).taxonomy)
Ejemplo n.º 5
0
 def test_strip_identifier(self):
     # The first identifier carries a trailing space before the tab; the
     # expected key is the bare name.
     lines = ['seq1 \tbacteria;cyanobacteria;\n',
              'seq2\tbacteria;bluebacteria;;\n']
     expected = {
         'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria'],
     }
     parsed = GreenGenesTaxonomy.read(StringIO(''.join(lines)))
     self.assertEqual(expected, parsed.taxonomy)
Ejemplo n.º 6
0
 def test_ignores_empty_lines(self):
     # A blank final line must not add an entry to the parsed taxonomy.
     lines = ['seq1\tbacteria;cyanobacteria;\n',
              'seq2\tbacteria;bluebacteria;;\n',
              '\n']
     expected = {
         'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria'],
     }
     parsed = GreenGenesTaxonomy.read(StringIO(''.join(lines)))
     self.assertEqual(expected, parsed.taxonomy)
Ejemplo n.º 7
0
 def test_removes_empties_at_end(self):
     # Trailing ';' / ';;' leave empty ranks at the end, which are dropped.
     lines = ['seq1\tbacteria;cyanobacteria;\n',
              'seq2\tbacteria;bluebacteria;;\n']
     expected = {
         'seq1': ['bacteria', 'cyanobacteria'],
         'seq2': ['bacteria', 'bluebacteria'],
     }
     parsed = GreenGenesTaxonomy.read(StringIO(''.join(lines)))
     self.assertEqual(expected, parsed.taxonomy)
Ejemplo n.º 8
0
 def test_raises_when_missing_middle(self):
     # seq2 has an empty rank between two filled ranks, which is malformed.
     stream = StringIO('seq1\tbacteria;cyanobacteria\n'
                       'seq2\tbacteria;;cyanobacteria\n')
     self.assertRaises(MalformedGreenGenesTaxonomyException,
                       GreenGenesTaxonomy.read, stream)
Ejemplo n.º 9
0
 def test_raises_when_duplicate_names(self):
     # The same sequence name appears on both lines.
     stream = StringIO('seq1\tbacteria;cyanobacteria\n'
                       'seq1\tbacteria;cyanobacteria\n')
     self.assertRaises(DuplicateTaxonomyException,
                       GreenGenesTaxonomy.read, stream)
Ejemplo n.º 10
0
 def test_ok_when_taxonomy_empty(self):
     # A name followed by a tab and nothing else maps to an empty rank list.
     lines = ['seq1\tbacteria;cyanobacteria\n',
              'seq2\t\n']
     expected = {'seq1': ['bacteria', 'cyanobacteria'], 'seq2': []}
     parsed = GreenGenesTaxonomy.read(StringIO(''.join(lines)))
     self.assertEqual(expected, parsed.taxonomy)
                        required=True)

    args = parser.parse_args()
    if args.debug:
        loglevel = logging.DEBUG
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.INFO
    logging.basicConfig(level=loglevel,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')

    # Read in taxonomy
    logging.info("Reading taxonomy..")
    gg = GreenGenesTaxonomy.read(open(args.greengenes_taxonomy)).taxonomy
    logging.info("Read in %i taxonomies" % len(gg))

    # Read in sequence
    logging.info("Reading sequences..")
    duplicates = set()
    sequences = {}
    for name, seq, _ in SequenceIO()._readfq(open(args.sequences)):
        if name in sequences:
            logging.error("Duplicate sequence name %s" % name)
            duplicates.add(name)
        else:
            sequences[name] = seq
    logging.warn("Found %i duplicated IDs" % len(duplicates))
    for dup in duplicates:
        del sequences[dup]
    parser.add_argument('--greengenes_taxonomy', help='tab then semi-colon separated "GreenGenes"-skyle format definition of taxonomies', required=True)
    parser.add_argument('--sequences', help='FASTA file of sequences to be compared', required=True)

    args = parser.parse_args()
    if args.debug:
        loglevel = logging.DEBUG
    elif args.quiet:
        loglevel = logging.ERROR
    else:
        loglevel = logging.INFO
    logging.basicConfig(level=loglevel, format='%(asctime)s %(levelname)s: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    
    
    # Read in taxonomy
    logging.info("Reading taxonomy..")
    gg = GreenGenesTaxonomy.read(open(args.greengenes_taxonomy)).taxonomy
    logging.info("Read in %i taxonomies" % len(gg))
    
    # Read in sequence
    logging.info("Reading sequences..")
    duplicates = set()
    sequences = {}
    for name, seq, _  in SequenceIO()._readfq(open(args.sequences)):
        if name in sequences:
            logging.error("Duplicate sequence name %s" % name)
            duplicates.add(name)
        else:
            sequences[name] = seq
    logging.warn("Found %i duplicated IDs" % len(duplicates))
    for dup in duplicates:
        del sequences[dup]
Ejemplo n.º 13
0
 def test_read_hello_world(self):
     # Basic case: ranks separated by '; ' (semicolon followed by a space).
     stream = StringIO('seq1\tbacteria; cyanobacteria')
     expected = {'seq1': ['bacteria', 'cyanobacteria']}
     self.assertEqual(expected, GreenGenesTaxonomy.read(stream).taxonomy)
Ejemplo n.º 14
0
    def update(self, **kwargs):
        '''
        Update an existing GraftM package with new sequences and taxonomy. If no
        taxonomy is provided, attempt to decorate the new sequences with
        pre-existing taxonomy.

        Parameters
        ----------
        input_sequence_path: str
            Path to FASTA file containing sequences to add to the update GraftM
            package
        input_taxonomy_path: str
            Taxonomy corresponding to the sequences in input_sequence_path. If None,
            then attempt to assign taxonomy by decorating the tree made out of all
            sequences.
        input_graftm_package_path: str
            Path to the directory of the GraftM package that is to be updated
        output_graftm_package_path: str
            Path to the directory to which the new GraftM package will be
            written to
        threads: int
            Optional. Passed to the sequence alignment step; defaults to
            UpdateDefaultOptions.threads.

        Raises
        ------
        Exception
            If any unrecognised keyword argument is supplied.
        InsufficientGraftMPackageVersion
            If the input package version is below 3.
        '''
        input_sequence_path = kwargs.pop('input_sequence_path')
        input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
        input_graftm_package_path = kwargs.pop('input_graftm_package_path')
        output_graftm_package_path = kwargs.pop('output_graftm_package_path')
        threads = kwargs.pop(
            'threads',
            UpdateDefaultOptions.threads)  #TODO: add to user options
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Reading previous GraftM package")
        old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
        min_input_version = 3
        if old_gpkg.version < min_input_version:
            raise InsufficientGraftMPackageVersion(
                "GraftM below version %s cannot be updated using the update function."
                % min_input_version +
                " Unaligned sequences are not included in these packages, therefore no new"
                " alignment/HMM/Tree can be created")

        new_gpkg = UpdatedGraftMPackage()
        new_gpkg.output = output_graftm_package_path
        # Base name for every intermediate file written below.
        new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

        #######################################
        ### Collect all unaligned sequences ###
        logging.info("Concatenating unaligned sequence files")
        new_gpkg.unaligned_sequences = "%s_sequences.fa" % (
            new_gpkg.name
        )  #TODO: replace hard-coded paths like this with tempfiles
        self._concatenate_file(
            [old_gpkg.unaligned_sequence_database_path(), input_sequence_path],
            new_gpkg.unaligned_sequences)

        #########################################################
        ### Parse taxonomy info up front so errors come early ###
        if input_taxonomy_path:
            logging.info("Reading new taxonomy information")
            input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
            original_taxonomy_hash = old_gpkg.taxonomy_hash()
            total_taxonomy_hash = original_taxonomy_hash.copy()
            # Entries from the new taxonomy overwrite any shared keys.
            total_taxonomy_hash.update(input_taxonomy.taxonomy)
            # |A n B| = |A| + |B| - |A u B|. The previous expression had the
            # opposite sign, so the count was never positive and the warning
            # below could not fire.
            num_duplicate_taxonomies = len(input_taxonomy.taxonomy) + \
                                       len(original_taxonomy_hash) - \
                                       len(total_taxonomy_hash)
            logging.debug(
                "Found %i taxonomic definitions in common between the previous and updated taxonomies"
                % num_duplicate_taxonomies)
            if num_duplicate_taxonomies > 0:
                logging.warning(
                    "Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case."
                    % num_duplicate_taxonomies)

        ###############################
        ### Re-construct alignments ###
        logging.info("Multiple sequence aligning all sequences")
        new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % (new_gpkg.name)
        self._align_sequences(new_gpkg.unaligned_sequences,
                              new_gpkg.aligned_sequences, threads)

        ########################
        ### Re-construct HMM ###
        logging.info("Creating HMM from alignment")
        new_gpkg.hmm = "%s.hmm" % (new_gpkg.name)
        new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % (new_gpkg.name)
        self._get_hmm_from_alignment(new_gpkg.aligned_sequences, new_gpkg.hmm,
                                     new_gpkg.hmm_alignment)

        #########################
        ### Re-construct tree ###
        logging.info("Generating phylogenetic tree")
        new_gpkg.unrooted_tree = "%s.tre" % (new_gpkg.name)
        new_gpkg.unrooted_tree_log = "%s.tre.log" % (new_gpkg.name)
        new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(
            old_gpkg.alignment_hmm_path())
        new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
            self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                             new_gpkg.package_type, self.fasttree)

        ##############################################
        ### Re-root and decorate tree if necessary ###
        if input_taxonomy_path:
            # Taxonomy was supplied, so the freshly built tree is used as-is.
            new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
            new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
        else:
            logging.info("Finding taxonomy for new sequences")
            rerooter = Rerooter()

            old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                                schema='newick')
            new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                                schema='newick')
            old_tree = rerooter.reroot(old_tree)
            new_tree = rerooter.reroot(new_tree)
            # TODO: Shouldn't call an underscore method, eventually use
            # Rerooter instead.
            rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
            new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
            td = TreeDecorator(rerooted_tree,
                               old_gpkg.taxtastic_taxonomy_path(),
                               old_gpkg.taxtastic_seqinfo_path())

            # The decorated taxonomy goes to a scratch file and is read
            # straight back; the file is removed when the block exits.
            with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
                td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True)
                total_taxonomy_hash = GreenGenesTaxonomy.read_file(
                    taxonomy.name).taxonomy

            ################################
            ### Generating tree log file ###
            logging.info("Generating phylogenetic tree log file")
            new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
            new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
            self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                         new_gpkg.hmm_alignment,
                                         new_gpkg.gpkg_tree,
                                         new_gpkg.gpkg_tree_log,
                                         new_gpkg.package_type, self.fasttree)

        ################################
        ### Creating taxtastic files ###
        logging.info("Writing new taxonomy files")
        new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
        new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
        gtns = Getaxnseq()

        gtns.write_taxonomy_and_seqinfo_files(total_taxonomy_hash,
                                              new_gpkg.tt_taxonomy,
                                              new_gpkg.tt_seqinfo)

        ######################
        ### Compile refpkg ###
        logging.info("Compiling pplacer refpkg")
        new_gpkg.refpkg = "%s.refpkg" % (new_gpkg.name)
        # The return value is not used here; the refpkg is referenced later
        # through new_gpkg.refpkg.
        self._taxit_create(new_gpkg.name, new_gpkg.hmm_alignment,
                           new_gpkg.gpkg_tree, new_gpkg.gpkg_tree_log,
                           new_gpkg.tt_taxonomy, new_gpkg.tt_seqinfo,
                           new_gpkg.refpkg, True)

        #####################################
        ### Re-construct diamond database ###
        logging.info("Recreating DIAMOND DB")
        new_gpkg.diamond_database = "%s.dmnd" % (new_gpkg.name)
        self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

        ####################
        ### Compile gpkg ###
        logging.info("Compiling GraftM package")
        # From here on, name is the package path with ".gpkg" re-appended.
        new_gpkg.name = "%s.gpkg" % new_gpkg.name
        GraftMPackageVersion3.compile(
            new_gpkg.name,
            new_gpkg.refpkg,
            new_gpkg.hmm,
            new_gpkg.diamond_database,
            self._define_range(new_gpkg.unaligned_sequences),
            new_gpkg.unaligned_sequences,
            search_hmm_files=old_gpkg.search_hmm_paths())

        ###################
        ### Test it out ###
        logging.info("Testing newly updated GraftM package works")
        self._test_package(new_gpkg.name)

        logging.info("Finished")
Ejemplo n.º 15
0
    def run(self, **kwargs):
        '''
        Search the input reads, align the hits, optionally assign taxonomy,
        and write the resulting OTU table(s).

        Parameters
        ----------
        All arguments are passed by keyword. 'otu_table' and
        'archive_otu_table' are optional output paths (default None). Every
        other key is required: sequences, threads, known_otu_tables,
        assignment_method, output_jplace, output_extras, evalue,
        min_orf_length, restrict_read_length, filter_minimum_protein,
        filter_minimum_nucleotide, include_inserts, singlem_packages,
        window_size, assign_taxonomy, known_sequence_taxonomy,
        working_directory and force.

        Raises
        ------
        KeyError
            If a required keyword argument is missing.
        Exception
            If an unrecognised keyword argument is supplied, or if
            working_directory already exists and force is not set.
        '''
        forward_read_files = kwargs.pop('sequences')
        output_otu_table = kwargs.pop('otu_table', None)
        archive_otu_table = kwargs.pop('archive_otu_table', None)
        num_threads = kwargs.pop('threads')
        known_otu_tables = kwargs.pop('known_otu_tables')
        singlem_assignment_method = kwargs.pop('assignment_method')
        output_jplace = kwargs.pop('output_jplace')
        output_extras = kwargs.pop('output_extras')
        evalue = kwargs.pop('evalue')
        min_orf_length = kwargs.pop('min_orf_length')
        restrict_read_length = kwargs.pop('restrict_read_length')
        filter_minimum_protein = kwargs.pop('filter_minimum_protein')
        filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
        include_inserts = kwargs.pop('include_inserts')
        singlem_packages = kwargs.pop('singlem_packages')
        # Not used in this method, but must be popped so that it does not
        # trigger the unexpected-arguments check below.
        window_size = kwargs.pop('window_size')
        assign_taxonomy = kwargs.pop('assign_taxonomy')
        known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')

        working_directory = kwargs.pop('working_directory')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        self._num_threads = num_threads
        self._evalue = evalue
        self._min_orf_length = min_orf_length
        self._restrict_read_length = restrict_read_length
        self._filter_minimum_protein = filter_minimum_protein
        self._filter_minimum_nucleotide = filter_minimum_nucleotide

        hmms = HmmDatabase(singlem_packages)
        if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
            graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
        else:
            graftm_assignment_method = singlem_assignment_method

        if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
            self._graftm_verbosity = '5'
        else:
            self._graftm_verbosity = '2'

        using_temporary_working_directory = working_directory is None
        if using_temporary_working_directory:
            shared_mem_directory = '/dev/shm'
            if os.path.exists(shared_mem_directory):
                logging.debug("Using shared memory as a base directory")
                tmp = tempdir.TempDir(basedir=shared_mem_directory)
                tempfiles_path = os.path.join(tmp.name, 'tempfiles')
                os.mkdir(tempfiles_path)
                # Note: this changes TEMP for the whole process.
                os.environ['TEMP'] = tempfiles_path
            else:
                logging.debug("Shared memory directory not detected, using default temporary directory instead")
                tmp = tempdir.TempDir()
            working_directory = tmp.name
        else:
            if os.path.exists(working_directory):
                if force:
                    logging.info("Overwriting directory %s" % working_directory)
                    shutil.rmtree(working_directory)
                    os.mkdir(working_directory)
                else:
                    raise Exception("Working directory '%s' already exists, not continuing" % working_directory)
            else:
                os.mkdir(working_directory)
        logging.debug("Using working directory %s" % working_directory)
        self._working_directory = working_directory

        extracted_reads = None
        def return_cleanly():
            # Reads extracted_reads at call time, so it sees the value
            # assigned further down in this method.
            if extracted_reads: extracted_reads.cleanup()
            if using_temporary_working_directory: tmp.dissolve()
            logging.info("Finished")

        #### Search
        self._singlem_package_database = hmms
        search_result = self._search(hmms, forward_read_files)
        sample_names = search_result.samples_with_hits()
        if len(sample_names) == 0:
            logging.info("No reads identified in any samples, stopping")
            return_cleanly()
            return
        logging.debug("Recovered %i samples with at least one hit e.g. '%s'" \
                     % (len(sample_names), sample_names[0]))

        #### Alignment
        align_result = self._align(search_result)

        ### Extract reads that have already known taxonomy
        if known_otu_tables:
            logging.info("Parsing known taxonomy OTU tables")
            known_taxes = KnownOtuTable()
            known_taxes.parse_otu_tables(known_otu_tables)
            logging.debug("Read in %i sequences with known taxonomy" % len(known_taxes))
        else:
            known_taxes = []
        if known_sequence_taxonomy:
            logging.debug("Parsing sequence-wise taxonomy..")
            # Close the taxonomy file once parsed instead of leaking the handle.
            # NOTE(review): assumes read() consumes the stream before
            # returning - confirm against GreenGenesTaxonomy.
            with open(known_sequence_taxonomy) as known_sequence_taxonomy_io:
                tax1 = GreenGenesTaxonomy.read(known_sequence_taxonomy_io).taxonomy
            known_sequence_tax = {}
            for seq_id, tax in tax1.items():
                known_sequence_tax[seq_id] = '; '.join(tax)
            logging.info("Read in %i taxonomies from the GreenGenes format taxonomy file" % len(known_sequence_tax))

        ### Extract other reads which do not have known taxonomy
        extracted_reads = self._extract_relevant_reads(
            align_result, include_inserts, known_taxes)
        logging.info("Finished extracting aligned sequences")

        #### Taxonomic assignment
        if assign_taxonomy:
            logging.info("Running taxonomic assignment with graftm..")
            assignment_result = self._assign_taxonomy(
                extracted_reads, graftm_assignment_method)

        #### Process taxonomically assigned reads
        # get the sequences out for each of them
        otu_table_object = OtuTable()
        regular_output_fields = split('gene sample sequence num_hits coverage taxonomy')
        otu_table_object.fields = regular_output_fields + \
                                  split('read_names nucleotides_aligned taxonomy_by_known?')

        for sample_name, singlem_package, tmp_graft, known_sequences, unknown_sequences in extracted_reads:
            def add_info(infos, otu_table_object, known_tax):
                # One row per info, in the column order of otu_table_object.fields.
                for info in infos:
                    to_print = [
                        singlem_package.graftm_package_basename(),
                        sample_name,
                        info.seq,
                        info.count,
                        info.coverage,
                        info.taxonomy,
                        info.names,
                        info.aligned_lengths,
                        known_tax]
                    otu_table_object.data.append(to_print)
            known_infos = self._seqs_to_counts_and_taxonomy(
                known_sequences,
                known_taxes,
                False,
                True)
            add_info(known_infos, otu_table_object, True)

            if tmp_graft: # if any sequences were aligned (not just already known)
                tmpbase = os.path.basename(tmp_graft.name[:-6])#remove .fasta

                if assign_taxonomy:
                    is_known_taxonomy = False
                    aligned_seqs = self._get_windowed_sequences(
                        assignment_result.prealigned_sequence_file(
                            sample_name, singlem_package, tmpbase),
                        assignment_result.nucleotide_hits_file(
                            sample_name, singlem_package, tmpbase),
                        singlem_package,
                        include_inserts)
                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.diamond_assignment_file(
                            sample_name, singlem_package, tmpbase)
                    else:
                        tax_file = assignment_result.read_tax_file(
                            sample_name, singlem_package, tmpbase)
                    logging.debug("Reading taxonomy from %s" % tax_file)

                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        taxonomies = DiamondResultParser(tax_file)
                        use_first = True
                    else:
                        if not os.path.isfile(tax_file):
                            logging.warning("Unable to find tax file for gene %s from sample %s "
                                            "(likely do to min length filtering), skipping" % (
                                                os.path.basename(singlem_package.base_directory()),
                                                sample_name))
                            taxonomies = {}
                        else:
                            taxonomies = TaxonomyFile(tax_file)
                        use_first = False

                else: # Taxonomy has not been assigned.
                    aligned_seqs = unknown_sequences
                    if known_sequence_taxonomy:
                        taxonomies = known_sequence_tax
                    else:
                        taxonomies = {}
                    use_first = False # irrelevant
                    is_known_taxonomy = True

                new_infos = list(self._seqs_to_counts_and_taxonomy(
                    aligned_seqs, taxonomies, use_first, False))
                add_info(new_infos, otu_table_object, is_known_taxonomy)

                if output_jplace:
                    # NOTE(review): assignment_result is only bound when
                    # assign_taxonomy is set - confirm callers never request
                    # a jplace without it.
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    input_jplace_file = os.path.join(base_dir, "placements.jplace")
                    output_jplace_file = os.path.join(base_dir, "%s_%s_%s.jplace" % (
                        output_jplace, sample_name, singlem_package.graftm_package_basename()))
                    logging.debug("Converting jplace file %s to singlem jplace file %s" % (
                        input_jplace_file, output_jplace_file))
                    # Output is opened first, as before; both handles are now
                    # closed when the conversion finishes.
                    with open(output_jplace_file, 'w') as output_jplace_io:
                        with open(input_jplace_file) as input_jplace_io:
                            self._write_jplace_from_infos(
                                input_jplace_io, new_infos, output_jplace_io)


        if output_otu_table:
            with open(output_otu_table, 'w') as f:
                if output_extras:
                    otu_table_object.write_to(f, otu_table_object.fields)
                else:
                    otu_table_object.write_to(f, regular_output_fields)
        if archive_otu_table:
            with open(archive_otu_table, 'w') as f:
                otu_table_object.archive(hmms.singlem_packages).write_to(f)
        return_cleanly()
Ejemplo n.º 16
0
    def main(self, **kwargs):
        alignment = kwargs.pop('alignment',None)
        sequences = kwargs.pop('sequences',None)
        taxonomy = kwargs.pop('taxonomy',None)
        rerooted_tree = kwargs.pop('rerooted_tree',None)
        unrooted_tree = kwargs.pop('unrooted_tree',None)
        tree_log = kwargs.pop('tree_log', None)
        prefix = kwargs.pop('prefix', None)
        rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
        user_hmm = kwargs.pop('hmm', None)
        search_hmm_files = kwargs.pop('search_hmm_files',None)
        min_aligned_percent = kwargs.pop('min_aligned_percent',0.01)
        taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
        taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
        force_overwrite = kwargs.pop('force',False)
        graftm_package = kwargs.pop('graftm_package',False)
        dereplication_level = kwargs.pop('dereplication_level',False)
        threads = kwargs.pop('threads',5)

        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)
        seqio = SequenceIO()
        locus_name = (os.path.basename(sequences).split('.')[0]
                      if sequences
                      else os.path.basename(alignment).split('.')[0])
        tmp = tempdir.TempDir()
        base = os.path.join(tmp.name, locus_name)
        insufficiently_aligned_sequences = [None]
        removed_sequence_names = []
        tempfiles_to_close = []

        if prefix:
            output_gpkg_path = prefix
        else:
            output_gpkg_path = "%s.gpkg" % locus_name

        if os.path.exists(output_gpkg_path):
            if force_overwrite:
                logging.warn("Deleting previous directory %s" % output_gpkg_path)
                shutil.rmtree(output_gpkg_path)
            else:
                raise Exception("Cowardly refusing to overwrite gpkg to already existing %s" % output_gpkg_path)
        logging.info("Building gpkg for %s" % output_gpkg_path)

        # Read in taxonomy somehow
        gtns = Getaxnseq()
        if rerooted_annotated_tree:
            logging.info("Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(\
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info("Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
        elif taxtastic_seqinfo and taxtastic_taxonomy:
            logging.info("Reading taxonomy from taxtastic taxonomy and seqinfo files")
            taxonomy_definition = gtns.read_taxtastic_taxonomy_and_seqinfo\
                (open(taxtastic_taxonomy),
                 open(taxtastic_seqinfo))
        else:
            raise Exception("Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

        # Check for duplicates
        logging.info("Checking for duplicate sequences")
        dup = self._check_for_duplicate_sequence_names(sequences)
        if dup:
            raise Exception("Found duplicate sequence name '%s' in sequences input file" % dup)
        output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
        tempfiles_to_close.append(output_alignment_fh)
        output_alignment = output_alignment_fh.name
        if user_hmm:
            align_hmm = user_hmm
        else:
            align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_align.hmm')
            tempfiles_to_close.append(align_hmm_fh)
            align_hmm = align_hmm_fh.name

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception("Found duplicate sequence name '%s' in alignment input file" % dup)
            ptype = self._get_hmm_from_alignment(alignment,
                                                 align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(sequences, alignment, user_hmm,
                                               align_hmm, output_alignment, threads)

        logging.info("Checking for incorrect or fragmented reads")
        insufficiently_aligned_sequences = self._check_reads_hit(open(output_alignment),
                                                                 min_aligned_percent)
        while len(insufficiently_aligned_sequences) > 0:
            logging.warn("One or more alignments do not span > %.2f %% of HMM" % (min_aligned_percent*100))
            for s in insufficiently_aligned_sequences:
                logging.warn("Insufficient alignment of %s, not including this sequence" % s)

            sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
            tempfiles_to_close.append(sequences2_fh)
            sequences2 = sequences2_fh.name
            num_sequences = self._remove_sequences_from_alignment(insufficiently_aligned_sequences,
                                                                  sequences,
                                                                  sequences2)
            sequences = sequences2

            if alignment:
                alignment2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
                tempfiles_to_close.append(alignment2_fh)
                alignment2 = alignment2_fh.name
                num_sequences = self._remove_sequences_from_alignment(insufficiently_aligned_sequences,
                                                                      alignment,
                                                                      alignment2)
                alignment = alignment2
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning('''Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree
in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) ''' % name)
                    removed_sequence_names.append(name)


            logging.info("After removing %i insufficiently aligned sequences, left with %i sequences" % (len(insufficiently_aligned_sequences), num_sequences))
            if num_sequences < 4:
                raise Exception("Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i" % num_sequences)
            else:
                logging.info("Reconstructing the alignment and HMM from remaining sequences")
                output_alignment_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.aln.faa')
                tempfiles_to_close.append(output_alignment_fh)
                output_alignment = output_alignment_fh.name
                if not user_hmm:
                    align_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.hmm')
                    tempfiles_to_close.append(align_hmm_fh)
                    align_hmm = align_hmm_fh.name
                ptype, output_alignment= self._align_and_create_hmm(sequences, alignment, user_hmm,
                                                   align_hmm, output_alignment, threads)
                logging.info("Checking for incorrect or fragmented reads")
                insufficiently_aligned_sequences = self._check_reads_hit(open(output_alignment),
                                                                         min_aligned_percent)
        if not search_hmm_files:
            search_hmm_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='_search.hmm')
            tempfiles_to_close.append(search_hmm_fh)
            search_hmm = search_hmm_fh.name
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm, dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = []
        for s in aligned_sequence_objects:
            if s.name not in taxonomy_definition:
                unannotated.append(s.name)
        if len(unannotated) > 0:
            for s in unannotated:
                logging.error("Unable to find sequence '%s' in the taxonomy definition" % s)
            raise Exception("All sequences must be assigned a taxonomy, cannot continue")


        logging.debug("Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays, taxonomy_definition)
        deduplicated_taxonomy_hash = {}
        for i, tax in enumerate(deduplicated_taxonomy):
            deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
        deduplicated_alignment_file = base+"_deduplicated_aligned.fasta"
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)

        logging.info("Removed %i sequences as duplicates, leaving %i non-identical sequences"\
                     % ((len(aligned_sequence_objects)-len(deduplicated_arrays)),
                        len(deduplicated_arrays)))

        # Get corresponding unaligned sequences
        filtered_names=[]
        for list in [x for x in [x[1:] for x in deduplicated_arrays] if x]:
            for seq in list:
                filtered_names.append(seq.name)
        sequences2_fh = tempfile.NamedTemporaryFile(prefix='graftm', suffix='.faa')
        tempfiles_to_close.append(sequences2_fh)
        sequences2 = sequences2_fh.name


        # Create tree unless one was provided
        if not rerooted_tree and not rerooted_annotated_tree and not unrooted_tree:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype,
                                                  self.fasttree)
            no_reroot = False
        else:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s" % rerooted_tree)
                tre_file=rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s" % rerooted_tree)
                tre_file=rerooted_annotated_tree
                no_reroot = True
            elif unrooted_tree:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False
            else:
                raise


            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            for group in deduplicated_arrays:
                [removed_sequence_names.append(s.name) for s in group[1:]]
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(\
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s" % tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree_log', prefix='graftm')
                tempfiles_to_close.append(log_file_tempfile)
                log_file = log_file_tempfile.name
                tre_file_tempfile = tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm')
                tempfiles_to_close.append(tre_file_tempfile)
                tre_file = tre_file_tempfile.name
                with tempfile.NamedTemporaryFile(suffix='.tree', prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name, deduplicated_alignment_file,
                                                 tre_file, log_file, ptype, self.fasttree)

        # Create tax and seqinfo .csv files
        taxonomy_to_keep=[
                          seq.name for seq in
                                [x for x in [x[0] for x in deduplicated_arrays]
                                 if x]
                          ]
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            gtns = Getaxnseq()
            seq = base+"_seqinfo.csv"
            tax = base+"_taxonomy.csv"
            self.the_trash += [seq, tax]
            if rerooted_annotated_tree:
                logging.info("Building seqinfo and taxonomy file from input annotated tree")
                taxonomy_definition = TaxonomyExtractor().taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
            elif taxonomy:
                logging.info("Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(taxonomy).taxonomy
            else:
                raise Exception("Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree")

            taxonomy_definition = {x:taxonomy_definition[x]
                                   for x in taxonomy_definition
                                   if x in taxonomy_to_keep}

            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition,
                                                  tax,
                                                  seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)
        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else: raise Exception("Programming error")
        else:
            diamondb = None

        if sequences:
            # Get range
            max_range = self._define_range(sequences)
        else:
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")

        GraftMPackageVersion3.compile(output_gpkg_path, refpkg, align_hmm, diamondb,
                                      max_range, sequences, search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)
        for tf in tempfiles_to_close:
            tf.close()

        # Test out the gpkg just to be sure.
        #
        # TODO: Use graftM through internal means rather than via extern. This
        # requires some refactoring so that graft() can be called easily with
        # sane defaults.
        logging.info("Testing gpkg package works")
        self._test_package(output_gpkg_path)

        logging.info("Finished\n")
Ejemplo n.º 17
0
    def main(self, **kwargs):
        '''
        Create a GraftM package (gpkg) from input sequences and/or an
        alignment, together with taxonomy information.

        In order, this method: reads the taxonomy; obtains an alignment and
        an alignment HMM; repeatedly removes sequences that
        `_check_reads_hit` reports as insufficiently aligned and rebuilds the
        alignment/HMM; creates a search HMM if none was given; checks every
        aligned sequence has a taxonomy; deduplicates the aligned sequences;
        builds a tree or cleans up the provided one; creates a reference
        package through `_taxit_create`; optionally builds a diamond
        database; compiles the gpkg; cleans up and finally tests the package.

        Parameters
        ----------
        alignment : str
            Path to an alignment file. When given, it is checked for
            duplicate sequence names and the alignment HMM is derived from it
            with `_get_hmm_from_alignment`. Otherwise `sequences` are aligned
            with `_align_and_create_hmm`.
        sequences : str
            Path to the (unaligned) sequences file. Its basename (up to the
            first '.') names the package when given; otherwise the basename
            of `alignment` is used.
        taxonomy : str
            Path to a taxonomy file read with `GreenGenesTaxonomy.read_file`.
        rerooted_tree : str
            Path to a newick tree that is used without re-rooting.
        unrooted_tree : str
            Path to a newick tree that is used with re-rooting allowed.
        rerooted_annotated_tree : str
            Path to a newick tree that is used without re-rooting and from
            which the taxonomy is extracted (takes precedence over
            `taxonomy`).
        tree_log : str
            Path to a log file accompanying a provided tree. When a tree is
            provided without it, a log file and tree are generated with
            `_generate_tree_log_file`.
        prefix : str
            Output path of the gpkg. Defaults to "<locus name>.gpkg".
        hmm : str
            Path to a user-supplied HMM to use as the alignment HMM.
        search_hmm_files : list of str
            Search HMM paths. When falsy, one is created with
            `_create_search_hmm`.
        min_aligned_percent : float
            Cutoff handed to `_check_reads_hit` (default 0.01). It is logged
            multiplied by 100, so it is presumably a fraction - TODO confirm.
        taxtastic_taxonomy, taxtastic_seqinfo : str
            Paths to taxtastic taxonomy and seqinfo files. When both are
            given they are handed directly to `_taxit_create`.
        force : bool
            Delete a pre-existing output path instead of raising.
        graftm_package :
            Accepted so callers may pass it, but not used by this method.
        dereplication_level :
            Passed through to `_create_search_hmm` (default False).
        threads : int
            Passed to the alignment and search HMM helpers (default 5).

        Raises
        ------
        Exception
            On unexpected keyword arguments, a pre-existing output path
            without `force`, missing taxonomy input, duplicate sequence
            names, fewer than 4 sequences remaining after filtering, or
            aligned sequences lacking a taxonomy.
        '''
        alignment = kwargs.pop('alignment', None)
        sequences = kwargs.pop('sequences', None)
        taxonomy = kwargs.pop('taxonomy', None)
        rerooted_tree = kwargs.pop('rerooted_tree', None)
        unrooted_tree = kwargs.pop('unrooted_tree', None)
        tree_log = kwargs.pop('tree_log', None)
        prefix = kwargs.pop('prefix', None)
        rerooted_annotated_tree = kwargs.pop('rerooted_annotated_tree', None)
        user_hmm = kwargs.pop('hmm', None)
        search_hmm_files = kwargs.pop('search_hmm_files', None)
        min_aligned_percent = kwargs.pop('min_aligned_percent', 0.01)
        taxtastic_taxonomy = kwargs.pop('taxtastic_taxonomy', None)
        taxtastic_seqinfo = kwargs.pop('taxtastic_seqinfo', None)
        force_overwrite = kwargs.pop('force', False)
        # Popped only so that it is not reported as unexpected; unused below.
        kwargs.pop('graftm_package', False)
        dereplication_level = kwargs.pop('dereplication_level', False)
        threads = kwargs.pop('threads', 5)

        if kwargs:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        seqio = SequenceIO()
        locus_name = os.path.basename(
            sequences if sequences else alignment).split('.')[0]
        # Keep a reference to the TempDir object for the whole method, as the
        # original code does; `base` points inside it.
        tmp = tempdir.TempDir()
        base = os.path.join(tmp.name, locus_name)
        removed_sequence_names = []

        # Open handles of every temporary file created below. Holding the
        # handles keeps the files reserved while they are in use; closing
        # them (after the gpkg is compiled) deletes the files.
        tempfiles_to_close = []

        def _new_tempfile(suffix):
            # Create a tracked temporary file and return its path.
            handle = tempfile.NamedTemporaryFile(prefix='graftm',
                                                 suffix=suffix)
            tempfiles_to_close.append(handle)
            return handle.name

        output_gpkg_path = prefix if prefix else "%s.gpkg" % locus_name

        if os.path.exists(output_gpkg_path):
            if not force_overwrite:
                raise Exception(
                    "Cowardly refusing to overwrite gpkg to already existing %s"
                    % output_gpkg_path)
            logging.warning("Deleting previous directory %s" %
                            output_gpkg_path)
            shutil.rmtree(output_gpkg_path)
        logging.info("Building gpkg for %s" % output_gpkg_path)

        # Read in taxonomy somehow
        gtns = Getaxnseq()
        if rerooted_annotated_tree:
            logging.info(
                "Building seqinfo and taxonomy file from input annotated tree")
            taxonomy_definition = \
                TaxonomyExtractor().taxonomy_from_annotated_tree(
                    Tree.get(path=rerooted_annotated_tree, schema='newick'))
        elif taxonomy:
            logging.info(
                "Building seqinfo and taxonomy file from input taxonomy")
            taxonomy_definition = GreenGenesTaxonomy.read_file(
                taxonomy).taxonomy
        elif taxtastic_seqinfo and taxtastic_taxonomy:
            logging.info(
                "Reading taxonomy from taxtastic taxonomy and seqinfo files")
            with open(taxtastic_taxonomy) as taxonomy_fh, \
                    open(taxtastic_seqinfo) as seqinfo_fh:
                taxonomy_definition = \
                    gtns.read_taxtastic_taxonomy_and_seqinfo(taxonomy_fh,
                                                             seqinfo_fh)
        else:
            raise Exception(
                "Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
            )

        # Check for duplicates
        logging.info("Checking for duplicate sequences")
        dup = self._check_for_duplicate_sequence_names(sequences)
        if dup:
            raise Exception(
                "Found duplicate sequence name '%s' in sequences input file" %
                dup)
        output_alignment = _new_tempfile('.aln.faa')
        align_hmm = user_hmm if user_hmm else _new_tempfile('_align.hmm')

        if alignment:
            dup = self._check_for_duplicate_sequence_names(alignment)
            if dup:
                raise Exception(
                    "Found duplicate sequence name '%s' in alignment input file"
                    % dup)
            ptype = self._get_hmm_from_alignment(alignment, align_hmm,
                                                 output_alignment)
        else:
            logging.info("Aligning sequences to create aligned FASTA file")
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)

        logging.info("Checking for incorrect or fragmented reads")
        with open(output_alignment) as alignment_fh:
            insufficiently_aligned_sequences = self._check_reads_hit(
                alignment_fh, min_aligned_percent)

        # Drop badly aligned sequences and re-align until none are reported.
        while len(insufficiently_aligned_sequences) > 0:
            logging.warning(
                "One or more alignments do not span > %.2f %% of HMM" %
                (min_aligned_percent * 100))
            for s in insufficiently_aligned_sequences:
                logging.warning(
                    "Insufficient alignment of %s, not including this sequence"
                    % s)

            sequences2 = _new_tempfile('.faa')
            num_sequences = self._remove_sequences_from_alignment(
                insufficiently_aligned_sequences, sequences, sequences2)
            sequences = sequences2

            if alignment:
                alignment2 = _new_tempfile('.aln.faa')
                num_sequences = self._remove_sequences_from_alignment(
                    insufficiently_aligned_sequences, alignment, alignment2)
                alignment = alignment2
                for name in insufficiently_aligned_sequences:
                    if rerooted_tree or rerooted_annotated_tree:
                        logging.warning(
                            "Sequence %s in provided alignment does not meet the --min_aligned_percent cutoff. This sequence will be removed from the tree\n"
                            "in the final GraftM package. If you are sure these sequences are correct, turn off the --min_aligned_percent cutoff, provide it with a 0 (e.g. --min_aligned_percent 0) "
                            % name)
                    # Remembered so the sequence is also pruned from any
                    # user-provided tree further down.
                    removed_sequence_names.append(name)

            logging.info(
                "After removing %i insufficiently aligned sequences, left with %i sequences"
                % (len(insufficiently_aligned_sequences), num_sequences))
            if num_sequences < 4:
                raise Exception(
                    "Too few sequences remaining in alignment after removing insufficiently aligned sequences: %i"
                    % num_sequences)

            logging.info(
                "Reconstructing the alignment and HMM from remaining sequences"
            )
            output_alignment = _new_tempfile('.aln.faa')
            if not user_hmm:
                align_hmm = _new_tempfile('.hmm')
            ptype, output_alignment = self._align_and_create_hmm(
                sequences, alignment, user_hmm, align_hmm, output_alignment,
                threads)
            logging.info("Checking for incorrect or fragmented reads")
            with open(output_alignment) as alignment_fh:
                insufficiently_aligned_sequences = self._check_reads_hit(
                    alignment_fh, min_aligned_percent)

        if not search_hmm_files:
            search_hmm = _new_tempfile('_search.hmm')
            self._create_search_hmm(sequences, taxonomy_definition, search_hmm,
                                    dereplication_level, threads)
            search_hmm_files = [search_hmm]

        # Make sure each sequence has been assigned a taxonomy:
        aligned_sequence_objects = seqio.read_fasta_file(output_alignment)
        unannotated = [s.name for s in aligned_sequence_objects
                       if s.name not in taxonomy_definition]
        if unannotated:
            for name in unannotated:
                logging.error(
                    "Unable to find sequence '%s' in the taxonomy definition" %
                    name)
            raise Exception(
                "All sequences must be assigned a taxonomy, cannot continue")

        logging.debug(
            "Looking for non-standard characters in aligned sequences")
        self._mask_strange_sequence_letters(aligned_sequence_objects, ptype)

        # Deduplicate sequences - pplacer cannot handle these
        logging.info("Deduplicating sequences")
        dedup = Deduplicator()
        deduplicated_arrays = dedup.deduplicate(aligned_sequence_objects)
        # NOTE(review): the LCA taxonomy hash below is not consumed anywhere
        # else in this method; it is kept as-is in case lca_taxonomy() is
        # relied upon for validation - confirm before removing.
        deduplicated_taxonomy = dedup.lca_taxonomy(deduplicated_arrays,
                                                   taxonomy_definition)
        deduplicated_taxonomy_hash = {}
        for i, tax in enumerate(deduplicated_taxonomy):
            deduplicated_taxonomy_hash[deduplicated_arrays[i][0].name] = tax
        deduplicated_alignment_file = base + "_deduplicated_aligned.fasta"
        # The first member of each group represents the group.
        seqio.write_fasta_file([seqs[0] for seqs in deduplicated_arrays],
                               deduplicated_alignment_file)

        logging.info(
            "Removed %i sequences as duplicates, leaving %i non-identical sequences"
            % ((len(aligned_sequence_objects) - len(deduplicated_arrays)),
               len(deduplicated_arrays)))

        # Create tree unless one was provided
        if rerooted_tree or rerooted_annotated_tree or unrooted_tree:
            if rerooted_tree:
                logging.debug("Found unannotated pre-rerooted tree file %s" %
                              rerooted_tree)
                tre_file = rerooted_tree
                no_reroot = True
            elif rerooted_annotated_tree:
                logging.debug("Found annotated pre-rerooted tree file %s" %
                              rerooted_annotated_tree)
                tre_file = rerooted_annotated_tree
                no_reroot = True
            else:
                logging.info("Using input unrooted tree")
                tre_file = unrooted_tree
                no_reroot = False

            # Remove any sequences from the tree that are duplicates
            cleaner = DendropyTreeCleaner()
            tree = Tree.get(path=tre_file, schema='newick')
            for group in deduplicated_arrays:
                removed_sequence_names.extend(s.name for s in group[1:])
            cleaner.remove_sequences(tree, removed_sequence_names)

            # Ensure there is nothing amiss now as a user-interface thing
            cleaner.match_alignment_and_tree_sequence_ids(
                [g[0].name for g in deduplicated_arrays], tree)

            if tree_log:
                # User specified a log file, go with that
                logging.debug("Using user-specified log file %s" % tree_log)
                log_file = tree_log
            else:
                logging.info("Generating log file")
                log_file = _new_tempfile('.tree_log')
                tre_file = _new_tempfile('.tree')
                with tempfile.NamedTemporaryFile(suffix='.tree',
                                                 prefix='graftm') as f:
                    # Make the newick file simple (ie. un-arb it) for fasttree.
                    cleaner.write_fasttree_newick(tree, f)
                    f.flush()
                    self._generate_tree_log_file(f.name,
                                                 deduplicated_alignment_file,
                                                 tre_file, log_file, ptype,
                                                 self.fasttree)
        else:
            logging.debug("No tree provided")
            logging.info("Building tree")
            log_file, tre_file = self._build_tree(deduplicated_alignment_file,
                                                  base, ptype, self.fasttree)
            no_reroot = False

        # Create tax and seqinfo .csv files
        # A set, since it is only used for membership tests below.
        taxonomy_to_keep = {
            group[0].name for group in deduplicated_arrays if group[0]
        }
        refpkg = "%s.refpkg" % output_gpkg_path
        self.the_trash.append(refpkg)
        if taxtastic_taxonomy and taxtastic_seqinfo:
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, taxtastic_taxonomy,
                                        taxtastic_seqinfo, refpkg, no_reroot)
        else:
            seq = base + "_seqinfo.csv"
            tax = base + "_taxonomy.csv"
            self.the_trash += [seq, tax]
            # The taxonomy is read afresh from its source here, exactly as
            # the original code does, rather than reusing the copy above.
            if rerooted_annotated_tree:
                logging.info(
                    "Building seqinfo and taxonomy file from input annotated tree"
                )
                taxonomy_definition = \
                    TaxonomyExtractor().taxonomy_from_annotated_tree(
                        Tree.get(path=rerooted_annotated_tree,
                                 schema='newick'))
            elif taxonomy:
                logging.info(
                    "Building seqinfo and taxonomy file from input taxonomy")
                taxonomy_definition = GreenGenesTaxonomy.read_file(
                    taxonomy).taxonomy
            else:
                raise Exception(
                    "Programming error: Taxonomy is required somehow e.g. by --taxonomy or --rerooted_annotated_tree"
                )

            # Only the representative of each deduplicated group is kept.
            taxonomy_definition = {
                x: taxonomy_definition[x]
                for x in taxonomy_definition if x in taxonomy_to_keep
            }

            gtns.write_taxonomy_and_seqinfo_files(taxonomy_definition, tax,
                                                  seq)

            # Create the reference package
            logging.info("Creating reference package")
            refpkg = self._taxit_create(base, deduplicated_alignment_file,
                                        tre_file, log_file, tax, seq, refpkg,
                                        no_reroot)

        if sequences:
            # Run diamond makedb
            logging.info("Creating diamond database")
            if ptype == Create._PROTEIN_PACKAGE_TYPE:
                cmd = "diamond makedb --in '%s' -d '%s'" % (sequences, base)
                extern.run(cmd)
                diamondb = '%s.dmnd' % base
            elif ptype == Create._NUCLEOTIDE_PACKAGE_TYPE:
                diamondb = None
            else:
                raise Exception("Programming error")
            # Get range
            max_range = self._define_range(sequences)
        else:
            diamondb = None
            max_range = self._define_range(alignment)

        # Compile the gpkg
        logging.info("Compiling gpkg")

        GraftMPackageVersion3.compile(output_gpkg_path,
                                      refpkg,
                                      align_hmm,
                                      diamondb,
                                      max_range,
                                      sequences,
                                      search_hmm_files=search_hmm_files)

        logging.info("Cleaning up")
        self._cleanup(self.the_trash)
        # Closing the handles deletes the temporary files; nothing below
        # refers to them any more.
        for handle in tempfiles_to_close:
            handle.close()

        # Test out the gpkg just to be sure.
        #
        # TODO: Use graftM through internal means rather than via extern. This
        # requires some refactoring so that graft() can be called easily with
        # sane defaults.
        logging.info("Testing gpkg package works")
        self._test_package(output_gpkg_path)

        logging.info("Finished\n")
Ejemplo n.º 18
0
    def update(self, **kwargs):
        '''
        Update an existing GraftM package with new sequences and taxonomy. If no
        taxonomy is provided, attempt to decorate the new sequences with
        pre-existing taxonomy.

        Parameters
        ----------
        input_sequence_path: str
            Path to FASTA file containing sequences to add to the update GraftM
            package
        input_taxonomy_path: str
            Taxonomy corresponding to the sequences in input_sequence_path. If None,
            then attempt to assign taxonomy by decorating the tree made out of all
            sequences.
        input_graftm_package_path: str
            Path to the directory of the GraftM package that is to be updated
        output_graftm_package_path: str
            Path to the directory to which the new GraftM package will be
            written to
        '''
        input_sequence_path = kwargs.pop('input_sequence_path')
        input_taxonomy_path = kwargs.pop('input_taxonomy_path', None)
        input_graftm_package_path = kwargs.pop('input_graftm_package_path')
        output_graftm_package_path = kwargs.pop('output_graftm_package_path')
        threads = kwargs.pop('threads', UpdateDefaultOptions.threads) #TODO: add to user options
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        logging.info("Reading previous GraftM package")
        old_gpkg = GraftMPackage.acquire(input_graftm_package_path)
        min_input_version = 3
        if old_gpkg.version < min_input_version:
            raise InsufficientGraftMPackageVersion(
                "GraftM below version %s cannot be updated using the update function." % min_input_version +
                " Unaligned sequences are not included in these packages, therefore no new"
                " alignment/HMM/Tree can be created")

        new_gpkg = UpdatedGraftMPackage()
        new_gpkg.output = output_graftm_package_path
        new_gpkg.name = output_graftm_package_path.replace(".gpkg", "")

        #######################################
        ### Collect all unaligned sequences ###
        logging.info("Concatenating unaligned sequence files")
        new_gpkg.unaligned_sequences = "%s_sequences.fa" % (new_gpkg.name) #TODO: replace hard-coded paths like this with tempfiles
        self._concatenate_file([old_gpkg.unaligned_sequence_database_path(),
                                input_sequence_path],
                               new_gpkg.unaligned_sequences)

        #########################################################
        ### Parse taxonomy info up front so errors come early ###
        if input_taxonomy_path:
            logging.info("Reading new taxonomy information")
            input_taxonomy = GreenGenesTaxonomy.read_file(input_taxonomy_path)
            original_taxonomy_hash = old_gpkg.taxonomy_hash()
            total_taxonomy_hash = original_taxonomy_hash.copy()
            total_taxonomy_hash.update(input_taxonomy.taxonomy)
            num_duplicate_taxonomies = len(total_taxonomy_hash) - \
                                       len(input_taxonomy.taxonomy) - \
                                       len(original_taxonomy_hash)
            logging.debug("Found %i taxonomic definitions in common between the previous and updated taxonomies" % num_duplicate_taxonomies)
            if num_duplicate_taxonomies > 0:
                logging.warn("Found %i taxonomic definitions in common between the previous and updated taxonomies. Using the updated taxonomy in each case." % num_duplicate_taxonomies)

        ###############################
        ### Re-construct alignments ###
        logging.info("Multiple sequence aligning all sequences")
        new_gpkg.aligned_sequences = "%s_mafft_alignment.fa" % (new_gpkg.name)
        self._align_sequences(new_gpkg.unaligned_sequences, new_gpkg.aligned_sequences, threads)

        ########################
        ### Re-construct HMM ###
        logging.info("Creating HMM from alignment")
        new_gpkg.hmm = "%s.hmm" % (new_gpkg.name)
        new_gpkg.hmm_alignment = "%s_hmm_alignment.fa" % (new_gpkg.name)
        self._get_hmm_from_alignment(new_gpkg.aligned_sequences, new_gpkg.hmm, new_gpkg.hmm_alignment)

        #########################
        ### Re-construct tree ###
        logging.info("Generating phylogenetic tree")
        new_gpkg.unrooted_tree = "%s.tre" % (new_gpkg.name)
        new_gpkg.unrooted_tree_log = "%s.tre.log" % (new_gpkg.name)
        new_gpkg.package_type, new_gpkg.hmm_length = self._pipe_type(old_gpkg.alignment_hmm_path())
        new_gpkg.unrooted_gpkg_tree_log, new_gpkg.unrooted_gpkg_tree = \
            self._build_tree(new_gpkg.hmm_alignment, new_gpkg.name,
                             new_gpkg.package_type, self.fasttree)

        ##############################################
        ### Re-root and decorate tree if necessary ###
        if input_taxonomy_path:
            new_gpkg.gpkg_tree_log = new_gpkg.unrooted_tree_log
            new_gpkg.gpkg_tree = new_gpkg.unrooted_gpkg_tree
        else:
            logging.info("Finding taxonomy for new sequences")
            rerooter = Rerooter()
            
            old_tree = Tree.get(path=old_gpkg.reference_package_tree_path(),
                                schema='newick')
            new_tree = Tree.get(path=new_gpkg.unrooted_gpkg_tree,
                                schema='newick')
            old_tree = rerooter.reroot(old_tree)
            new_tree = rerooter.reroot(new_tree)
            # TODO: Shouldn't call an underscore method, eventually use
            # Rerooter instead.
            rerooted_tree = rerooter.reroot_by_tree(old_tree, new_tree)
            new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
            td = TreeDecorator(
                rerooted_tree,
                old_gpkg.taxtastic_taxonomy_path(),
                old_gpkg.taxtastic_seqinfo_path())
            
            with tempfile.NamedTemporaryFile(suffix='tsv') as taxonomy:
                td.decorate(new_gpkg.gpkg_tree, taxonomy.name, True) 
                total_taxonomy_hash = GreenGenesTaxonomy.read_file(taxonomy.name).taxonomy

            ################################
            ### Generating tree log file ###
            logging.info("Generating phylogenetic tree log file")
            new_gpkg.gpkg_tree = "%s_gpkg.tree" % new_gpkg.name
            new_gpkg.gpkg_tree_log = "%s_gpkg.tree.log" % new_gpkg.name
            self._generate_tree_log_file(new_gpkg.unrooted_tree,
                                         new_gpkg.hmm_alignment,
                                         new_gpkg.gpkg_tree,
                                         new_gpkg.gpkg_tree_log,
                                         new_gpkg.package_type,
                                         self.fasttree)

        ################################
        ### Creating taxtastic files ###
        logging.info("Writing new taxonomy files")
        new_gpkg.tt_seqinfo = "%s_seqinfo.csv" % new_gpkg.name
        new_gpkg.tt_taxonomy = "%s_taxonomy.csv" % new_gpkg.name
        gtns = Getaxnseq()

        gtns.write_taxonomy_and_seqinfo_files(
            total_taxonomy_hash,
            new_gpkg.tt_taxonomy,
            new_gpkg.tt_seqinfo)
        
        ######################
        ### Compile refpkg ###
        logging.info("Compiling pplacer refpkg")
        new_gpkg.refpkg = "%s.refpkg" % (new_gpkg.name)
        refpkg = self._taxit_create(new_gpkg.name,
                                    new_gpkg.hmm_alignment,
                                    new_gpkg.gpkg_tree,
                                    new_gpkg.gpkg_tree_log,
                                    new_gpkg.tt_taxonomy,
                                    new_gpkg.tt_seqinfo,
                                    new_gpkg.refpkg,
                                    True)

        #####################################
        ### Re-construct diamond database ###
        logging.info("Recreating DIAMOND DB")
        new_gpkg.diamond_database = "%s.dmnd" % (new_gpkg.name)
        self._create_dmnd_database(new_gpkg.unaligned_sequences, new_gpkg.name)

        ####################
        ### Compile gpkg ###
        logging.info("Compiling GraftM package")
        new_gpkg.name = "%s.gpkg" % new_gpkg.name
        GraftMPackageVersion3.compile(new_gpkg.name, new_gpkg.refpkg,
                                      new_gpkg.hmm, new_gpkg.diamond_database,
                                      self._define_range(new_gpkg.unaligned_sequences),
                                      new_gpkg.unaligned_sequences,
                                      search_hmm_files=old_gpkg.search_hmm_paths())

        ###################
        ### Test it out ###
        logging.info("Testing newly updated GraftM package works")
        self._test_package(new_gpkg.name)

        logging.info("Finished")
Ejemplo n.º 19
0
 def test_raises_when_incorrect_num_fields(self):
     '''Reading a record that has a sequence name but no taxonomy column
     must raise MalformedGreenGenesTaxonomyException.'''
     # The second record ('seq2') carries only a single field.
     malformed_taxonomy = StringIO(
         'seq1\tbacteria;cyanobacteria\n'
         'seq2\n')
     with self.assertRaises(MalformedGreenGenesTaxonomyException):
         GreenGenesTaxonomy.read(malformed_taxonomy)
Ejemplo n.º 20
0
    def run_to_otu_table(self, **kwargs):
        '''Run the pipe: search the reads, align the hits, optionally assign
        taxonomy, and gather everything into an OTU table.

        Parameters
        ----------
        Every argument is passed by keyword and every one is required (each
        is popped from kwargs without a default).

        sequences : forward read files handed to the search step
        threads : stored on self._num_threads
        known_otu_tables : OTU tables parsed for already-known taxonomy;
            skipped when falsy
        assignment_method : one of the *_ASSIGNMENT_METHOD constants
        output_jplace : prefix of the per-sample/per-package jplace files to
            write; no jplace files are written when falsy
        evalue, min_orf_length, restrict_read_length,
        filter_minimum_protein, filter_minimum_nucleotide :
            stored on the corresponding self._* attributes
        include_inserts : passed through to the read extraction step
        singlem_packages : used to build the HmmDatabase
        assign_taxonomy : when falsy no taxonomic assignment is run
        known_sequence_taxonomy : path to a GreenGenes format taxonomy file
            giving per-sequence taxonomy; skipped when falsy
        working_directory : directory to work in, or None to use a temporary
            directory (under /dev/shm when that exists)
        force : when the working directory already exists, delete and
            re-create it instead of raising

        Returns
        -------
        OtuTable holding one row per OTU, or None when no sample had any hit.

        Raises
        ------
        KeyError
            when a required keyword argument is missing
        Exception
            when unexpected keyword arguments are given, or when the working
            directory already exists and force is not set
        '''
        forward_read_files = kwargs.pop('sequences')
        num_threads = kwargs.pop('threads')
        known_otu_tables = kwargs.pop('known_otu_tables')
        singlem_assignment_method = kwargs.pop('assignment_method')
        output_jplace = kwargs.pop('output_jplace')
        evalue = kwargs.pop('evalue')
        min_orf_length = kwargs.pop('min_orf_length')
        restrict_read_length = kwargs.pop('restrict_read_length')
        filter_minimum_protein = kwargs.pop('filter_minimum_protein')
        filter_minimum_nucleotide = kwargs.pop('filter_minimum_nucleotide')
        include_inserts = kwargs.pop('include_inserts')
        singlem_packages = kwargs.pop('singlem_packages')
        assign_taxonomy = kwargs.pop('assign_taxonomy')
        known_sequence_taxonomy = kwargs.pop('known_sequence_taxonomy')

        working_directory = kwargs.pop('working_directory')
        force = kwargs.pop('force')
        if len(kwargs) > 0:
            raise Exception("Unexpected arguments detected: %s" % kwargs)

        self._num_threads = num_threads
        self._evalue = evalue
        self._min_orf_length = min_orf_length
        self._restrict_read_length = restrict_read_length
        self._filter_minimum_protein = filter_minimum_protein
        self._filter_minimum_nucleotide = filter_minimum_nucleotide

        hmms = HmmDatabase(singlem_packages)
        # The "example best hit" method is run through GraftM as a plain
        # DIAMOND assignment; the difference only shows when parsing results.
        if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
            graftm_assignment_method = DIAMOND_ASSIGNMENT_METHOD
        else:
            graftm_assignment_method = singlem_assignment_method

        # Mirror this process's log level in the verbosity setting for GraftM.
        if logging.getLevelName(logging.getLogger().level) == 'DEBUG':
            self._graftm_verbosity = '5'
        else:
            self._graftm_verbosity = '2'

        if not assign_taxonomy:
            singlem_assignment_method = NO_ASSIGNMENT_METHOD

        using_temporary_working_directory = working_directory is None
        if using_temporary_working_directory:
            shared_mem_directory = '/dev/shm'
            if os.path.exists(shared_mem_directory):
                logging.debug("Using shared memory as a base directory")
                tmp = tempdir.TempDir(basedir=shared_mem_directory)
                tempfiles_path = os.path.join(tmp.name, 'tempfiles')
                os.mkdir(tempfiles_path)
                os.environ['TEMP'] = tempfiles_path
            else:
                logging.debug(
                    "Shared memory directory not detected, using default temporary directory instead"
                )
                tmp = tempdir.TempDir()
            working_directory = tmp.name
        else:
            if os.path.exists(working_directory):
                if force:
                    logging.info("Overwriting directory %s" %
                                 working_directory)
                    shutil.rmtree(working_directory)
                    os.mkdir(working_directory)
                else:
                    raise Exception(
                        "Working directory '%s' already exists, not continuing"
                        % working_directory)
            else:
                os.mkdir(working_directory)
        logging.debug("Using working directory %s" % working_directory)
        self._working_directory = working_directory

        def return_cleanly():
            # Only a temporary directory created here is removed; a
            # user-supplied working directory is left in place.
            if using_temporary_working_directory: tmp.dissolve()
            logging.info("Finished")

        #### Search
        self._singlem_package_database = hmms
        search_result = self._search(hmms, forward_read_files)
        sample_names = search_result.samples_with_hits()
        if len(sample_names) == 0:
            logging.info("No reads identified in any samples, stopping")
            return_cleanly()
            return None
        logging.debug("Recovered %i samples with at least one hit e.g. '%s'" \
                     % (len(sample_names), sample_names[0]))

        #### Alignment
        align_result = self._align(search_result)

        ### Extract reads that have already known taxonomy
        if known_otu_tables:
            logging.info("Parsing known taxonomy OTU tables")
            known_taxes = KnownOtuTable()
            known_taxes.parse_otu_tables(known_otu_tables)
            logging.debug("Read in %i sequences with known taxonomy" %
                          len(known_taxes))
        else:
            known_taxes = []
        if known_sequence_taxonomy:
            logging.debug("Parsing sequence-wise taxonomy..")
            # Close the taxonomy file as soon as it has been parsed.
            with open(known_sequence_taxonomy) as taxonomy_io:
                tax1 = GreenGenesTaxonomy.read(taxonomy_io).taxonomy
            known_sequence_tax = {}
            for seq_id, tax in tax1.items():
                known_sequence_tax[seq_id] = '; '.join(tax)
            logging.info(
                "Read in %i taxonomies from the GreenGenes format taxonomy file"
                % len(known_sequence_tax))

        ### Extract other reads which do not have known taxonomy
        extracted_reads = self._extract_relevant_reads(align_result,
                                                       include_inserts,
                                                       known_taxes)
        logging.info("Finished extracting aligned sequences")

        #### Taxonomic assignment
        if assign_taxonomy:
            logging.info("Running taxonomic assignment with GraftM..")
            assignment_result = self._assign_taxonomy(
                extracted_reads, graftm_assignment_method)

        #### Process taxonomically assigned reads
        # get the sequences out for each of them
        otu_table_object = OtuTable()
        if singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
            # Cache of parsed taxtastic taxonomies, keyed by package directory,
            # so each package's taxonomy file is read only once.
            package_to_taxonomy_bihash = {}

        for readset in extracted_reads:
            sample_name = readset.sample_name
            singlem_package = readset.singlem_package
            known_sequences = readset.known_sequences

            def add_info(infos, otu_table_object, known_tax):
                # Append one OTU table row per info for the current
                # package/sample pair.
                for info in infos:
                    to_print = [
                        singlem_package.graftm_package_basename(), sample_name,
                        info.seq, info.count, info.coverage, info.taxonomy,
                        info.names, info.aligned_lengths, known_tax
                    ]
                    otu_table_object.data.append(to_print)

            known_infos = self._seqs_to_counts_and_taxonomy(
                known_sequences, NO_ASSIGNMENT_METHOD, known_taxes,
                known_sequence_taxonomy, None)
            add_info(known_infos, otu_table_object, True)

            # if any sequences were aligned (not just already known)
            if len(readset.unknown_sequences) > 0:
                tmpbase = readset.tmpfile_basename

                if assign_taxonomy:
                    is_known_taxonomy = False
                    aligned_seqs = list(
                        itertools.chain(readset.unknown_sequences,
                                        readset.known_sequences))

                    if singlem_assignment_method == DIAMOND_EXAMPLE_BEST_HIT_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.diamond_assignment_file(
                            sample_name, singlem_package, tmpbase)
                        taxonomies = DiamondResultParser(tax_file)
                    elif singlem_assignment_method == DIAMOND_ASSIGNMENT_METHOD:
                        tax_file = assignment_result.read_tax_file(
                            sample_name, singlem_package, tmpbase)
                        if not os.path.isfile(tax_file):
                            logging.warning(
                                "Unable to find tax file for gene %s from sample %s "
                                "(likely do to min length filtering), skipping"
                                % (os.path.basename(
                                    singlem_package.base_directory()),
                                   sample_name))
                            taxonomies = {}
                        else:
                            taxonomies = TaxonomyFile(tax_file)

                    elif singlem_assignment_method == PPLACER_ASSIGNMENT_METHOD:
                        bihash_key = singlem_package.base_directory()
                        if bihash_key in package_to_taxonomy_bihash:
                            taxonomy_bihash = package_to_taxonomy_bihash[
                                bihash_key]
                        else:
                            taxtastic_taxonomy = singlem_package.graftm_package(
                            ).taxtastic_taxonomy_path()
                            logging.debug(
                                "Reading taxtastic taxonomy from %s" %
                                taxtastic_taxonomy)
                            with open(taxtastic_taxonomy) as f:
                                taxonomy_bihash = TaxonomyBihash.parse_taxtastic_taxonomy(
                                    f)
                            package_to_taxonomy_bihash[
                                bihash_key] = taxonomy_bihash
                        base_dir = assignment_result._base_dir(
                            sample_name, singlem_package, tmpbase)
                        jplace_file = os.path.join(base_dir,
                                                   "placements.jplace")
                        logging.debug(
                            "Attempting to read jplace output from %s" %
                            jplace_file)
                        if os.path.exists(jplace_file):
                            with open(jplace_file) as f:
                                jplace_json = json.loads(f.read())
                            placement_parser = PlacementParser(
                                jplace_json, taxonomy_bihash, 0.5)
                        else:
                            # Sometimes alignments are filtered out.
                            placement_parser = None
                        taxonomies = {}
                    elif singlem_assignment_method == NO_ASSIGNMENT_METHOD:
                        taxonomies = {}
                    else:
                        raise Exception("Programming error")

                else:  # Taxonomy has not been assigned.
                    aligned_seqs = readset.unknown_sequences
                    if known_sequence_taxonomy:
                        taxonomies = known_sequence_tax
                    else:
                        taxonomies = {}
                    is_known_taxonomy = True

                # placement_parser is only bound in the pplacer branch above,
                # hence the guard on the assignment method here.
                new_infos = list(
                    self._seqs_to_counts_and_taxonomy(
                        aligned_seqs, singlem_assignment_method,
                        known_sequence_tax if known_sequence_taxonomy else {},
                        taxonomies,
                        placement_parser if singlem_assignment_method
                        == PPLACER_ASSIGNMENT_METHOD else None))
                add_info(new_infos, otu_table_object, is_known_taxonomy)

                if output_jplace:
                    base_dir = assignment_result._base_dir(
                        sample_name, singlem_package, tmpbase)
                    input_jplace_file = os.path.join(base_dir,
                                                     "placements.jplace")
                    output_jplace_file = "%s_%s_%s.jplace" % (
                        output_jplace, sample_name,
                        singlem_package.graftm_package_basename())
                    logging.info("Writing jplace file '%s'" %
                                 output_jplace_file)
                    logging.debug(
                        "Converting jplace file %s to singlem jplace file %s" %
                        (input_jplace_file, output_jplace_file))
                    # Output is opened before input, as before; both handles
                    # are now closed when the conversion finishes or fails.
                    with open(output_jplace_file, 'w') as output_jplace_io, \
                            open(input_jplace_file) as input_jplace_io:
                        self._write_jplace_from_infos(input_jplace_io,
                                                      new_infos,
                                                      output_jplace_io)
        return_cleanly()
        return otu_table_object