Example #1
    def run(self, genome_files, scaffold_file, min_seq_len):
        """Fragment genome sequences into fragments of a fixed size.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        scaffold_file : str
            Fasta file of all scaffolds used to generate the putative genomes.
        min_seq_len : int
            Ignore scaffolds shorter than the specified length.

        Returns
        -------
        dict : d[seq_id] -> seq
            Dictionary of unbinned sequences.
        """

        check_file_exists(scaffold_file)

        # get list of sequences in bins
        self.logger.info('Reading binned scaffolds.')

        binned_seq_ids = set()
        total_binned_bases = 0
        for genome_file in genome_files:
            for seq_id, seq in seq_io.read_seq(genome_file):
                binned_seq_ids.add(seq_id)
                total_binned_bases += len(seq)

        self.logger.info(
            'Read %d (%.2f Mbp) binned scaffolds.' %
            (len(binned_seq_ids), float(total_binned_bases) / 1e6))

        # write all unbinned sequences
        self.logger.info('Identifying unbinned scaffolds >= %d bp.' %
                         min_seq_len)

        unbinned_bases = 0
        unbinned_seqs = {}
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
                unbinned_seqs[seq_id] = seq
                unbinned_bases += len(seq)

        self.logger.info('Identified %d (%.2f Mbp) unbinned scaffolds.' %
                         (len(unbinned_seqs), float(unbinned_bases) / 1e6))

        self.logger.info('Percentage of unbinned scaffolds: %.2f%%' %
                         (len(unbinned_seqs) * 100.0 /
                          (len(unbinned_seqs) + len(binned_seq_ids))))
        self.logger.info('Percentage of unbinned bases: %.2f%%' %
                         (unbinned_bases * 100.0 /
                          (unbinned_bases + total_binned_bases)))

        return unbinned_seqs
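
Every example on this page iterates FASTA files with biolib's seq_io.read_seq. For experimenting with the snippets without installing biolib, a minimal stand-in generator could look like the sketch below; this is an assumption for illustration, not biolib's actual implementation.

def read_seq(fasta_file, keep_annotation=False):
    """Minimal FASTA iterator: yields (seq_id, seq), or (seq_id, seq, annotation)."""
    seq_id, annotation, seq_lines = None, '', []
    with open(fasta_file) as f:
        for line in f:
            line = line.rstrip()
            if line.startswith('>'):
                if seq_id is not None:
                    seq = ''.join(seq_lines)
                    yield (seq_id, seq, annotation) if keep_annotation else (seq_id, seq)
                header = line[1:].split(None, 1)
                seq_id = header[0] if header else ''
                annotation = header[1] if len(header) == 2 else ''
                seq_lines = []
            elif line:
                seq_lines.append(line)
    if seq_id is not None:
        seq = ''.join(seq_lines)
        yield (seq_id, seq, annotation) if keep_annotation else (seq_id, seq)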
Example #2
    def run(self, genome_files, scaffold_file, min_seq_len):
        """Fragment genome sequences into fragments of a fixed size.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        scaffold_file : str
            Fasta file of all scaffolds used to generate the putative genomes.
        min_seq_len : int
            Ignore scaffolds shorter than the specified length.

        Returns
        -------
        dict : d[seq_id] -> seq
            Dictionary of unbinned sequences.
        """

        check_file_exists(scaffold_file)

        # get list of sequences in bins
        self.logger.info('')
        self.logger.info('  Reading binned scaffolds.')

        binned_seq_ids = set()
        total_binned_bases = 0
        for genome_file in genome_files:
            for seq_id, seq in seq_io.read_seq(genome_file):
                binned_seq_ids.add(seq_id)
                total_binned_bases += len(seq)

        self.logger.info('    Read %d (%.2f Mbp) binned scaffolds.' % (len(binned_seq_ids), float(total_binned_bases) / 1e6))

        # write all unbinned sequences
        self.logger.info('')
        self.logger.info('  Identifying unbinned scaffolds >= %d bp.' % min_seq_len)

        unbinned_bases = 0
        unbinned_seqs = {}
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
                unbinned_seqs[seq_id] = seq
                unbinned_bases += len(seq)

        self.logger.info('    Identified %d (%.2f Mbp) unbinned scaffolds.' % (len(unbinned_seqs), float(unbinned_bases) / 1e6))

        self.logger.info('')
        self.logger.info('  Percentage of unbinned scaffolds: %.2f%%' % (len(unbinned_seqs) * 100.0 / (len(unbinned_seqs) + len(binned_seq_ids))))
        self.logger.info('  Percentage of unbinned bases: %.2f%%' % (unbinned_bases * 100.0 / (unbinned_bases + total_binned_bases)))

        return unbinned_seqs
Example #3
    def _fragment_genomes(self, genome_file, window_size, step_size, profile,
                          fout):
        """Fragment genome sequences into fragments of a fixed size.

        This is a helper function for fragmenting sequences within
        a genome which will be classified in order to create a
        taxonomic profile.

        Parameters
        ----------
        genome_file : str
            Fasta file with genome sequences to fragment.
        window_size : int
            Size of each fragment.
        step_size : int
            Number of bases to move after each window.
        profile : Profile
            Class for classifying fragments.
        fout : stream
            Output stream to store all fragments.
        """

        for seq_id, seq in seq_io.read_seq(genome_file):
            fragments = seq_tk.fragment(seq, window_size, step_size)
            for i, frag in enumerate(fragments):
                fout.write('>' + seq_id + '~' + str(i) + '\n')
                fout.write(frag + '\n')

                profile.fragments_from_seq[seq_id] = len(fragments)
                profile.seq_len[seq_id] = len(seq)
Example #4
File: common.py  Project: wwood/RefineM
def concatenate_gene_files(gene_files, concatenated_gene_file):
    """Combine all gene files into a single file.

    Gene ids are modified to include genome ids in order to ensure
    all gene identifiers are unique across the set of genomes.

    Parameters
    ----------
    gene_files : list of str
        Fasta files of called genes to process.
    concatenated_gene_file : str
        Name of file to contain concatenated gene files.
    """

    fout = open(concatenated_gene_file, 'w')

    for gf in gene_files:
        genome_id = remove_extension(gf)

        for seq_id, seq in seq_io.read_seq(gf):
            fout.write('>' + genome_id + '~' + seq_id + '\n')
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')

    fout.close()
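
A hedged usage sketch (file names hypothetical); with the genome id prepended, the source genome of any gene can later be recovered by splitting the header on '~':

gene_files = ['genome_A.faa', 'genome_B.faa']
concatenate_gene_files(gene_files, 'all_genes.faa')
# headers in all_genes.faa now read '>genome_A~<original gene id>'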
Example #5
def read_bins(bin_dirs):
    """Read sequences in bins."""

    bins = defaultdict(lambda: defaultdict(set))
    contigs = {}
    contigs_in_bins = defaultdict(lambda: {})
    for method_id, (bin_dir, bin_ext) in bin_dirs.items():
        for bf in os.listdir(bin_dir):
            if not bf.endswith(bin_ext):
                continue

            bin_id = bf[0:bf.rfind(bin_ext)]
            if bin_id[-1] == '.':
                bin_id = bin_id[0:-1]
            bf_path = os.path.join(bin_dir, bf)

            for seq_id, seq in seq_io.read_seq(bf_path):
                bins[method_id][bin_id].add(seq_id)
                contigs[seq_id] = seq
                contigs_in_bins[seq_id][method_id] = bin_id

            if len(bins[method_id][bin_id]) == 0:
                # module-level function: use the logging module, not self.logger
                logging.warning('Bin %s from %s is empty.' % (bf, method_id))

    return bins, contigs, contigs_in_bins
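
The expected structure of bin_dirs can be inferred from the loop: a dict mapping a binning method id to a (directory, file extension) tuple. A usage sketch with hypothetical paths:

bin_dirs = {'metabat': ('bins/metabat', '.fa'),
            'maxbin': ('bins/maxbin', '.fasta')}
bins, contigs, contigs_in_bins = read_bins(bin_dirs)
# bins['metabat']['bin_1'] -> set of contig ids in that bin
# contigs_in_bins['contig_42'] -> {'metabat': 'bin_1', 'maxbin': 'bin_7'}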
Example #6
    def create_arb_metadata(self, msa_output, taxonomy, metadata, output_file):
        """Create metadata file suitable for import into ARB.

        Parameters
        ----------
        msa_output : str
            Fasta file with aligned homologs.
        taxonomy : d[genome_id] -> list of taxa
            Taxonomic information for genomes.
        metadata : d[key] -> string
            Additional metadata to write to ARB file.
        output_file : str
            File to write metadata information.
        """

        arb_metadata_list = []
        for seq_id, seq in seq_io.read_seq(msa_output):
            arb_metadata = {}
            arb_metadata['db_name'] = seq_id
            arb_metadata['genome_id'] = seq_id
            arb_metadata['gtdb_tax_string'] = ';'.join(taxonomy.get(
                seq_id, ''))
            arb_metadata['aligned_seq'] = seq

            for k, v in metadata.items():
                arb_metadata[k] = v

            arb_metadata_list.append(arb_metadata)

        fout = open(output_file, 'w')
        arb_parser = ArbParser()
        arb_parser.write(arb_metadata_list, fout)
        fout.close()
Example #7
def concatenate_gene_files(gene_files, concatenated_gene_file):
    """Combine all gene files into a single file.

    Gene ids are modified to include genome ids in order to ensure
    all gene identifiers are unique across the set of genomes.

    Parameters
    ----------
    gene_files : list of str
        Fasta files of called genes to process.
    concatenated_gene_file : str
        Name of file to contain concatenated gene files.
    """

    fout = open(concatenated_gene_file, 'w')

    for gf in gene_files:
        genome_id = remove_extension(gf)

        for seq_id, seq in seq_io.read_seq(gf):
            fout.write('>' + seq_id + '~' + genome_id + '\n')
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')

    fout.close()
Example #8
    def run(self, ssu_gtdb_taxonomy_file, silva_parc_fasta_file):
        """Check INSDC primary accession numbers of GTDB SSU sequences against those in SILVA."""

        print('Reading SILVA INSDC accession numbers.')
        silva_ids = set()
        for seq_id, seq in seq_io.read_seq(silva_parc_fasta_file):
            silva_ids.add(seq_id)
        print('Read %d accession numbers.' % len(silva_ids))

        print('Checking GTDB SSU INSDC accession numbers.')
        missing_silva_acc = 0
        num_genes = 0
        for line in open(ssu_gtdb_taxonomy_file):
            line_split = line.strip().split('\t')

            gid = line_split[0]
            gene_id = line_split[1]
            gene_id = gene_id[0:gene_id.rfind('.')]
            start = int(line_split[2])
            stop = int(line_split[3])
            accession = '%s.%d.%d' % (gene_id, start, stop)
            num_genes += 1

            if accession not in silva_ids and (stop - start) > 300:
                print('Missing INSDC accession in SILVA for genome %s: %s (len=%d)' % (gid, accession, stop - start))
                missing_silva_acc += 1

        print('Identified %d of %d (%.2f%%) genes without a SILVA accession.' % (missing_silva_acc, num_genes, missing_silva_acc * 100.0 / num_genes))
Example #9
    def add_compatible(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        scaffold_ids = set()
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Median genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Scaffold coverage')
            genome_cov_index = headers.index('Median genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_gc = float(line_split[scaffold_gc_index])
                genome_gc = float(line_split[genome_gc_index])
                gc_dist = abs(scaffold_gc - genome_gc)

                td_dist = float(line_split[td_dist_index])

                scaffold_cov = float(line_split[scaffold_cov_index])
                genome_cov = float(line_split[genome_cov_index])
                cov_dist = abs(scaffold_cov - genome_cov)
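                # note: the distance statistics parsed above are not used by this
                # variant; only the bin assignment on each line is checked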

                if bin_id == cur_bin_id:
                    scaffold_ids.add(scaffold_id)

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in scaffold_ids:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1
                
        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #10
File: outliers.py  Project: wwood/RefineM
    def add_compatible_unique(self, scaffold_file, genome_file,
                              compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        Only sequences specified exactly once in the
        compatibility file are added.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine scaffolds compatible with genome
        scaffold_ids = []
        bin_ids = {}
        with open(compatible_file) as f:
            f.readline()

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_ids.append(scaffold_id)
                bin_ids[scaffold_id] = bin_id

        compatible_scaffolds = set()
        for scaffold_id, bin_id in bin_ids.items():
            if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        self.logger.info('Identified %d compatible scaffolds.' %
                         len(compatible_scaffolds))

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1

        self.logger.info('Added %d scaffolds meeting length criterion.' %
                         added_seqs)

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #11
    def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file, silva_ssu_ref,
            silva_lsu_ref, ssu_blast_table, lsu_blast_table, output_dir):
        """Create table assigning GTDB taxonomy to SILVA accessions based on SSU and LSU BLAST results."""

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # read GTDB taxonomy
        print('Reading GTDB taxonomy.')
        gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
        gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
        gtdb_taxonomy = gtdb_bac_taxonomy.copy()
        gtdb_taxonomy.update(gtdb_ar_taxonomy)

        print('Identified %d bacterial genomes to process.' %
              len(gtdb_bac_taxonomy))
        print('Identified %d archaeal genomes to process.' %
              len(gtdb_ar_taxonomy))
        print('Identified %d genomes to process.' % len(gtdb_taxonomy))

        # read SILVA taxonomy
        print('Reading SILVA 16S and 23S rRNA taxonomies.')
        silva_ssu_taxonomy = {}
        for seq_id, seq, taxonomy in seq_io.read_seq(silva_ssu_ref,
                                                     keep_annotation=True):
            silva_ssu_taxonomy[seq_id] = taxonomy

        silva_lsu_taxonomy = {}
        for seq_id, seq, taxonomy in seq_io.read_seq(silva_lsu_ref,
                                                     keep_annotation=True):
            silva_lsu_taxonomy[seq_id] = taxonomy

        # parse BLAST tables
        print('Parsing BLAST tables.')

        ssu_table = os.path.join(output_dir, 'ssu_silva.tsv')
        self._parse_blast_table(ssu_blast_table, gtdb_taxonomy,
                                silva_ssu_taxonomy, self.min_ssu_len,
                                ssu_table)

        lsu_table = os.path.join(output_dir, 'lsu_silva.tsv')
        self._parse_blast_table(lsu_blast_table, gtdb_taxonomy,
                                silva_lsu_taxonomy, self.min_lsu_len,
                                lsu_table)
Example #12
    def read_msa_file(self, msa_file):
        """Determine percentage of amino acids for each genome in MSA file."""

        msa_perc = {}
        for seq_id, seq in read_seq(msa_file):
            seq = seq.upper()
            aa = len(seq) - seq.count('-') - seq.count('_') - seq.count('*')
            msa_perc[seq_id] = aa * 100.0 / len(seq)

        return msa_perc
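
The percentage counts every column that is not a gap ('-', '_') or a stop/missing marker ('*'). A worked toy example:

seq = 'MKL--V*_'
aa = len(seq) - seq.count('-') - seq.count('_') - seq.count('*')
print(aa * 100.0 / len(seq))  # 4 residues over 8 columns -> 50.0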
Example #13
    def _gene_distribution(self, seq_file):
        """Calculate length distribution of sequences."""

        gene_lens = []
        for seq_id, seq in seq_io.read_seq(seq_file):
            gene_lens.append(len(seq))

        p10, p50, p90 = np_percentile(gene_lens, [10, 50, 90])

        return np_mean(gene_lens), max(gene_lens), min(gene_lens), p10, p50, p90
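
np_mean and np_percentile are presumably NumPy functions aliased at import time, e.g.:

from numpy import mean as np_mean
from numpy import percentile as np_percentile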
Example #14
    def _remove_stop_codons(self, input_file, output_file):
        """Remove stop codons at end of sequences."""

        fout = open(output_file, 'w')
        for seq_id, seq, annotation in seq_io.read_seq(input_file,
                                                       keep_annotation=True):
            fout.write('>%s %s\n' % (seq_id, annotation))

            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write('%s\n' % seq)
        fout.close()
Example #15
    def place_genomes(self, user_msa_file, marker_set_id, out_dir, prefix):
        """Place genomes into reference tree using pplacer."""

        # rename user MSA file for compatibility with pplacer
        if not user_msa_file.endswith('.fasta'):
            t = os.path.join(out_dir, prefix + '.user_msa.fasta')
            shutil.copyfile(user_msa_file, t)
            user_msa_file = t

        # run pplacer to place bins in reference genome tree
        num_genomes = sum([1 for _seq_id, _seq in read_seq(user_msa_file)])

        # get path to pplacer reference package
        if marker_set_id == 'bac120':
            self.logger.info(
                'Placing %d bacterial genomes into reference tree with pplacer (be patient).'
                % num_genomes)
            pplacer_ref_pkg = os.path.join(Config.PPLACER_DIR,
                                           Config.PPLACER_BAC120_REF_PKG)
        elif marker_set_id == 'ar122':
            self.logger.info(
                'Placing %d archaeal genomes into reference tree with pplacer (be patient).'
                % num_genomes)
            pplacer_ref_pkg = os.path.join(Config.PPLACER_DIR,
                                           Config.PPLACER_AR122_REF_PKG)
        elif marker_set_id == 'rps23':
            self.logger.info(
                'Placing %d genomes into reference tree with pplacer (be patient).'
                % num_genomes)
            pplacer_ref_pkg = os.path.join(Config.PPLACER_DIR,
                                           Config.PPLACER_RPS23_REF_PKG)
        else:
            # guard against pplacer_ref_pkg being undefined below
            raise ValueError('Unknown marker set: %s' % marker_set_id)

        pplacer_out_dir = os.path.join(out_dir, 'pplacer')
        if not os.path.exists(pplacer_out_dir):
            os.makedirs(pplacer_out_dir)

        pplacer_out = os.path.join(pplacer_out_dir,
                                   'pplacer.%s.out' % marker_set_id)
        pplacer_json_out = os.path.join(pplacer_out_dir,
                                        'pplacer.%s.json' % marker_set_id)
        cmd = 'pplacer -j %d -c %s -o %s %s > %s' % (
            self.cpus, pplacer_ref_pkg, pplacer_json_out, user_msa_file,
            pplacer_out)
        os.system(cmd)

        # extract tree
        tree_file = os.path.join(out_dir,
                                 prefix + ".%s.classify.tree" % marker_set_id)
        cmd = 'guppy tog -o %s %s' % (tree_file, pplacer_json_out)
        os.system(cmd)

        return tree_file
Example #16
    def read_ssu_file(self, ssu_fasta_file):
        """Read length of SSU sequences for genomes."""

        ssu_length = {}
        for seq_id, seq in read_seq(ssu_fasta_file):
            gid = seq_id.split('~')[0]

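            # keep only the longest SSU sequence per genome; note that the new
            # sequence's raw length is compared against the stored N-adjusted length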
            if gid in ssu_length and len(seq) < ssu_length[gid]:
                continue

            ssu_length[gid] = len(seq) - seq.upper().count('N')

        return ssu_length
Example #17
    def _derep_msa(self, msa_file, selected_taxa, output_msa):
        """Dereplicate multiple sequence alignment."""

        selected_taxa_labels = set()
        for taxon in selected_taxa:
            selected_taxa_labels.add(taxon.label)

        fout = open(output_msa, 'w')
        for seq_id, seq, annotation in read_seq(msa_file,
                                                keep_annotation=True):
            if seq_id in selected_taxa_labels:
                fout.write('>%s %s\n' % (seq_id, annotation))
                fout.write('%s\n' % seq)
        fout.close()
Example #18
File: genome_tk.py  Project: wwood/biolib
def modify(input_file, scaffold_file, seqs_to_add, seqs_to_remove,
           output_file):
    """Add or remove scaffolds from a fasta file.

    Parameters
    ----------
    input_file : str
        Fasta file to modify.
    scaffold_file : str
        Fasta file containing scaffolds to add.
    seqs_to_add: iterable
        Unique ids of scaffolds to add.
    seqs_to_remove : iterable
        Unique ids of scaffolds to remove.
    output_file : str
        Desired name of modified fasta file.

    Returns
    -------
    iterable, iterable
        Unique ids of sequences that could not be added,
        unique ids of sequences that could not be removed.
    """

    seqs = seq_io.read(input_file)

    # add sequences to bin
    failed_to_add = set()
    if seqs_to_add:
        failed_to_add = set(seqs_to_add)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in seqs_to_add:
                failed_to_add.remove(seq_id)
                seqs[seq_id] = seq

    # remove sequences from bin
    failed_to_remove = set()
    if seqs_to_remove:
        failed_to_remove = set(seqs_to_remove)
        for seq_id in seqs_to_remove:
            if seq_id in seqs:
                failed_to_remove.remove(seq_id)
                seqs.pop(seq_id)

    # save modified bin
    seq_io.write_fasta(seqs, output_file)

    return failed_to_add, failed_to_remove
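
A usage sketch with hypothetical file and scaffold ids:

failed_to_add, failed_to_remove = modify('bin1.fna', 'scaffolds.fna',
                                         seqs_to_add={'scaffold_10'},
                                         seqs_to_remove={'scaffold_3'},
                                         output_file='bin1.modified.fna')
if failed_to_add or failed_to_remove:
    print('%d additions and %d removals could not be applied.'
          % (len(failed_to_add), len(failed_to_remove)))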
Example #19
    def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, out_genome):
        """Add sequences specified as compatible.

        Only sequences specified exactly once in the
        compatibility file are added.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine scaffolds compatible with genome
        scaffold_ids = []
        bin_ids = {}
        with open(compatible_file) as f:
            f.readline()

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_ids.append(scaffold_id)
                bin_ids[scaffold_id] = bin_id

        compatible_scaffolds = set()
        for scaffold_id, bin_id in bin_ids.items():
            if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        # add compatible sequences to genome
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                genome_seqs[seq_id] = seq

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #20
File: genome_tk.py  Project: wwood/biolib
def unique(genome_files):
    """Check if sequences are assigned to multiple bins.

    Parameters
    ----------
    genome_files : iterable
        Path to genome fasta files.

    Returns
    -------
    dict : d[genome_id][genome_id] -> [shared sequences]
        List of any sequences within a genome observed multiple times.
    """

    # read sequence IDs from all genomes,
    # while checking for duplicate sequences within a genome
    duplicates = defaultdict(lambda: defaultdict(list))

    genome_seqs = {}
    for f in genome_files:
        genome_id = remove_extension(f)

        seq_ids = set()
        for seq_id, _seq in seq_io.read_seq(f):
            if seq_id in seq_ids:
                duplicates[genome_id][genome_id].append(seq_id)

            seq_ids.add(seq_id)

        genome_seqs[genome_id] = seq_ids

    # check for sequences assigned to multiple bins
    genome_ids = list(genome_seqs.keys())
    for i in range(0, len(genome_ids)):
        seq_idsI = genome_seqs[genome_ids[i]]

        for j in range(i + 1, len(genome_ids)):
            seq_idsJ = genome_seqs[genome_ids[j]]

            seq_intersection = seq_idsI.intersection(seq_idsJ)

            if len(seq_intersection) > 0:
                duplicates[genome_ids[i]][genome_ids[j]] = seq_intersection
                duplicates[genome_ids[j]][genome_ids[i]] = seq_intersection

    return duplicates
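
A sketch of reporting the result (paths hypothetical):

duplicates = unique(['bin1.fna', 'bin2.fna'])
for gid_i, dupes in duplicates.items():
    for gid_j, shared_seqs in dupes.items():
        print('%s and %s share %d sequence(s).'
              % (gid_i, gid_j, len(shared_seqs)))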
Example #21
File: common.py  Project: wwood/GeneTreeTk
def validate_seq_ids(query_proteins):
    """Ensure all sequence identifiers contain only acceptable characters.

    Parameters
    ----------
    query_proteins : str
        Fasta file containing query proteins.
    """

    invalid_chars = set('()[],;=')
    for seq_id, _seq in seq_io.read_seq(query_proteins):
        if any((c in invalid_chars) for c in seq_id):
            logging.getLogger('no_timestamp').error(
                'Invalid sequence header in file %s' % query_proteins)
            logging.getLogger('no_timestamp').error(
                'Sequence contains an invalid character: %s' % seq_id)
            logging.getLogger('no_timestamp').error(
                'Sequence identifiers must not contain the following characters: '
                + ''.join(invalid_chars))
            sys.exit()
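
A header such as '>gene[1]' would trigger the error path above and terminate the program. A non-fatal variant that simply collects the offending ids might look like this sketch (find_invalid_seq_ids is a hypothetical helper, not part of GeneTreeTk):

def find_invalid_seq_ids(query_proteins):
    """Return sequence ids containing characters flagged as invalid."""
    invalid_chars = set('()[],;=')
    bad_ids = []
    for seq_id, _seq in seq_io.read_seq(query_proteins):
        if any(c in invalid_chars for c in seq_id):
            bad_ids.append(seq_id)
    return bad_ids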
Example #22
    def _genome_seqs(self, genome_files):
        """Get unique id of sequences in each genome.

        Parameters
        ----------
        genome_files : iterable
            Genome files in fasta format.

        Returns
        -------
        dict: d[genome_id] -> set(seq_id1, ..., seq_idN)
            Ids of sequences in each genome.
        """

        genome_seqs = defaultdict(set)
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            for seq_id, _seq in seq_io.read_seq(genome_file):
                genome_seqs[genome_id].add(seq_id)

        return genome_seqs
Example #23
    def write_windows(self, scaffold_file, output_dir, window_size, window_gap):
        '''Write a fasta file of windows made from the scaffolds in a scaffold file.

        Takes a scaffold file in fasta format and writes a similarly named
        fasta file of the windows made from its scaffolds.

        Input: scaffold_file
                    The name of the fasta file to turn into windows.
        Output:
                Writes a links file and a file of windows.
        '''

        seq_win_id = {}   # pairs a scaffold with the windows made from it
        window_dict = {}  # window_dict[win_id] = seq_win

        for seq_id, sequence in seq_io.read_seq(scaffold_file):
            win_id, seq_win = self.make_windows([seq_id, sequence], window_size, window_gap)
            seq_win_id[seq_id] = win_id
            for i in range(0, len(win_id)):
                window_dict[win_id[i]] = seq_win[i]

        filename = os.path.split(scaffold_file)[1]
        start = ".".join(filename.split('.')[:-1])
        end = filename.split('.')[-1]
        window_file = os.path.join(output_dir, start + "windows." + end)

        print(len(window_dict))
        seq_io.write_fasta(window_dict, window_file)

        links_file = os.path.join(output_dir, "links_file.tsv")
        self.write_links(seq_win_id, links_file)
        return [window_file, links_file]
Example #24
    def combine(self, ssu_msa, ssu_tree, lsu_msa, lsu_tree, output_dir):
        """Infer 16S + 23S tree spanning GTDB genomes."""

        # identify common 16S and 23S sequences
        ssu_seqs = {}
        for seq_id, seq, annotation in seq_io.read_seq(ssu_msa,
                                                       keep_annotation=True):
            genome_id = seq_id.split('~')[0]
            ssu_seqs[genome_id] = [seq, annotation]
        self.logger.info('Read %d SSU rRNA sequences.' % len(ssu_seqs))

        lsu_seqs = {}
        for seq_id, seq, annotation in seq_io.read_seq(lsu_msa,
                                                       keep_annotation=True):
            genome_id = seq_id.split('~')[0]
            lsu_seqs[genome_id] = [seq, annotation]
        self.logger.info('Read %d LSU rRNA sequences.' % len(lsu_seqs))

        common_seqs = set(ssu_seqs.keys()).intersection(set(lsu_seqs.keys()))
        self.logger.info('Identified %d sequences in common.' %
                         len(common_seqs))

        # identify incongruent taxonomic order classifications between trees
        self.logger.info(
            'Identifying incongruent order-level taxonomic classifications between trees.'
        )
        ssu_taxonomy = Taxonomy().read_from_tree(ssu_tree)
        lsu_taxonomy = Taxonomy().read_from_tree(lsu_tree)

        order_index = Taxonomy.rank_labels.index('order')

        seqs_to_filter = set()
        for seq_id in common_seqs:
            ssu_order = ssu_taxonomy.get(seq_id)[order_index][3:]
            lsu_order = lsu_taxonomy.get(seq_id)[order_index][3:]

            # remove designator of paraphyletic orders
            # (since in the concatenated tree this may be resolved)
            ssu_order = ssu_order.split('_')[0]
            lsu_order = lsu_order.split('_')[0]

            if ssu_order != lsu_order:
                seqs_to_filter.add(seq_id)

        self.logger.info(
            'Identified %d sequences with incongruent classifications.' %
            len(seqs_to_filter))
        common_seqs.difference_update(seqs_to_filter)

        # write out MSA
        concatenated_msa = os.path.join(output_dir, 'ssu_lsu_concatenated.fna')
        fout = open(concatenated_msa, 'w')
        for seq_id in common_seqs:
            fout.write('>%s %s %s\n' %
                       (seq_id, ssu_seqs[seq_id][1], lsu_seqs[seq_id][1]))
            fout.write('%s%s\n' % (ssu_seqs[seq_id][0], lsu_seqs[seq_id][0]))
        fout.close()

        # infer tree
        output_tree = os.path.join(output_dir, 'ssu_lsu_concatenated.tree')
        os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                  (concatenated_msa, output_tree))
Example #25
    def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        A sequence is added to a bin if and only if it is
        closest to that bin in GC, tetranucleotide, and
        coverage space.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        scaffold_ids = defaultdict(dict)
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Median genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Scaffold coverage')
            genome_cov_index = headers.index('Median genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_gc = float(line_split[scaffold_gc_index])
                genome_gc = float(line_split[genome_gc_index])
                gc_dist = abs(scaffold_gc - genome_gc)

                td_dist = float(line_split[td_dist_index])

                scaffold_cov = float(line_split[scaffold_cov_index])
                genome_cov = float(line_split[genome_cov_index])
                cov_dist = abs(scaffold_cov - genome_cov)

                scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

        # determine scaffolds that are closest to a single bin
        # in terms of GC, tetranucleotide distance, and coverage
        compatible_scaffolds = set()
        for scaffold_id, bin_stats in scaffold_ids.items():
            best_gc = [1e9, None]
            best_td = [1e9, None]
            best_cov = [1e9, None]
            for bin_id, stats in bin_stats.items():
                gc, td, cov = stats
                if gc < best_gc[0]:
                    best_gc = [gc, bin_id]
                if td < best_td[0]:
                    best_td = [td, bin_id]
                if cov < best_cov[0]:
                    best_cov = [cov, bin_id]

            # check if scaffold is closest to a single bin
            if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)
                
        self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1
                
        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #26
    def run(self, scaffold_file, genome_files, tetra_file, coverage_file, output_file):
        """Calculate statistics for scaffolds.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds.
        genome_files : list of str
            Fasta files with binned scaffolds.
        tetra_file : str
            Tetranucleotide signatures for scaffolds.
        coverage_file : str
            Coverage profiles for scaffolds.
        output_file : str
            Output file for scaffolds statistics.
        """

        tetra = Tetranucleotide(self.cpus)
        signatures = tetra.read(tetra_file)

        cov_profiles = None
        if coverage_file:
            coverage = Coverage(self.cpus)
            cov_profiles, _ = coverage.read(coverage_file)

        # determine bin assignment for each scaffold
        self.logger.info('')
        self.logger.info('  Determining scaffold statistics.')

        scaffold_id_genome_id = {}
        for gf in genome_files:
            genome_id = remove_extension(gf)
            for scaffold_id, _seq in seq_io.read_seq(gf):
                scaffold_id_genome_id[scaffold_id] = genome_id

        # write out scaffold statistics
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

        if cov_profiles:
            bam_ids = sorted(cov_profiles[next(iter(cov_profiles))].keys())
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)

        for kmer in tetra.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')

        for scaffold_id, seq in seq_io.read_seq(scaffold_file):
            fout.write(scaffold_id)
            fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
            fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
            fout.write('\t%d' % len(seq))

            if cov_profiles:
                for bam_id in bam_ids:
                    fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

            fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
            fout.write('\n')

        fout.close()
Example #27
    def run_seqs_file(self, producer, consumer, seq_file, progress=None):
        """Process sequences in parallel.

        The producer function must be specified and must
        not return None. Consumer and progress can be set to None.

        Parameters
        ----------
        producer : function
            Function to process data items.
        consumer : function
            Function to consume processed data items.
        seq_file : str
            Name of fasta/q file to read.
        progress : function
            Function to report progress string.

        Returns
        -------
        <user specified>
            Set by caller in the consumer function.
        """

        # populate producer queue with data to process
        seq_iter = seq_io.read_seq(seq_file)
        producer_queue = mp.Queue()
        read_all_seqs = False
        for _ in range(self.cpus):
            try:
                seq_data = next(seq_iter)
                producer_queue.put(seq_data)
            except StopIteration:
                read_all_seqs = True
                for _ in range(self.cpus):
                    producer_queue.put(None)  # signal processes to terminate
                break

        data_items = sum(1 for _ in seq_io.read_seq(seq_file))
        try:
            consumer_queue = mp.Queue()
            manager_proc = mp.Process(target=self.__process_manager,
                                      args=(producer, producer_queue,
                                            consumer_queue))

            manager_proc.start()

            # process items produced by workers
            items_processed = 0
            consumer_data = None
            while True:
                if progress:
                    status = progress(items_processed, data_items)
                    sys.stdout.write('%s\r' % status)
                    sys.stdout.flush()

                produced_data = consumer_queue.get(block=True, timeout=None)
                if produced_data is None:
                    break

                if not read_all_seqs:
                    try:
                        seq_data = next(seq_iter)
                        producer_queue.put(seq_data)
                    except StopIteration:
                        read_all_seqs = True
                        for _ in range(self.cpus):
                            producer_queue.put(
                                None)  # signal processes to terminate

                if consumer:
                    consumer_data = consumer(produced_data, consumer_data)

                items_processed += 1

            if progress:
                sys.stdout.write('\n')

            manager_proc.join()

            return consumer_data
        except Exception as _err:
            print(sys.exc_info()[0])
            print(traceback.format_exc())
            self.logger.warning('Exception encountered while processing data.')
            manager_proc.terminate()
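
Given the documented contract (the producer must not return None; the consumer folds produced items into an accumulator that starts as None), a usage sketch computing GC content per sequence might look like this (the instance name parallel is hypothetical):

def gc_producer(seq_data):
    # seq_data is a (seq_id, seq) tuple from seq_io.read_seq
    seq_id, seq = seq_data
    seq = seq.upper()
    return seq_id, (seq.count('G') + seq.count('C')) * 100.0 / len(seq)

def gc_consumer(produced_data, consumer_data):
    # consumer_data is None on the first call; accumulate results in a dict
    if consumer_data is None:
        consumer_data = {}
    seq_id, gc_perc = produced_data
    consumer_data[seq_id] = gc_perc
    return consumer_data

gc_by_seq = parallel.run_seqs_file(gc_producer, gc_consumer, 'scaffolds.fna')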
Example #28
    def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, out_genome):
        """Add sequences specified as compatible.

        A sequence is added to a bin if and only if it is
        closest to that bin in GC, tetranucleotide, and
        coverage space.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        scaffold_ids = defaultdict(dict)
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Mean genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Mean scaffold coverage')
            genome_cov_index = headers.index('Mean genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_gc = float(line_split[scaffold_gc_index])
                genome_gc = float(line_split[genome_gc_index])
                gc_dist = abs(scaffold_gc - genome_gc)

                td_dist = float(line_split[td_dist_index])

                scaffold_cov = float(line_split[scaffold_cov_index])
                genome_cov = float(line_split[genome_cov_index])
                cov_dist = abs(scaffold_cov - genome_cov)

                scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

        # determine scaffolds that are closest to a single bin
        # in terms of GC, tetranucleotide distance, and coverage
        compatible_scaffolds = set()
        for scaffold_id, bin_stats in scaffold_ids.items():
            best_gc = [1e9, None]
            best_td = [1e9, None]
            best_cov = [1e9, None]
            for bin_id, stats in bin_stats.items():
                gc, td, cov = stats
                if gc < best_gc[0]:
                    best_gc = [gc, bin_id]
                if td < best_td[0]:
                    best_td = [td, bin_id]
                if cov < best_cov[0]:
                    best_cov = [cov, bin_id]

            # check if scaffold is closest to a single bin
            if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        # add compatible sequences to genome
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                genome_seqs[seq_id] = seq

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #29
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity):
        """Identify scaffolds with homology to a set of reference genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold used by blast.
        per_identity : float
            Percent identity threshold used by blast.
        """

        # read statistics file
        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('')
        self.logger.info('  Creating diamond database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.make_database(ref_gene_file, ref_diamond_db)

        self.logger.info('  Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes_daa = os.path.join(self.diamond_dir, 'ref_hits')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, 1, hits_ref_genomes_daa)

        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.view(hits_ref_genomes_daa + '.daa', hits_ref_genomes)

        self.logger.info('  Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes_daa = os.path.join(self.diamond_dir, 'competing_ref_hits')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, 1, hits_comp_ref_genomes_daa)

        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.view(hits_comp_ref_genomes_daa + '.daa', hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.items():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        fout = open(reference_out, 'w')
        fout.write('Scaffold id\tSubject scaffold ids\tSubject genome ids')
        fout.write('\tGenome id\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            aln_len = []
            perc_iden = []
            evalues = []  # renamed to avoid shadowing the evalue parameter
            bitscore = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_len.append(hit.aln_length)
                perc_iden.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscore.append(hit.bitscore)

                subject_id, subject_bin_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_id[0:subject_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            subject_scaffold_id_str = []
            for subject_id, num_hits in subject_scaffold_ids.items():
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            subject_bin_id_str = []
            for bin_id, num_hits in subject_bin_ids.items():
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                scaffold_id,
                subject_scaffold_id_str,
                subject_bin_id_str,
                scaffold_stats.print_stats(scaffold_id),
                mean(scaffold_stats.coverage(scaffold_id)),
                num_genes_on_scaffold[scaffold_id],
                len(hits),
                len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                mean(aln_len),
                mean(perc_iden),
                mean(evalues),
                mean(bitscore)))

        fout.close()

        return reference_out
Example #30
    def _run_reciprocal_diamond(self, query_gene_file, target_gene_file,
                                evalue, per_identity, per_aln_len, max_hits,
                                sensitive, high_mem, tmp_dir, output_dir):
        """Perform similarity search of query genes against target genes, and reciprocal hits.

        Parameters
        ----------
        query_gene_file : str
            File with all query proteins.
        target_gene_file : str
            File with all target proteins.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequence.
        sensitive : bool
            Run DIAMOND in sensitive mode.
        high_mem : bool
            Run DIAMOND with a larger block size (uses more memory).
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """

        self.logger.info(
            'Creating DIAMOND database of query proteins (be patient!).')
        diamond = Diamond(self.cpus)
        query_diamond_db = os.path.join(output_dir, 'query_genes')
        diamond.create_db(query_gene_file, query_diamond_db)

        self.logger.info(
            'Creating DIAMOND database of target proteins (be patient!).')
        target_diamond_db = os.path.join(output_dir, 'target_genes')
        diamond.create_db(target_gene_file, target_diamond_db)

        # blast query genes against target proteins
        self.logger.info(
            'Performing similarity search between query and target proteins (be patient!).'
        )

        if tmp_dir:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', delete=False)
        tmp_query_hits_table.close()

        query_hits_daa_file = os.path.join(output_dir, 'query_hits')

        if high_mem:
            diamond.blastp(query_gene_file,
                           target_diamond_db,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_hits,
                           sensitive,
                           tmp_query_hits_table.name,
                           'standard',
                           tmp_dir,
                           chunk_size=1,
                           block_size=8)
        else:
            diamond.blastp(query_gene_file, target_diamond_db, evalue,
                           per_identity, per_aln_len, max_hits, sensitive,
                           tmp_query_hits_table.name, 'standard', tmp_dir)

        # get target genes hit by one or more query proteins
        self.logger.info(
            'Creating file with target proteins with similarity to query proteins.'
        )
        target_hit = set()
        for line in open(tmp_query_hits_table.name):
            line_split = line.split('\t')
            target_hit.add(line_split[1])

        target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
        fout = open(target_genes_hits, 'w')
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')
        fout.close()

        self.logger.info(
            'Identified %d target proteins to be used in reciprocal search.' %
            len(target_hit))

        # perform reciprocal blast
        self.logger.info(
            'Performing reciprocal similarity search between target and query proteins (be patient!).'
        )

        if tmp_dir:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(
                prefix='comparem_hits_', delete=False)
        tmp_target_hits_table.close()

        if high_mem:
            diamond.blastp(target_genes_hits,
                           query_diamond_db,
                           evalue,
                           per_identity,
                           per_aln_len,
                           max_hits,
                           sensitive,
                           tmp_target_hits_table.name,
                           'standard',
                           tmp_dir,
                           chunk_size=1,
                           block_size=8)
        else:
            diamond.blastp(target_genes_hits, query_diamond_db, evalue,
                           per_identity, per_aln_len, max_hits, sensitive,
                           tmp_target_hits_table.name, 'standard', tmp_dir)

        # combine hit tables and sort
        os.system('cat %s >> %s' %
                  (tmp_target_hits_table.name, tmp_query_hits_table.name))
        os.remove(tmp_target_hits_table.name)
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
Example #31
    def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        db_file : str
            Database of reference genes.
        taxonomy_file : str
            File containing GreenGenes taxonomy strings for reference genomes.
        evalue : float
            E-value threshold used by blast.
        per_identity : float
            Percent identity threshold used by blast.
        window_size : int
            Size of each fragment.
        step_size : int
            Number of bases to move after each window.
        """

        # parse taxonomy file
        self.logger.info('  Reading taxonomic assignment of reference genomes.')
        taxonomy = Taxonomy().read(taxonomy_file)

        # fragment each genome into fixed-size windows
        self.logger.info('')
        self.logger.info('  Fragmenting sequences in each bin:')
        diamond_output_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(diamond_output_dir)

        fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
        fragment_out = open(fragment_file, 'w')
        contig_id_to_genome_id = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            self.profiles[genome_id] = Profile(genome_id, taxonomy)
            self._fragment_genomes(genome_file,
                                   window_size,
                                   step_size,
                                   self.profiles[genome_id],
                                   fragment_out)

            for seq_id, _seq in seq_io.read_seq(genome_file):
                contig_id_to_genome_id[seq_id] = genome_id

        # run diamond
        self.logger.info('')
        self.logger.info('  Running diamond blastx with %d processes (be patient!)' % self.cpus)

        diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

        self.logger.info('')
        self.logger.info('  Writing taxonomic profile for each genome.')

        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)

        for genome_id, profile in self.profiles.items():
            seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
            profile.write_seq_summary(seq_summary_out)

            genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
            profile.write_genome_profile(genome_profile_out)

        genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
        self._write_genome_summary(genome_summary_out)

        # create Krona plot
        krona_profiles = defaultdict(lambda: defaultdict(int))
        for genome_id, profile in self.profiles.items():
            seq_assignments = profile.classify_seqs(taxonomy)

            for seq_id, classification in seq_assignments.items():
                taxa = []
                for r in range(0, len(profile.rank_labels)):
                    taxa.append(classification[r][0])

                krona_profiles[genome_id][';'.join(taxa)] += profile.seq_len[seq_id]

        krona = Krona()
        krona_output_file = os.path.join(self.output_dir, 'taxonomic_profiles.krona.html')
        krona.create(krona_profiles, krona_output_file)
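_fragment_genomes is not shown above; the generator below is a minimal sketch of the sliding-window fragmentation it presumably performs given window_size and step_size. The fragment id scheme is an assumption for illustration only.

def fragment_seq(seq_id, seq, window_size, step_size):
    """Sketch: yield (fragment_id, fragment) windows across a sequence."""
    for i in range(0, max(len(seq) - window_size + 1, 1), step_size):
        yield '%s~%d' % (seq_id, i), seq[i:i + window_size]

# list(fragment_seq('contig_1', 'ACGT' * 500, 1000, 500)) yields fragments
# starting at positions 0, 500, and 1000 of the 2000 bp sequence.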
Example #34
    def run(self, genome_files1, genome_files2, seq_file, output_file):
        """Get basic statistics about genomes.

        Parameters
        ----------
        genome_files1 : iterable
            First set of genome files in fasta format.
        genome_files2 : iterable
            Second set of genome files in fasta format.
        seq_file : str
            Scaffolds/contigs binned to create genomes.
        output_file : str
            File to write results to.
        """

        # determine total number of sequences
        self.logger.info('Reading sequences.')

        seq_lens = {}
        total_bases = 0
        num_seqs_over_length = defaultdict(int)
        total_bases_over_length = defaultdict(int)
        lengths_to_check = [1000, 5000, 10000, 20000, 50000]
        for seq_id, seq in seq_io.read_seq(seq_file):
            seq_len = len(seq)
            seq_lens[seq_id] = seq_len
            total_bases += seq_len

            for length in lengths_to_check:
                if seq_len >= length:
                    num_seqs_over_length[length] += 1
                    total_bases_over_length[length] += seq_len

        # determine sequences in each bin
        genome_seqs1 = self._genome_seqs(genome_files1)
        genome_seqs2 = self._genome_seqs(genome_files2)

        # determine bin stats
        genome_stats1, total_uniq_binned_seqs1, total_uniq_binned_bases1, num_repeats1 = self._genome_stats(genome_seqs1, seq_lens)
        genome_stats2, total_uniq_binned_seqs2, total_uniq_binned_bases2, num_repeats2 = self._genome_stats(genome_seqs2, seq_lens)

        # sort bins by size
        genome_stats1 = sorted(genome_stats1.items(), key=lambda x: x[1][1], reverse=True)
        genome_stats2 = sorted(genome_stats2.items(), key=lambda x: x[1][1], reverse=True)

        # report summary results
        self.reporter.info('Total seqs = %d (%.2f Mbp)' % (len(seq_lens), float(total_bases) / 1e6))
        for length in lengths_to_check:
            self.reporter.info('  # seqs >= %d kbp = %d (%.2f Mbp)' % (int(length / 1000),
                                                                        num_seqs_over_length[length],
                                                                        float(total_bases_over_length[length]) / 1e6))

        self.reporter.info('')
        self.reporter.info('Binned seqs statistics:')
        self.reporter.info('  1) # genomes: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
                                % (len(genome_seqs1),
                                   total_uniq_binned_seqs1,
                                   float(total_uniq_binned_seqs1) * 100 / len(seq_lens),
                                   float(total_uniq_binned_bases1) / 1e6,
                                   float(total_uniq_binned_bases1) * 100 / total_bases,
                                   num_repeats1))
        self.reporter.info('  2) # genomes: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
                                % (len(genome_seqs2),
                                   total_uniq_binned_seqs2,
                                   float(total_uniq_binned_seqs2) * 100 / len(seq_lens),
                                   float(total_uniq_binned_bases2) / 1e6,
                                   float(total_uniq_binned_bases2) * 100 / total_bases,
                                   num_repeats2))

        # output report
        fout = open(output_file, 'w')
        for data in genome_stats2:
            fout.write('\t' + data[0])
        fout.write('\tunbinned\t# seqs\t# bases (Mbp)\tBest match\t% bases in common\t% seqs in common\n')

        max_bp_common2 = defaultdict(int)
        max_seqs_common2 = defaultdict(int)
        best_matching_genome2 = {}
        binned_seqs2 = defaultdict(set)
        for data1 in genome_stats1:
            bin_id1 = data1[0]
            fout.write(bin_id1)

            seqs1 = genome_seqs1[bin_id1]

            max_bp_common = 0
            max_seqs_common = 0
            best_matching_genome = 'n/a'
            binned_seqs = set()
            for data2 in genome_stats2:
                bin_id2 = data2[0]
                seqs2 = genome_seqs2[bin_id2]

                seqs_common = seqs1.intersection(seqs2)
                binned_seqs.update(seqs_common)
                num_seqs_common = len(seqs_common)
                fout.write('\t' + str(num_seqs_common))

                bases_common = 0
                for seqId in seqs_common:
                    bases_common += seq_lens[seqId]

                if bases_common > max_bp_common:
                    max_bp_common = bases_common
                    max_seqs_common = num_seqs_common
                    best_matching_genome = bin_id2

                if bases_common > max_bp_common2[bin_id2]:
                    max_bp_common2[bin_id2] = bases_common
                    max_seqs_common2[bin_id2] = num_seqs_common
                    best_matching_genome2[bin_id2] = bin_id1

                binned_seqs2[bin_id2].update(seqs_common)
            fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (len(seqs1) - len(binned_seqs),
                                                             data1[1][0],
                                                             float(data1[1][1]) / 1e6,
                                                             best_matching_genome,
                                                             float(max_bp_common) * 100 / data1[1][1],
                                                             float(max_seqs_common) * 100 / data1[1][0],
                                                             ))

        fout.write('unbinned')
        for data in genome_stats2:
            genome_id = data[0]
            fout.write('\t%d' % (len(genome_seqs2[genome_id]) - len(binned_seqs2[genome_id])))
        fout.write('\n')

        fout.write('# seqs')
        for data in genome_stats2:
            fout.write('\t%d' % data[1][0])
        fout.write('\n')

        fout.write('# bases (Mbp)')
        for data in genome_stats2:
            fout.write('\t%.2f' % (float(data[1][1]) / 1e6))
        fout.write('\n')

        fout.write('Best match')
        for data in genome_stats2:
            binId = data[0]
            fout.write('\t%s' % best_matching_genome2.get(binId, 'n/a'))
        fout.write('\n')

        fout.write('% bases in common')
        for data in genome_stats2:
            binId = data[0]
            fout.write('\t%.2f' % (float(max_bp_common2[binId]) * 100 / data[1][1]))
        fout.write('\n')

        fout.write('% seqs in common')
        for data in genome_stats2:
            binId = data[0]
            fout.write('\t%.2f' % (float(max_seqs_common2[binId]) * 100 / data[1][0]))
        fout.write('\n')

        fout.close()
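The core of the bin comparison above is a per-pair set intersection weighted by sequence length. A minimal standalone sketch, assuming bins are sets of sequence ids and seq_lens maps ids to lengths:

def bases_in_common(seqs1, seqs2, seq_lens):
    """Sketch: shared sequence count and shared bases between two bins."""
    common = seqs1 & seqs2
    return len(common), sum(seq_lens[sid] for sid in common)

seq_lens = {'s1': 5000, 's2': 12000, 's3': 800}
num_common, bp_common = bases_in_common({'s1', 's2'}, {'s2', 's3'}, seq_lens)
# num_common == 1, bp_common == 12000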
Example #35
    def run(self, msa_file, tree_program, prot_model, skip_rooting,
            output_dir):
        """Infer tree.

        Parameters
        ----------
        msa_file : str
          Multiple sequence alignment in fasta format.
        tree_program : str
          Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
          Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        skip_rooting : boolean
          Do not midpoint root the inferred tree.
        output_dir : str
          Directory to store results.
        """

        num_seqs = sum([1 for _, _ in seq_io.read_seq(msa_file)])
        if num_seqs <= 2:
            self.logger.error(
                'Insufficient number of sequences in MSA to infer tree.')
            raise SystemExit('Tree inference failed.')

        output_file = ntpath.basename(msa_file)
        prefix = output_file[0:output_file.rfind('.')]
        suffix = output_file[output_file.rfind('.') + 1:]

        if tree_program == 'fasttree':
            self.logger.info(
                'Inferring gene tree with FastTree using %s+GAMMA.' %
                prot_model)
            fasttree = FastTree(multithreaded=(self.cpus > 1))

            tree_unrooted_output = os.path.join(output_dir,
                                                prefix + '.unrooted.tree')
            tree_log = os.path.join(output_dir, prefix + '.tree.log')
            tree_output_log = os.path.join(output_dir, 'fasttree.log')
            fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                         tree_log, tree_output_log)
        elif tree_program == 'raxml':
            self.logger.info(
                'Inferring gene tree with RAxML using PROTGAMMA%s.' %
                prot_model)

            # create phylip MSA file
            phylip_msa_file = msa_file.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
            os.system(cmd)

            # run RAxML
            raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
            tree_output_log = os.path.join(output_dir, 'raxml.log')

            raxml = RAxML(self.cpus)
            tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                             raxml_dir)

        # root tree at midpoint
        if not skip_rooting:
            seqs = seq_io.read(msa_file)
            if len(seqs) > 2:  # guaranteed by the num_seqs check above
                self.logger.info('Rooting tree at midpoint.')
                tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                                   schema='newick',
                                                   rooting="force-rooted",
                                                   preserve_underscores=True)
                tree.reroot_at_midpoint(update_bipartitions=False)

            tree_output = os.path.join(output_dir, prefix + '.rooted.tree')
            tree.write_to_path(tree_output,
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)
        else:
            tree_output = tree_unrooted_output

        return tree_output
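As a standalone illustration of the rooting step, the sketch below midpoint-roots a toy Newick tree with dendropy, using the same reader options as get_from_path above; it assumes dendropy is installed.

import dendropy

newick = '(A:1,(B:1,C:4):1);'
tree = dendropy.Tree.get(data=newick, schema='newick',
                         rooting='force-rooted',
                         preserve_underscores=True)
tree.reroot_at_midpoint(update_bipartitions=False)
print(tree.as_string(schema='newick', suppress_rooting=True))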
Example #36
    def run(self, scaffold_file, genome_files, tetra_file, coverage_file,
            output_file):
        """Calculate statistics for scaffolds.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds.
        genome_files : list of str
            Fasta files with binned scaffolds.
        tetra_file : str
            Tetranucleotide signatures for scaffolds.
        coverage_file : str
            Coverage profiles for scaffolds
        output_file : str
            Output file for scaffolds statistics.
        """

        tetra = Tetranucleotide(self.cpus)
        signatures = tetra.read(tetra_file)

        cov_profiles = None
        if coverage_file:
            coverage = Coverage(self.cpus)
            cov_profiles, _ = coverage.read(coverage_file)

        # determine bin assignment for each scaffold
        self.logger.info('Determining scaffold statistics.')

        scaffold_id_genome_id = {}
        for gf in genome_files:
            genome_id = remove_extension(gf)
            for scaffold_id, _seq in seq_io.read_seq(gf):
                scaffold_id_genome_id[scaffold_id] = genome_id

        # write out scaffold statistics
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

        if cov_profiles:
            first_key = list(cov_profiles.keys())[0]
            bam_ids = sorted(cov_profiles[first_key].keys())
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)

        for kmer in tetra.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')

        for scaffold_id, seq in seq_io.read_seq(scaffold_file):
            fout.write(scaffold_id)
            fout.write('\t' +
                       scaffold_id_genome_id.get(scaffold_id, self.unbinned))
            fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
            fout.write('\t%d' % len(seq))

            if cov_profiles:
                for bam_id in bam_ids:
                    fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

            fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
            fout.write('\n')

        fout.close()
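seq_tk.gc is an external helper; below is a minimal equivalent of the GC column, counting G and C over unambiguous bases (an assumption about how seq_tk treats ambiguity codes):

def gc(seq):
    """Sketch: fraction of G+C among A, C, G, T bases."""
    s = seq.upper()
    gc_count = s.count('G') + s.count('C')
    total = gc_count + s.count('A') + s.count('T')
    return float(gc_count) / total if total else 0.0

# gc('ACGT') == 0.5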
Example #37
    def _run_reciprocal_diamond(self, query_gene_file,
                                        target_gene_file,
                                        evalue, 
                                        per_identity, 
                                        per_aln_len,
                                        max_hits,
                                        sensitive,
                                        high_mem,
                                        tmp_dir,
                                        output_dir):
        """Perform similarity search of query genes against target genes, and reciprocal hits.

        Parameters
        ----------
        query_gene_file : str
            File with all query proteins.
        target_gene_file : str
            File with all target proteins.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequence.
        sensitive : boolean
            Run DIAMOND in sensitive mode.
        high_mem : boolean
            Run DIAMOND with a large block size (faster, but uses more memory).
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """
        
        self.logger.info('Creating DIAMOND database of query proteins (be patient!).')
        diamond = Diamond(self.cpus)
        query_diamond_db = os.path.join(output_dir, 'query_genes')
        diamond.make_database(query_gene_file, query_diamond_db)
        
        self.logger.info('Creating DIAMOND database of target proteins (be patient!).')
        target_diamond_db = os.path.join(output_dir, 'target_genes')
        diamond.make_database(target_gene_file, target_diamond_db)

        # blast query genes against target proteins
        self.logger.info('Performing similarity search between query and target proteins (be patient!).')
        
        if tmp_dir:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_query_hits_table.close()
        
        query_hits_daa_file = os.path.join(output_dir, 'query_hits')
        
        if high_mem:
            diamond.blastp(query_gene_file, 
                            target_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_query_hits_table.name, 
                            'standard', 
                            tmp_dir, 
                            chunk_size=1, 
                            block_size=8)
        else:
            diamond.blastp(query_gene_file, 
                            target_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_query_hits_table.name, 
                            'standard', 
                            tmp_dir)
                
        # get target genes hit by one or more query proteins
        self.logger.info('Creating file of target proteins with similarity to query proteins.')
        target_hit = set()
        for line in open(tmp_query_hits_table.name):
            line_split = line.split('\t')
            target_hit.add(line_split[1])

        target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
        fout = open(target_genes_hits, 'w')
        for seq_id, seq in seq_io.read_seq(target_gene_file):
            if seq_id in target_hit:
                fout.write('>' + seq_id + '\n')
                fout.write(seq + '\n')
        fout.close()
        
        self.logger.info('Identified %d target proteins to be used in reciprocal search.' % len(target_hit))
        
        # perform reciprocal blast
        self.logger.info('Performing reciprocal similarity search between target and query proteins (be patient!).')

        if tmp_dir:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
        else:
            tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
        tmp_target_hits_table.close()
        
        if high_mem:
            diamond.blastp(target_genes_hits, 
                            query_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_target_hits_table.name, 
                            'standard', 
                            tmp_dir, 
                            chunk_size=1, 
                            block_size=8)
        else:
            diamond.blastp(target_genes_hits, 
                            query_diamond_db, 
                            evalue, 
                            per_identity, 
                            per_aln_len, 
                            max_hits,
                            sensitive,
                            tmp_target_hits_table.name, 
                            'standard', 
                            tmp_dir)
                
        # combine hit tables and sort
        os.system('cat %s >> %s' % (tmp_target_hits_table.name, tmp_query_hits_table.name))
        os.remove(tmp_target_hits_table.name)
        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
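The combine-and-sort step above shells out to cat; a portable pure-Python sketch of the same step, assuming _sort_hit_table orders rows by the query id in the first column (an assumption, since that helper is not shown):

import shutil

def combine_and_sort_hits(table_a, table_b, output_file):
    """Sketch: append table_b onto table_a, then write rows sorted by query id."""
    with open(table_a, 'ab') as fout, open(table_b, 'rb') as fin:
        shutil.copyfileobj(fin, fout)

    with open(table_a) as f:
        rows = f.readlines()
    rows.sort(key=lambda r: r.split('\t')[0])

    with open(output_file, 'w') as fout:
        fout.writelines(rows)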
Example #38
    def run(self, gene_dirs, min_per_gene, min_per_bps, tree_program,
            prot_model, split_chars, output_dir):
        """Infer concatenated gene tree.

        Parameters
        ----------
        gene_dirs : list
            GeneTreeTk output directories with information for individual genes.
        min_per_gene : float
            Minimum percentage of genes required to retain taxa.
        min_per_bps : float
            Minimum percentage of base pairs required to retain taxa.
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        output_dir : str
            Directory to store results.
        """

        # read MSA files
        concat = defaultdict(lambda: defaultdict(list))
        msa_length = 0
        gene_lengths = {}
        for gene_dir in gene_dirs:
            homologs = os.path.join(gene_dir, 'homologs.trimmed.aligned.faa')

            for seq_id, seq in seq_io.read_seq(homologs):
                taxon_id, gene_id = self._split_ids(seq_id, split_chars)
                if not taxon_id:
                    self.logger.error('Failed to split identifier: %s' %
                                      seq_id)
                    sys.exit(-1)

                concat[taxon_id][gene_dir].append(seq)

            # all sequences in an alignment have the same length, so the
            # last sequence read gives the alignment length for this gene
            msa_length += len(seq)
            gene_lengths[gene_dir] = len(seq)

        # filter taxon
        mc_filter = set()
        min_per_gene_filter = set()
        min_per_bps_filter = set()
        for taxon_id in concat:
            # check if multiple copy
            missing = 0
            taxon_msa_len = 0
            for gene_id in gene_dirs:
                if gene_id not in concat[taxon_id]:
                    missing += 1
                    continue

                if len(concat[taxon_id][gene_id]) > 1:
                    mc_filter.add(taxon_id)
                    break

                taxon_msa_len += len(concat[taxon_id][gene_id][0])

            if taxon_id not in mc_filter:
                if missing > len(gene_dirs) * (1.0 -
                                               float(min_per_gene) / 100.0):
                    min_per_gene_filter.add(taxon_id)
                elif taxon_msa_len < msa_length * float(min_per_bps) / 100.0:
                    min_per_bps_filter.add(taxon_id)

        min_req_genes = math.ceil(len(gene_dirs) * float(min_per_gene) / 100.0)

        filtered_taxa = mc_filter.union(min_per_gene_filter).union(
            min_per_bps_filter)
        remaining_taxa = set(concat) - filtered_taxa
        self.logger.info('No. genes: %d' % len(gene_dirs))
        self.logger.info('No. taxa across all genes: %d' % len(concat))
        self.logger.info('Total filtered taxa: %d' % len(filtered_taxa))
        self.logger.info('  Due to multi-copy genes: %d' % len(mc_filter))
        self.logger.info('  Due to having <%d of the genes: %d' %
                         (min_req_genes, len(min_per_gene_filter)))
        self.logger.info('  Due to an insufficient number of base pairs: %d' %
                         len(min_per_bps_filter))
        self.logger.info('Remaining taxa: %d' % len(remaining_taxa))
        self.logger.info('Length of concatenated MSA: %d' % msa_length)

        # create the concatenated multiple sequence alignment
        msa_file = os.path.join(output_dir, 'concatenated.faa')
        fout = open(msa_file, 'w')
        for taxon_id in remaining_taxa:
            msa = ''
            for gene_id in gene_dirs:
                if gene_id not in concat[taxon_id]:
                    msa += '-' * gene_lengths[gene_id]
                else:
                    msa += concat[taxon_id][gene_id][0]

            fout.write('>%s\n' % taxon_id)
            fout.write('%s\n' % msa)
        fout.close()

        # read all taxonomy files
        # (assumes taxonomy is the same for taxa across all genes)
        taxonomy = {}
        for gene_id in gene_dirs:
            taxonomy_file = os.path.join(gene_id, 'taxonomy.tsv')
            t = Taxonomy().read(taxonomy_file)
            for label, taxa_str in t.items():
                taxon_id, _gene_id = self._split_ids(label, split_chars)
                taxonomy[taxon_id] = taxa_str

        # create taxonomy file for retained taxa
        self.logger.info('Creating taxonomy file for retained taxa.')
        output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        fout = open(output_taxonomy_file, 'w')
        for taxon_id in remaining_taxa:
            if taxon_id in taxonomy:  # query genomes will generally be missing
                fout.write('%s\t%s\n' %
                           (taxon_id, ';'.join(taxonomy[taxon_id])))
        fout.close()

        # infer tree
        if tree_program == 'fasttree':
            self.logger.info(
                'Inferring gene tree with FastTree using %s+GAMMA.' %
                prot_model)
            fasttree = FastTree(multithreaded=(self.cpus > 1))

            tree_unrooted_output = os.path.join(output_dir,
                                                'concatenated.unrooted.tree')
            tree_log = os.path.join(output_dir, 'concatenated.tree.log')
            tree_output_log = os.path.join(output_dir, 'fasttree.log')
            fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                         tree_log, tree_output_log)
        elif tree_program == 'raxml':
            self.logger.info(
                'Inferring gene tree with RAxML using PROTGAMMA%s.' %
                prot_model)

            # create phylip MSA file
            phylip_msa_file = msa_file.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
            os.system(cmd)

            # run RAxML
            raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
            tree_output_log = os.path.join(output_dir, 'raxml.log')

            raxml = RAxML(self.cpus)
            tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                             raxml_dir)

        # root tree at midpoint
        self.logger.info('Rooting tree at midpoint.')
        tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)
        if len(remaining_taxa) > 2:
            tree.reroot_at_midpoint(update_bipartitions=False)
        tree_output = os.path.join(output_dir, 'concatenated.rooted.tree')
        tree.write_to_path(tree_output,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        # create tax2tree consensus map and decorate tree
        t2t_tree = os.path.join(output_dir, 'concatenated.tax2tree.tree')
        cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                                  tree_output, t2t_tree)
        os.system(cmd)

        # setup metadata for ARB file
        src_dir = os.path.dirname(os.path.realpath(__file__))
        version_file = open(os.path.join(src_dir, 'VERSION'))

        metadata = {}
        metadata['genetreetk_version'] = version_file.read().strip()

        metadata['genetreetk_tree_program'] = tree_program
        metadata['genetreetk_tree_prot_model'] = prot_model

        # create ARB metadata file
        self.logger.info('Creating ARB metadata file.')
        arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
        self.create_arb_metadata(msa_file, taxonomy, metadata,
                                 arb_metadata_file)
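The gap-padding loop above is the heart of the concatenation; a standalone sketch with toy data, assuming single-copy genes per taxon:

gene_lengths = {'geneA': 4, 'geneB': 3}
alignments = {
    't1': {'geneA': 'MKLV', 'geneB': 'MPA'},
    't2': {'geneA': 'MKIV'},  # geneB missing, so it is gap-filled
}

for taxon_id, genes in alignments.items():
    msa = ''.join(genes.get(g, '-' * gene_lengths[g])
                  for g in sorted(gene_lengths))
    print('>%s\n%s' % (taxon_id, msa))
# t1 -> MKLVMPA, t2 -> MKIV---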
Example #39
    def run(self, genome_files1, genome_files2, seq_file, output_file):
        """Get basic statistics about genomes.

        Parameters
        ----------
        genome_files1 : iterable
            First set of genome files in fasta format.
        genome_files2 : iterable
            Second set of genome files in fasta format.
        seq_file : str
            Scaffolds/contigs binned to create genomes.
        output_file : str
            File to write results to.
        """

        # determine total number of sequences
        self.logger.info('')
        self.logger.info('  Reading sequences.')

        seq_lens = {}
        total_bases = 0
        num_seqs_over_length = defaultdict(int)
        total_bases_over_length = defaultdict(int)
        lengths_to_check = [1000, 5000, 10000, 20000, 50000]
        for seq_id, seq in seq_io.read_seq(seq_file):
            seq_len = len(seq)
            seq_lens[seq_id] = seq_len
            total_bases += seq_len

            for length in lengths_to_check:
                if seq_len >= length:
                    num_seqs_over_length[length] += 1
                    total_bases_over_length[length] += seq_len

        # determine sequences in each bin
        genome_seqs1 = self._genome_seqs(genome_files1)
        genome_seqs2 = self._genome_seqs(genome_files2)

        # determine bin stats
        genome_stats1, total_uniq_binned_seqs1, total_uniq_binned_bases1, num_repeats1 = self._genome_stats(genome_seqs1, seq_lens)
        genome_stats2, total_uniq_binned_seqs2, total_uniq_binned_bases2, num_repeats2 = self._genome_stats(genome_seqs2, seq_lens)

        # sort bins by size
        genome_stats1 = sorted(genome_stats1.items(), key=lambda x: x[1][1], reverse=True)
        genome_stats2 = sorted(genome_stats2.items(), key=lambda x: x[1][1], reverse=True)

        # report summary results
        self.logger.info('    Total seqs = %d (%.2f Mbp)' % (len(seq_lens), float(total_bases) / 1e6))
        for length in lengths_to_check:
            self.logger.info('      # seqs >= %d kbp = %d (%.2f Mbp)' % (int(length / 1000),
                                                                        num_seqs_over_length[length],
                                                                        float(total_bases_over_length[length]) / 1e6))

        self.logger.info('')
        self.logger.info('  Binned seqs statistics:')
        self.logger.info('    1) # genomes: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
                                % (len(genome_seqs1),
                                   total_uniq_binned_seqs1,
                                   float(total_uniq_binned_seqs1) * 100 / len(seq_lens),
                                   float(total_uniq_binned_bases1) / 1e6,
                                   float(total_uniq_binned_bases1) * 100 / total_bases,
                                   num_repeats1))
        self.logger.info('    2) # genomes: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
                                % (len(genome_seqs2),
                                   total_uniq_binned_seqs2,
                                   float(total_uniq_binned_seqs2) * 100 / len(seq_lens),
                                   float(total_uniq_binned_bases2) / 1e6,
                                   float(total_uniq_binned_bases2) * 100 / total_bases,
                                   num_repeats2))

        # output report
        fout = open(output_file, 'w')
        for data in genome_stats2:
            fout.write('\t' + data[0])
        fout.write('\tunbinned\t# seqs\t# bases (Mbp)\tBest match\t% bases in common\t% seqs in common\n')

        max_bp_common2 = defaultdict(int)
        max_seqs_common2 = defaultdict(int)
        best_matching_genome2 = {}
        binned_seqs2 = defaultdict(set)
        for data1 in genome_stats1:
            bin_id1 = data1[0]
            fout.write(bin_id1)

            seqs1 = genome_seqs1[bin_id1]

            max_bp_common = 0
            max_seqs_common = 0
            best_matching_genome = 'n/a'
            binned_seqs = set()
            for data2 in genome_stats2:
                bin_id2 = data2[0]
                seqs2 = genome_seqs2[bin_id2]

                seqs_common = seqs1.intersection(seqs2)
                binned_seqs.update(seqs_common)
                num_seqs_common = len(seqs_common)
                fout.write('\t' + str(num_seqs_common))

                bases_common = 0
                for seqId in seqs_common:
                    bases_common += seq_lens[seqId]

                if bases_common > max_bp_common:
                    max_bp_common = bases_common
                    max_seqs_common = num_seqs_common
                    best_matching_genome = bin_id2

                if bases_common > max_bp_common2[bin_id2]:
                    max_bp_common2[bin_id2] = bases_common
                    max_seqs_common2[bin_id2] = num_seqs_common
                    best_matching_genome2[bin_id2] = bin_id1

                binned_seqs2[bin_id2].update(seqs_common)
            fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (len(seqs1) - len(binned_seqs),
                                                             data1[1][0],
                                                             float(data1[1][1]) / 1e6,
                                                             best_matching_genome,
                                                             float(max_bp_common) * 100 / data1[1][1],
                                                             float(max_seqs_common) * 100 / data1[1][0],
                                                             ))

        fout.write('unbinned')
        for data in genome_stats2:
            genome_id = data[0]
            fout.write('\t%d' % (len(genome_seqs2[genome_id]) - len(binned_seqs2[genome_id])))
        fout.write('\n')

        fout.write('# seqs')
        for data in genome_stats2:
            fout.write('\t%d' % data[1][0])
        fout.write('\n')

        fout.write('# bases (Mbp)')
        for data in genome_stats2:
            fout.write('\t%.2f' % (float(data[1][1]) / 1e6))
        fout.write('\n')

        fout.write('Best match')
        for data in genome_stats2:
            binId = data[0]
            fout.write('\t%s' % best_matching_genome2.get(binId, 'n/a'))
        fout.write('\n')

        fout.write('% bases in common')
        for data in genome_stats2:
            binId = data[0]
            fout.write('\t%.2f' % (float(max_bp_common2[binId]) * 100 / data[1][1]))
        fout.write('\n')

        fout.write('% seqs in common')
        for data in genome_stats2:
            binId = data[0]
            fout.write('\t%.2f' % (float(max_seqs_common2[binId]) * 100 / data[1][0]))
        fout.write('\n')

        fout.close()
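The length-threshold tally at the top of this variant can be exercised standalone:

from collections import defaultdict

seq_lens = {'s1': 1200, 's2': 52000, 's3': 700}
lengths_to_check = [1000, 5000, 10000, 20000, 50000]

num_seqs_over = defaultdict(int)
bases_over = defaultdict(int)
for seq_len in seq_lens.values():
    for length in lengths_to_check:
        if seq_len >= length:
            num_seqs_over[length] += 1
            bases_over[length] += seq_len
# num_seqs_over[1000] == 2, bases_over[50000] == 52000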
Example #40
    def create_arb_metadata(self, homologs, msa_output, taxonomy, metadata,
                            gene_precontext, gene_postcontext, output_file):
        """Create metadata file suitable for import into ARB.

        Parameters
        ----------
        homologs : d[seq_id] -> namedtuple of BlastHit information
            BLAST results for identified homologs.
        msa_output : str
            Fasta file with aligned homologs.
        taxonomy : d[genome_id] -> list of taxa
            Taxonomic information for genomes.
        metadata : d[key] -> string
            Additional metadata to write to ARB file.
        gene_precontext : d[seq_id] -> list of annotations for pre-context genes
            Annotation for genes preceding a gene.
        gene_postcontext: d[seq_id] -> list of annotations for post-context genes
            Annotation for genes following a gene.
        output_file : str
            File to write metadata information.
        """

        arb_metadata_list = []
        for seq_id, seq, annotation in seq_io.read_seq(msa_output,
                                                       keep_annotation=True):
            if '~' in seq_id:
                genome_id, scaffold_gene_id = seq_id.split('~')
            else:
                scaffold_gene_id = seq_id
                genome_id = ''

            arb_metadata = {}
            arb_metadata['db_name'] = seq_id
            arb_metadata['genome_id'] = genome_id
            arb_metadata['scaffold_id'] = scaffold_gene_id[0:scaffold_gene_id.rfind('_')]
            arb_metadata['scaffold_gene_id'] = scaffold_gene_id
            arb_metadata['gtdb_tax_string'] = ';'.join(
                taxonomy.get(genome_id, ''))
            arb_metadata['aligned_seq'] = seq

            for k, v in metadata.items():
                arb_metadata[k] = v

            arb_metadata['gene_precontext'] = ' -> '.join(
                gene_precontext.get(seq_id, []))
            arb_metadata['gene_postcontext'] = ' <- '.join(
                gene_postcontext.get(seq_id, []))

            hit_info = homologs.get(seq_id, None)
            if hit_info:
                arb_metadata['blast_evalue'] = '%.1g' % hit_info.evalue
                arb_metadata['blast_bitscore'] = '%.1f' % hit_info.bitscore
                arb_metadata[
                    'blast_perc_identity'] = '%.1f' % hit_info.perc_identity
                arb_metadata[
                    'blast_subject_perc_alignment_len'] = '%.1f' % hit_info.subject_perc_aln_len
                arb_metadata[
                    'blast_query_perc_alignment_len'] = '%.1f' % hit_info.query_perc_aln_len
                arb_metadata['blast_query_id'] = hit_info.query_id

            if annotation:
                annotation_split = annotation.split('] [')
                if len(annotation_split) == 3:
                    # assume format is [gtdb_taxonomy] [NCBI organism name] [annotation]
                    gtdb_taxonomy, organism_name, gene_annotation = annotation_split
                    gtdb_taxonomy = gtdb_taxonomy.replace('[', '')
                    gene_annotation = gene_annotation.replace(']', '')
                else:
                    # no idea what the format is, so just save the annotation
                    gene_annotation = annotation
                    organism_name = ''
                    gtdb_taxonomy = ''

                arb_metadata['gene_annotation'] = gene_annotation
                arb_metadata['organism'] = organism_name
                arb_metadata['full_name'] = organism_name

            arb_metadata_list.append(arb_metadata)

        fout = open(output_file, 'w')
        arb_parser = ArbParser()
        arb_parser.write(arb_metadata_list, fout)
        fout.close()
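The '] [' split above assumes a '[gtdb_taxonomy] [organism] [annotation]' header format; a standalone sketch of that parse, with the same fallback used above:

def parse_annotation(annotation):
    """Sketch: split '[taxonomy] [organism] [annotation]', else keep as-is."""
    parts = annotation.split('] [')
    if len(parts) == 3:
        taxonomy, organism, gene_annotation = parts
        return taxonomy.replace('[', ''), organism, gene_annotation.replace(']', '')
    return '', '', annotation

# parse_annotation('[d__Bacteria] [Escherichia coli] [DNA gyrase subunit A]')
# -> ('d__Bacteria', 'Escherichia coli', 'DNA gyrase subunit A')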
Example #41
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold of valid hits.
        per_identity : float
            Percent identity threshold of valid hits [0,100].
        per_aln_len : float
            Percent query coverage of valid hits [0, 100].
        """

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('Creating DIAMOND database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.create_db(ref_gene_file, ref_diamond_db)

        self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

        self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.items():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        fout = open(reference_out, 'w')
        fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
        fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            aln_len = []
            perc_iden = []
            evalues = []  # renamed to avoid shadowing the evalue parameter
            bitscore = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_len.append(hit.aln_length)
                perc_iden.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscore.append(hit.bitscore)

                subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            sorted_subject_bin_ids = sorted(subject_bin_ids.items(), 
                                                key=operator.itemgetter(1),
                                                reverse=True)
            subject_bin_id_str = []
            for bin_id, num_hits in sorted_subject_bin_ids:
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(), 
                                                    key=operator.itemgetter(1),
                                                    reverse=True)
            subject_scaffold_id_str = []
            for subject_id, num_hits in sorted_subject_scaffold_ids:
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                                                                        scaffold_id,
                                                                        subject_bin_id_str,
                                                                        subject_scaffold_id_str,
                                                                        scaffold_stats.print_stats(scaffold_id),
                                                                        mean(scaffold_stats.coverage(scaffold_id)),
                                                                        num_genes_on_scaffold[scaffold_id],
                                                                        len(hits),
                                                                        len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                                                                        mean(aln_len),
                                                                        mean(perc_iden),
                                                                        mean(evalues),
                                                                        mean(bitscore)))

        fout.close()

        return reference_out
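Gene ids are mapped back to scaffolds above purely by stripping suffixes; a standalone sketch of that convention, assuming Prodigal-style '<scaffold>_<gene number>' ids:

from collections import defaultdict

def scaffold_of(gene_id):
    """Sketch: strip the trailing '_<gene number>' from a gene id."""
    return gene_id[0:gene_id.rfind('_')]

hits_to_scaffold = defaultdict(list)
for gene_id, hit in [('scf7_1', 'hitA'), ('scf7_2', 'hitB'), ('scf9_1', 'hitC')]:
    hits_to_scaffold[scaffold_of(gene_id)].append(hit)
# hits_to_scaffold == {'scf7': ['hitA', 'hitB'], 'scf9': ['hitC']}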
Example #42
    def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file,
            gtdb_path_file, gtdb_metadata_file, output_dir):
        """Create FASTA files with all 16S and 23S rRNA sequences from GTDB genomes."""

        # get User ID to UBA translation
        print('Reading GTDB metadata to translate User IDs to UBA IDs.')
        user_id_to_uba = {}
        with open(gtdb_metadata_file) as f:
            f.readline()

            for line in f:
                line_split = line.strip().split('\t')
                gid = line_split[0]
                org_name = line_split[1]
                if '(UBA' in org_name:
                    uba_id = org_name.split('(')[-1].replace(')', '')
                    user_id_to_uba[gid] = uba_id

        # read GTDB taxonomy
        print('Reading GTDB taxonomy.')
        gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
        gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
        gtdb_taxonomy = gtdb_bac_taxonomy.copy()
        gtdb_taxonomy.update(gtdb_ar_taxonomy)

        print('Identified %d bacterial genomes to process.' %
              len(gtdb_bac_taxonomy))
        print('Identified %d archaeal genomes to process.' %
              len(gtdb_ar_taxonomy))
        print('Identified %d genomes to process.' % len(gtdb_taxonomy))

        # read genome paths
        print('Reading path to genomes.')
        genome_paths = {}
        for line in open(gtdb_path_file):
            gid, gid_path = line.strip().split('\t')
            if gid in user_id_to_uba:
                gid = user_id_to_uba[gid]

            genome_paths[gid] = gid_path

        # sanity check data
        missing_paths = set(gtdb_taxonomy.keys()) - set(genome_paths.keys())
        if len(missing_paths) > 0:
            print(
                '[WARNING] There are %d genomes in the taxonomy file without a specified genome path.'
                % len(missing_paths))

        # create FASTA file with 16S and 23S rRNA sequence files
        print('Parsing 16S and 23S rRNA sequence files.')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        fout_16S = open(os.path.join(output_dir, 'ssu.fna'), 'w')
        fout_23S = open(os.path.join(output_dir, 'lsu.fna'), 'w')
        missing_ssu = 0
        missing_lsu = 0
        for i, gid in enumerate(gtdb_taxonomy):
            if i % 1000 == 0:
                print('Processed %d genomes.' % i)

            if gid not in genome_paths:
                print(
                    '[WARNING] Genome %s does not have a specified genome path.'
                    % gid)
                continue

            genome_path = genome_paths[gid]

            ssu_file = os.path.join(genome_path, 'rna_silva', 'ssu.fna')
            if not os.path.exists(ssu_file):
                missing_ssu += 1
                continue

            ssu_info_file = os.path.join(genome_path, 'rna_silva',
                                         'ssu.hmm_summary.tsv')
            ssu_info = {}
            with open(ssu_info_file) as f:
                header = f.readline().strip().split('\t')
                contig_len_index = header.index('Sequence length')

                for line in f:
                    line_split = line.strip().split('\t')

                    gene_id = line_split[0]
                    contig_length = int(line_split[contig_len_index])
                    ssu_info[gene_id] = contig_length

            for ssu_index, (seq_id,
                            seq) in enumerate(seq_io.read_seq(ssu_file)):
                fout_16S.write('>%s~%s [ssu=%d bp] [contig=%d bp]\n' %
                               (gid, seq_id, len(seq), ssu_info[seq_id]))
                fout_16S.write('%s\n' % seq)

            lsu_file = os.path.join(genome_path, 'rna_silva', 'lsu_23S.fna')
            if not os.path.exists(lsu_file):
                missing_lsu += 1
                continue

            lsu_info_file = os.path.join(genome_path, 'rna_silva',
                                         'lsu_23S.hmm_summary.tsv')
            lsu_info = {}
            with open(lsu_info_file) as f:
                header = f.readline().strip().split('\t')
                contig_len_index = header.index('Sequence length')

                for line in f:
                    line_split = line.strip().split('\t')

                    gene_id = line_split[0]
                    contig_length = int(line_split[contig_len_index])
                    lsu_info[gene_id] = contig_length

            for lsu_index, (seq_id,
                            seq) in enumerate(seq_io.read_seq(lsu_file)):
                fout_23S.write('>%s~%s [lsu=%d bp] [contig=%d bp]\n' %
                               (gid, seq_id, len(seq), lsu_info[seq_id]))
                fout_23S.write('%s\n' % seq)

        fout_16S.close()
        fout_23S.close()

        print(
            'There were %d of %d (%.2f%%) genomes without an identified 16S rRNA gene.'
            % (missing_ssu, len(gtdb_taxonomy),
               missing_ssu * 100.0 / len(gtdb_taxonomy)))

        print(
            'There were %d of %d (%.2f%%) genomes without an identified 23S rRNA gene.'
            % (missing_lsu, len(gtdb_taxonomy),
               missing_lsu * 100.0 / len(gtdb_taxonomy)))
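The hmm_summary parsing pattern above (locate a column by header name rather than by position) recurs for both the 16S and 23S files; a standalone sketch:

def read_int_column(tsv_file, value_col):
    """Sketch: map the first column of a TSV to an integer column found by header name."""
    info = {}
    with open(tsv_file) as f:
        header = f.readline().strip().split('\t')
        value_index = header.index(value_col)
        for line in f:
            fields = line.strip().split('\t')
            info[fields[0]] = int(fields[value_index])
    return info

# e.g. read_int_column('ssu.hmm_summary.tsv', 'Sequence length')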