Esempio n. 1
0
    def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
        """Prefix all gene IDs with genome IDs: <genome_id>~<gene_id>.
        
        Parameters
        ----------
        gene_files : list of str
            Genes in fasta files to modify.
        keep_headers : boolean
            If True, indicates FASTA headers already have the format <genome_id>~<gene_id>.
        output_file : str
            Name of FASTA file to contain modified genes.
        """

        fout = open(output_file, 'w')
        for gf in gene_files:
            genome_id = remove_extension(gf)
            if genome_id.endswith('_genes'):
                genome_id = genome_id[0:genome_id.rfind('_genes')]

            for seq_id, seq, annotation in seq_io.read_fasta_seq(
                    gf, keep_annotation=True):
                if keep_headers:
                    fout.write('>' + seq_id + ' ' + annotation + '\n')
                else:
                    fout.write('>' + genome_id + '~' + seq_id + ' ' +
                               annotation + '\n')
                fout.write(seq + '\n')
        fout.close()
Esempio n. 2
0
    def write_gene_file(self, gene_out, gene_dir, genome_list, taxonomy, genes_to_ignore):
        """Write genes to output stream.

        The gene file of each genome is expected at
        <gene_dir>/<genome_id>.faa. Genomes whose gene file is missing or
        empty are skipped after printing a warning to standard output.

        Parameters
        ----------
        gene_out : stream
            Output stream.
        gene_dir : str
            Directory containing called genes in amino acid space.
        genome_list : iterable
            Genomes to process.
        taxonomy : object
            Not used by this method.
        genes_to_ignore : set
            Genes which should not be written to file.

        Returns
        -------
        int
            Number of genes written to the output stream.
        """

        genes_kept = 0
        for genome_id in genome_list:
            genome_gene_file = os.path.join(gene_dir, genome_id + '.faa')
            if not os.path.exists(genome_gene_file):
                # single-argument call form behaves identically under
                # Python 2 and Python 3
                print('[WARNING] Missing gene file for genome %s.' % genome_gene_file)
                continue

            if os.stat(genome_gene_file).st_size == 0:
                print('[WARNING] Gene file is empty for genome %s.' % genome_gene_file)
                continue

            for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
                if gene_id in genes_to_ignore:
                    continue

                gene_out.write('>' + gene_id + ' ' + annotation + '\n')
                gene_out.write(seq + '\n')
                genes_kept += 1

        return genes_kept
Esempio n. 3
0
 def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
     """Prefix all gene IDs with genome IDs: <genome_id>~<gene_id>.
     
     Parameters
     ----------
     gene_files : list of str
         Genes in fasta files to modify.
     keep_headers : boolean
         If True, indicates FASTA headers already have the format <genome_id>~<gene_id>.
     output_file : str
         Name of FASTA file to contain modified genes.
     """
     
     fout = open(output_file, 'w')
     for gf in gene_files:           
         genome_id = remove_extension(gf)
         if genome_id.endswith('_genes'):
             genome_id = genome_id[0:genome_id.rfind('_genes')]
             
         for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
             if keep_headers:
                 fout.write('>' + seq_id  + ' ' + annotation + '\n')
             else:
                 fout.write('>' + genome_id + '~' + seq_id  + ' ' + annotation + '\n')
             fout.write(seq + '\n')
     fout.close()
Esempio n. 4
0
    def amend_gene_identifies(self, gene_dir, output_dir):
        """Modify gene ids to include source genome id.

        The following format is used:
          <genome_id>~<gene_id>

        Every entry of the gene directory is read as a FASTA file and
        rewritten to <output_dir>/<genome_id>.faa, where the genome id is
        the value remove_extension() returns for the input path. A single
        trailing asterisk on a sequence is removed.

        Parameters
        ----------
        gene_dir : str
            Directory with fasta files containing protein sequences.
        output_dir : str
            Directory to contain modified fasta files.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for f in os.listdir(gene_dir):
            gf = os.path.join(gene_dir, f)
            genome_id = remove_extension(gf)

            aa_file = os.path.join(output_dir, genome_id + '.faa')

            # the context manager closes the file even if reading fails
            with open(aa_file, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                    fout.write('>%s~%s %s\n' % (genome_id, seq_id, annotation))

                    # endswith() is safe for an empty sequence, whereas
                    # indexing seq[-1] would raise IndexError
                    if seq.endswith('*'):
                        seq = seq[:-1]
                    fout.write(seq + '\n')
Esempio n. 5
0
    def amend_gene_identifies(self, gene_dir, output_dir):
        """Modify gene ids to include source genome id.

        The following format is used:
          <gene_id>~<genome_id>

        Every entry of the gene directory is read as a FASTA file and
        rewritten to <output_dir>/<genome_id>.faa, where the genome id is
        the value remove_extension() returns for the input path. A single
        trailing asterisk on a sequence is removed.

        Parameters
        ----------
        gene_dir : str
            Directory with fasta files containing protein sequences.
        output_dir : str
            Directory to contain modified fasta files.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for f in os.listdir(gene_dir):
            gf = os.path.join(gene_dir, f)
            genome_id = remove_extension(gf)

            aa_file = os.path.join(output_dir, genome_id + '.faa')

            # the context manager closes the file even if reading fails
            with open(aa_file, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                    fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')

                    # endswith() is safe for an empty sequence, whereas
                    # indexing seq[-1] would raise IndexError
                    if seq.endswith('*'):
                        seq = seq[:-1]
                    fout.write(seq + '\n')
Esempio n. 6
0
    def img_gene_id_to_scaffold_id(self, genome_dir, genome_id, output_dir):
        """Modify IMG gene ids to format which explicitly gives scaffold names.

        For downstream processing it is often necessary to know which scaffold
        a gene is contained on. IMG uses unique identifiers for genes. As such,
        these are changed to the following format:

        <scaffold_id>_<gene #> <annotation> [IMG Gene ID: <gene id>]

        Gene numbers are assigned per scaffold, in the order in which genes
        with a non-empty attribute column appear in <genome_id>.gff. Genes
        are read from <genome_id>.genes.faa and written to
        <output_dir>/<genome_id>.faa.

        Parameters
        ----------
        genome_dir : str
            Directory with files for genome.
        genome_id : str
            Unique identifier of genome.
        output_dir : str
            Directory to contain modified fasta files.

        Raises
        ------
        KeyError
            If a gene in the FASTA file has no entry in the GFF file.
        """

        # determine source scaffold for each gene
        gene_id_to_scaffold_id = {}
        gene_number = defaultdict(int)
        with open(os.path.join(genome_dir, genome_id + '.gff')) as gff:
            for line in gff:
                # strip the line terminator so it cannot end up in the last
                # column, where it would defeat the empty-attribute check
                # below and could become part of a gene id
                line = line.rstrip('\r\n')
                if not line.strip() or line[0] == '#':
                    continue

                line_split = line.split('\t')
                scaffold_id = line_split[0]
                info = line_split[8]
                if info != '':  # this will be empty for non-protein coding genes
                    gene_id = info.split(';')[0].replace('ID=', '')

                    gene_number[scaffold_id] += 1
                    gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

        # write out gene file with modified identifiers
        genome_gene_file = os.path.abspath(os.path.join(genome_dir, genome_id + '.genes.faa'))

        with open(os.path.join(output_dir, genome_id + '.faa'), 'w') as fout:
            for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
                # remove additional gene id from annotation
                annotation = annotation[annotation.find(' ') + 1:]

                # append IMG gene id for future reference
                annotation += ' [IMG Gene ID: ' + gene_id + ']'

                fout.write('>' + gene_id_to_scaffold_id[gene_id] + ' ' + annotation + '\n')
                fout.write(seq + '\n')
Esempio n. 7
0
    def reformat_gene_id_to_scaffold_id(self, gene_file, gff_file, taxonomy, output_file):
        """Reformat gene ids to format which explicitly gives scaffold names.

        Each FASTA header is written as:

        <scaffold_id>_<gene_#> [taxonomy] [none] [annotation]

        Gene numbers are assigned per scaffold, in the order in which genes
        with a non-empty attribute column appear in the GFF file. The
        taxonomy field is the ';'-joined taxonomy of the genome, or 'none'
        if the genome has no taxonomy entry.

        NOTE(review): an earlier description of this format included a
        '<genome_id>~' prefix and an NCBI organism name. The code writes
        neither; the organism field is always the literal 'none'. Confirm
        which is intended.

        Parameters
        ----------
        gene_file : str
            Gene file for genome.
        gff_file : str
            General feature file (GFF) for genome.
        taxonomy : dict
            d[genome_id] -> list of taxa, where the genome id is the value
            remove_extension() returns for the gene file.
        output_file : str
            File to contain modified gene fasta file.

        Raises
        ------
        KeyError
            If a gene in the FASTA file has no entry in the GFF file.
        """

        # determine source scaffold for each gene
        gene_id_to_scaffold_id = {}
        gene_number = defaultdict(int)
        with open(gff_file) as gff:
            for line in gff:
                if line.startswith('##FASTA'):
                    # start of FASTA section with individual sequences
                    break

                # strip the line terminator so it cannot end up in the last
                # column, where it would defeat the empty-attribute check
                # below and could become part of a gene id
                line = line.rstrip('\r\n')
                if not line.strip() or line[0] == '#':
                    continue

                line_split = line.split('\t')
                scaffold_id = line_split[0]
                info = line_split[8]
                if info != '':  # this will be empty for non-protein coding genes
                    gene_id = info.split(';')[0].replace('ID=', '')

                    gene_number[scaffold_id] += 1
                    gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

        # the genome id and its taxonomy string are the same for every gene
        # in the file, so determine them once
        genome_id = remove_extension(gene_file)
        taxonomy_str = ';'.join(taxonomy.get(genome_id, ['none']))

        # write out gene file with modified identifiers
        with open(output_file, 'w') as fout:
            for gene_id, seq, annotation in seq_io.read_fasta_seq(gene_file, keep_annotation=True):
                fout.write('>%s [%s] [%s] [%s]\n' % (gene_id_to_scaffold_id[gene_id],
                                                        taxonomy_str,
                                                        'none',
                                                        annotation))
                fout.write(seq + '\n')
Esempio n. 8
0
    def write_gene_file(self, gene_out, gene_dir, genome_list, taxonomy, genes_to_ignore):
        """Write genes to output stream.

        The gene file of each genome is expected at
        <gene_dir>/<genome_id>.faa. Genomes whose gene file is missing or
        empty are skipped after printing a warning to standard output.
        Characters outside string.printable are removed from annotations
        and a single leading hyphen is removed from sequences.

        Parameters
        ----------
        gene_out : stream
            Output stream.
        gene_dir : str
            Directory containing called genes in amino acid space.
        genome_list : iterable
            Genomes to process.
        taxonomy : object
            Not used by this method.
        genes_to_ignore : set
            Genes which should not be written to file.

        Returns
        -------
        int
            Number of genes written to the output stream.
        """

        printable = set(string.printable)

        genes_kept = 0
        for genome_id in genome_list:
            genome_gene_file = os.path.join(gene_dir, genome_id + '.faa')
            if not os.path.exists(genome_gene_file):
                # single-argument call form behaves identically under
                # Python 2 and Python 3
                print('[WARNING] Missing gene file for genome %s.' % genome_gene_file)
                continue

            if os.stat(genome_gene_file).st_size == 0:
                print('[WARNING] Gene file is empty for genome %s.' % genome_gene_file)
                continue

            for gene_id, seq, annotation in seq_io.read_fasta_seq(genome_gene_file, keep_annotation=True):
                if gene_id in genes_to_ignore:
                    continue

                # IMG headers sometimes contain non-ascii characters which cause
                # problems with BLAST and DIAMOND so these are explicitly filtered
                # out; join() yields a string on both Python 2 and 3, whereas
                # filter() returns an iterator on Python 3
                annotation = ''.join(ch for ch in annotation if ch in printable)

                # a few IMG genomes contain protein sequences which start with
                # a hyphen; startswith() is also safe for empty sequences
                if seq.startswith('-'):
                    seq = seq[1:]

                gene_out.write('>' + gene_id + ' ' + annotation + '\n')
                gene_out.write(seq + '\n')
                genes_kept += 1

        return genes_kept
Esempio n. 9
0
    def rblast(self, options):
        """Reciprocal blast command.

        Collects the gene files in options.protein_dir that end with
        options.protein_ext, rewrites them under <output_dir>/genes with
        the genome id appended to every gene id (<gene_id>~<genome_id>) and
        any trailing asterisk removed from sequences, then runs a
        reciprocal search with blastp (if options.blastp is set) or
        diamond. Exits the program if no gene files are found.

        Parameters
        ----------
        options : object
            Parsed command-line options. The attributes read here are
            protein_dir, protein_ext, output_dir, blastp, cpus, evalue and,
            for diamond, per_identity.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
        self.logger.info('*******************************************************************************')

        check_dir_exists(options.protein_dir)
        make_sure_path_exists(options.output_dir)

        aa_gene_files = [os.path.join(options.protein_dir, f)
                         for f in os.listdir(options.protein_dir)
                         if f.endswith(options.protein_ext)]

        if not aa_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        # modify gene ids to include genome ids in order to ensure
        # all gene identifiers are unique across the set of genomes,
        # also removes the trailing asterisk used to identify the stop
        # codon
        self.logger.info('')
        self.logger.info('  Appending genome identifiers to all gene identifiers.')
        gene_out_dir = os.path.join(options.output_dir, 'genes')
        make_sure_path_exists(gene_out_dir)
        modified_aa_gene_files = []
        for gf in aa_gene_files:
            genome_id = remove_extension(gf)

            aa_file = os.path.join(gene_out_dir, genome_id + '.faa')

            # the context manager closes the file even if reading fails
            with open(aa_file, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                    fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')

                    # endswith() is safe for an empty sequence, whereas
                    # indexing seq[-1] would raise IndexError
                    if seq.endswith('*'):
                        seq = seq[:-1]
                    fout.write(seq + '\n')

            modified_aa_gene_files.append(aa_file)

        # perform the reciprocal blast with blastp or diamond
        self.logger.info('')
        if options.blastp:
            rblast = ReciprocalBlast(options.cpus)
            rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

            # concatenate all blast tables to mimic output of diamond, all hits
            # for a given genome MUST be in consecutive order to fully mimic
            # the expected results from diamond
            self.logger.info('')
            self.logger.info('  Creating single file with all blast hits (be patient!).')
            blast_files = sorted([f for f in os.listdir(options.output_dir) if f.endswith('.blastp.tsv')])
            hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
            concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
        else:
            rdiamond = ReciprocalDiamond(options.cpus)
            rdiamond.run(modified_aa_gene_files, options.evalue, options.per_identity, options.output_dir)

        self.logger.info('')
        self.logger.info('  Reciprocal blast hits written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
Esempio n. 10
0
    def run(self, query_gene_file, target_gene_file, sorted_hit_table,
            evalue_threshold, per_iden_threshold, per_aln_len_threshold,
            keep_rbhs, output_dir):
        """Calculate amino acid identity (AAI) between pairs of genomes.

        Gene identifiers are expected to have the form
        <genome_id>~<gene_id>; the genome id is taken as the text before
        the first '~'.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str or None
            File with all target genes in FASTA format, or None if performing a reciprocal AAI calculation.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            Evalue threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.

        Returns
        -------
        tuple of (str, str or None), or None
            Path of the AAI summary file and path of the concatenated RBH
            file (None unless keep_rbhs is set). A bare None is returned
            if no genome pairs were identified.
        """

        self.sorted_hit_table = sorted_hit_table
        self.evalue_threshold = evalue_threshold
        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.keep_rbhs = keep_rbhs
        self.output_dir = output_dir

        def tally_genes(gene_file, gene_count):
            """Record gene lengths and per-genome gene counts; return genome ids."""
            genomes = set()
            for seq_id, seq in seq_io.read_fasta_seq(gene_file):
                # a trailing asterisk is not counted towards the gene length;
                # endswith() is safe for an empty sequence, unlike seq[-1]
                gene_len = len(seq)
                if seq.endswith('*'):
                    gene_len -= 1
                self.gene_lengths[seq_id] = gene_len

                genome_id = seq_id[0:seq_id.find('~')]
                gene_count[genome_id] += 1
                genomes.add(genome_id)

            return genomes

        # calculate length of genes and number of genes in each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths = {}
        self.query_gene_count = defaultdict(int)
        query_genomes = tally_genes(query_gene_file, self.query_gene_count)

        self.target_gene_count = defaultdict(int)
        target_genomes = set()
        if target_gene_file:
            target_genomes = tally_genes(target_gene_file, self.target_gene_count)
        else:
            self.target_gene_count = self.query_gene_count

        # get byte offset of hits from each genome
        self.logger.info('Indexing sorted hit table.')
        self.offset_table = self._genome_offsets(self.sorted_hit_table)

        # calculate AAI between each pair of genomes in parallel
        if target_genomes:
            # compare query genomes to target genomes
            self.num_pairs = len(query_genomes) * len(target_genomes)
            self.logger.info(
                'Calculating AAI between %d query and %d target genomes:' %
                (len(query_genomes), len(target_genomes)))
        else:
            # compute pairwise values between query genomes; floor division
            # keeps the pair count an integer on Python 2 and Python 3
            ng = len(query_genomes)
            self.num_pairs = (ng * ng - ng) // 2
            self.logger.info(
                'Calculating AAI between all %d pairs of genomes:' %
                self.num_pairs)

        if self.num_pairs == 0:
            self.logger.warning('No genome pairs identified.')
            return

        # pair each query genome with the genomes it must be compared to:
        # all target genomes, or every later query genome so that each
        # unordered pair is processed exactly once
        genome_id_lists = []
        query_genomes = list(query_genomes)
        target_genomes = list(target_genomes)
        for i, genome_idI in enumerate(query_genomes):
            if target_genomes:
                genome_id_list = target_genomes
            else:
                genome_id_list = query_genomes[i + 1:]

            genome_id_lists.append((genome_idI, genome_id_list))

        self.processed_paired = 0
        parallel = Parallel(self.cpus)

        progress_func = self._progress
        if self.logger.is_silent:
            progress_func = None
        consumer_data = parallel.run(self._producer, self._consumer,
                                     genome_id_lists, progress_func)

        # write results for each genome pair
        self.logger.info('Summarizing AAI results.')
        aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
        with open(aai_summay_file, 'w') as fout:
            fout.write(
                'Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n'
            )

            for data in consumer_data:
                fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

        # concatenate RBH files
        rbh_output_file = None
        if self.keep_rbhs:
            self.logger.info('Concatenating RBH files.')
            rbh_files = [os.path.join(self.output_dir, genome_id + '.rbh.tsv')
                         for genome_id in query_genomes]

            rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
            concatenate_files(rbh_files, rbh_output_file, common_header=True)

            # per-genome files are redundant once concatenated
            for f in rbh_files:
                os.remove(f)

        return aai_summay_file, rbh_output_file
Esempio n. 11
0
    def extract_homologs_and_context(self, homologs, db_file, output_file):
        """Extract homologs sequences from database file, and local gene context.

        This function extracts sequence information for each
        homolog and writes this to file for downstream processing.
        In addition, it determines the local gene context for each
        gene. Specifically, it saves the annotations for the
        3 genes prior to and after a given gene.

        This function assumes the database is sorted according
        to the order genes are identified on each contig.

        A post-context entry is only recorded once 3 further genes have
        been read after a homolog, so homologs followed by fewer than 3
        genes in the database have no post-context entry. If no homologs
        are given, nothing is read and the output file is not created.

        Parameters
        ----------
        homologs : iterable
            Unique identifiers of sequences to extract
        db_file : str
            Fasta file with sequences.
        output_file : str
            File to write homologs.

        Returns
        -------
        dict
            d[seq_id] -> list of annotations for pre-context genes
        dict
            d[seq_id] -> list of annotations for post-context genes
        """

        gene_precontext = {}
        gene_postcontext = {}

        # materialise once so that one-shot iterables (e.g. generators) are
        # supported and the membership tests below are constant time
        if not isinstance(homologs, (set, frozenset)):
            homologs = set(homologs)

        if not homologs:
            return gene_precontext, gene_postcontext

        # sliding window over the last 3 genes read, oldest first
        local_context = [('unknown~unknown_x', None)] * 3

        # d[homolog id] -> number of following genes still to be read
        post_context_counter = {}

        with open(output_file, 'w') as fout:
            for seq_id, seq, annotation in seq_io.read_fasta_seq(
                    db_file, keep_annotation=True):
                if seq_id in homologs:
                    fout.write('>' + seq_id + ' ' + annotation + '\n')
                    fout.write(seq + '\n')

                    gene_precontext[seq_id] = list(local_context)
                    post_context_counter[seq_id] = 3

                # record 3 precontext genes
                local_context[0] = local_context[1]
                local_context[1] = local_context[2]
                local_context[2] = (seq_id, annotation)

                # record 3 postcontext genes; a homolog's counter reaches -1
                # once the window holds exactly the 3 genes that follow it.
                # Only values are changed while iterating, so the dictionary
                # size is stable; finished entries are removed afterwards.
                completed = []
                for homolog_id in post_context_counter:
                    post_context_counter[homolog_id] -= 1
                    if post_context_counter[homolog_id] == -1:
                        gene_postcontext[homolog_id] = list(local_context)
                        completed.append(homolog_id)

                for homolog_id in completed:
                    del post_context_counter[homolog_id]

        # filter gene context to contain only genes on the same scaffold
        gene_precontext = self._filter_gene_context(gene_precontext)
        gene_postcontext = self._filter_gene_context(gene_postcontext)

        return gene_precontext, gene_postcontext
Esempio n. 12
0
    def run(self, query_gene_file,
            target_gene_file,
            sorted_hit_table,
            evalue_threshold,
            per_iden_threshold,
            per_aln_len_threshold,
            keep_rbhs,
            output_dir):
        """Calculate amino acid identity (AAI) between pairs of genomes.

        Gene identifiers are expected to have the form
        <genome_id>~<gene_id>; the genome id is taken as the text before
        the first '~'.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str or None
            File with all target genes in FASTA format, or None if performing a reciprocal AAI calculation.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            Evalue threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.

        Returns
        -------
        tuple of (str, str or None), or None
            Path of the AAI summary file and path of the concatenated RBH
            file (None unless keep_rbhs is set). A bare None is returned
            if no genome pairs were identified.
        """

        self.sorted_hit_table = sorted_hit_table
        self.evalue_threshold = evalue_threshold
        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.keep_rbhs = keep_rbhs
        self.output_dir = output_dir

        def tally_genes(gene_file, gene_count):
            """Record gene lengths and per-genome gene counts; return genome ids."""
            genomes = set()
            for seq_id, seq in seq_io.read_fasta_seq(gene_file):
                # a trailing asterisk is not counted towards the gene length;
                # endswith() is safe for an empty sequence, unlike seq[-1]
                gene_len = len(seq)
                if seq.endswith('*'):
                    gene_len -= 1
                self.gene_lengths[seq_id] = gene_len

                genome_id = seq_id[0:seq_id.find('~')]
                gene_count[genome_id] += 1
                genomes.add(genome_id)

            return genomes

        # calculate length of genes and number of genes in each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths = {}
        self.query_gene_count = defaultdict(int)
        query_genomes = tally_genes(query_gene_file, self.query_gene_count)

        self.target_gene_count = defaultdict(int)
        target_genomes = set()
        if target_gene_file:
            target_genomes = tally_genes(target_gene_file, self.target_gene_count)
        else:
            self.target_gene_count = self.query_gene_count

        # get byte offset of hits from each genome
        self.logger.info('Indexing sorted hit table.')
        self.offset_table = self._genome_offsets(self.sorted_hit_table)

        # calculate AAI between each pair of genomes in parallel
        if target_genomes:
            # compare query genomes to target genomes
            self.num_pairs = len(query_genomes) * len(target_genomes)
            self.logger.info('Calculating AAI between %d query and %d target genomes:' % (len(query_genomes), len(target_genomes)))
        else:
            # compute pairwise values between query genomes; floor division
            # keeps the pair count an integer rather than a float
            ng = len(query_genomes)
            self.num_pairs = (ng * ng - ng) // 2
            self.logger.info('Calculating AAI between all %d pairs of genomes:' % self.num_pairs)

        if self.num_pairs == 0:
            self.logger.warning('No genome pairs identified.')
            return

        # pair each query genome with the genomes it must be compared to:
        # all target genomes, or every later query genome so that each
        # unordered pair is processed exactly once
        genome_id_lists = []
        query_genomes = list(query_genomes)
        target_genomes = list(target_genomes)
        for i, genome_idI in enumerate(query_genomes):
            if target_genomes:
                genome_id_list = target_genomes
            else:
                genome_id_list = query_genomes[i + 1:]

            genome_id_lists.append((genome_idI, genome_id_list))

        self.processed_paired = 0
        parallel = Parallel(self.cpus)

        progress_func = self._progress
        if self.logger.is_silent:
            progress_func = None
        consumer_data = parallel.run(self._producer, self._consumer, genome_id_lists, progress_func)

        # write results for each genome pair
        self.logger.info('Summarizing AAI results.')
        aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
        with open(aai_summay_file, 'w') as fout:
            fout.write('#Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n')

            for data in consumer_data:
                fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

        # concatenate RBH files
        rbh_output_file = None
        if self.keep_rbhs:
            self.logger.info('Concatenating RBH files.')
            rbh_files = [os.path.join(self.output_dir, genome_id + '.rbh.tsv')
                         for genome_id in query_genomes]

            rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
            concatenate_files(rbh_files, rbh_output_file, common_header=True)

            # per-genome files are redundant once concatenated
            for f in rbh_files:
                os.remove(f)

        return aai_summay_file, rbh_output_file