Esempio n. 1
0
    def _run_reciprocal_diamond(self, query_gene_file, target_gene_file,
                                evalue, per_identity, per_aln_len, max_hits,
                                sensitive, high_mem, tmp_dir, output_dir):
        """Perform similarity search of query genes against target genes, and reciprocal hits.

        DIAMOND databases are built for both gene sets, the query genes are
        searched against the target database, the target genes that received
        at least one hit are written to 'target_genes_hit.faa' and searched
        back against the query database, and the two hit tables are merged
        and passed to ``self._sort_hit_table`` to produce 'hits_sorted.tsv'
        in the output directory.

        Parameters
        ----------
        query_gene_file : str
            File with all query proteins.
        target_gene_file : str
            File with all target proteins.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequences.
        sensitive : bool
            Forwarded unchanged to ``Diamond.blastp`` (presumably toggles
            DIAMOND's sensitive search mode; confirm against that wrapper).
        high_mem : bool
            If true, both searches are run with chunk_size=1 and block_size=8.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """
        # imported locally so the method does not rely on a file-level import
        import shutil

        self.logger.info(
            'Creating DIAMOND database of query proteins (be patient!).')
        diamond = Diamond(self.cpus)
        query_diamond_db = os.path.join(output_dir, 'query_genes')
        diamond.create_db(query_gene_file, query_diamond_db)

        self.logger.info(
            'Creating DIAMOND database of target proteins (be patient!).')
        target_diamond_db = os.path.join(output_dir, 'target_genes')
        diamond.create_db(target_gene_file, target_diamond_db)

        # extra search settings applied only when high_mem is requested
        mem_args = {'chunk_size': 1, 'block_size': 8} if high_mem else {}

        def new_tmp_hits_table():
            """Reserve an empty, uniquely named hits file and return its path."""
            # delete=False keeps the file on disk after the handle is closed
            # so that it can be handed to DIAMOND by name
            with tempfile.NamedTemporaryFile(prefix='comparem_hits_',
                                             dir=tmp_dir if tmp_dir else None,
                                             delete=False) as tmp_file:
                return tmp_file.name

        def search(gene_file, diamond_db, hits_table):
            """Search gene_file against diamond_db, writing hits to hits_table."""
            diamond.blastp(gene_file, diamond_db, evalue, per_identity,
                           per_aln_len, max_hits, sensitive, hits_table,
                           'standard', tmp_dir, **mem_args)

        # blast query genes against target proteins
        self.logger.info(
            'Performing similarity sequence between query and target proteins (be patient!).'
        )
        tmp_query_hits_table = new_tmp_hits_table()
        search(query_gene_file, target_diamond_db, tmp_query_hits_table)

        # get target genes hit by one or more query proteins
        self.logger.info(
            'Creating file with target proteins with similarity to query proteins.'
        )
        target_hit = set()
        with open(tmp_query_hits_table) as hits:
            for line in hits:
                fields = line.split('\t')
                # the subject identifier is the second column; blank or
                # truncated lines carry no subject and are skipped
                if len(fields) > 1:
                    target_hit.add(fields[1])

        target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
        with open(target_genes_hits, 'w') as fout:
            for seq_id, seq in seq_io.read_seq(target_gene_file):
                if seq_id in target_hit:
                    fout.write('>' + seq_id + '\n')
                    fout.write(seq + '\n')

        self.logger.info(
            'Identified %d target proteins to be used in reciprocal search.' %
            len(target_hit))

        # perform reciprocal blast
        self.logger.info(
            'Performing reciprocal similarity sequence between target and query proteins (be patient!).'
        )
        tmp_target_hits_table = new_tmp_hits_table()
        search(target_genes_hits, query_diamond_db, tmp_target_hits_table)

        # combine hit tables and sort; the reciprocal hits are appended in
        # binary mode so the bytes are copied verbatim without spawning a
        # shell, and any I/O failure raises instead of being ignored
        with open(tmp_query_hits_table, 'ab') as combined, \
                open(tmp_target_hits_table, 'rb') as reciprocal:
            shutil.copyfileobj(reciprocal, combined)
        os.remove(tmp_target_hits_table)

        hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(tmp_query_hits_table, hits_table_file)
Esempio n. 2
0
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
        """Create taxonomic profiles for a set of genomes.

        Scaffold genes are searched with DIAMOND against the reference
        genomes of interest and against a competing reference database.
        For every scaffold with at least one retained hit, a summary line
        is written to 'references.tsv' in ``self.output_dir``.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold of valid hits.
        per_identity : float
            Percent identity threshold of valid hits [0,100].
        per_aln_len : float
            Percent query coverage of valid hits [0, 100].

        Returns
        -------
        str
            Path to the tab-separated summary file that was written.
        """

        def format_counts(counts):
            """Render a count table as 'id:count,...' ordered by decreasing count."""
            ranked = sorted(counts.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
            return ','.join(item_id + ':' + str(num_hits)
                            for item_id, num_hits in ranked)

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('Creating DIAMOND database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.create_db(ref_gene_file, ref_diamond_db)

        self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

        self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold; the scaffold identifier is
        # the gene identifier with its final '_'-separated field removed
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold; everything from the last '~' of the
        # query identifier onwards is dropped before deriving the scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.items():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        with open(reference_out, 'w') as fout:
            fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
            fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
            fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

            for scaffold_id, hits in hits_to_scaffold.items():
                # per-hit values are kept in names distinct from the
                # evalue / per_identity threshold parameters
                aln_lens = []
                perc_idens = []
                hit_evalues = []
                bitscores = []
                subject_scaffold_ids = defaultdict(int)
                subject_bin_ids = defaultdict(int)
                for hit in hits:
                    aln_lens.append(hit.aln_length)
                    perc_idens.append(hit.perc_identity)
                    hit_evalues.append(hit.evalue)
                    bitscores.append(hit.bitscore)

                    # subject identifiers have the form <bin id>~<gene id>
                    subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                    subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                    subject_scaffold_ids[subject_scaffold_id] += 1
                    subject_bin_ids[subject_bin_id] += 1

                num_genes = num_genes_on_scaffold[scaffold_id]

                # NOTE(review): print_stats() fills a single %s slot under
                # three header columns (Genome ID, Length, GC), so it
                # presumably returns them tab-separated - confirm against
                # ScaffoldStats
                fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                    scaffold_id,
                    format_counts(subject_bin_ids),
                    format_counts(subject_scaffold_ids),
                    scaffold_stats.print_stats(scaffold_id),
                    mean(scaffold_stats.coverage(scaffold_id)),
                    num_genes,
                    len(hits),
                    len(hits) * 100.0 / num_genes,
                    mean(aln_lens),
                    mean(perc_idens),
                    mean(hit_evalues),
                    mean(bitscores)))

        return reference_out
Esempio n. 3
0
    def _run_self_diamond(self, query_gene_file, evalue, per_identity,
                          per_aln_len, max_hits, sensitive, high_mem, tmp_dir,
                          output_dir):
        """Search the query genes against a DIAMOND database built from themselves.

        The raw hits are written to a temporary table which is then passed
        to ``self._sort_hit_table`` to produce 'hits_sorted.tsv' in the
        output directory.

        Parameters
        ----------
        query_gene_file : str
            File with all query sequences.
        evalue : float
            E-value threshold for reporting hits.
        per_identity : float
            Percent identity threshold for reporting hits.
        per_aln_len : float
            Percent query coverage threshold for reporting hits.
        max_hits : int
            Maximum number of hits to report per query sequences.
        sensitive : bool
            Forwarded unchanged to ``Diamond.blastp`` (presumably toggles
            DIAMOND's sensitive search mode; confirm against that wrapper).
        high_mem : bool
            If true, the search is run with chunk_size=1 and block_size=8.
        tmp_dir : str
            Directory to store temporary files.
        output_dir : str
            Directory to store blast results.
        """

        self.logger.info('Creating DIAMOND database (be patient!).')

        db_prefix = os.path.join(output_dir, 'query_genes')
        aligner = Diamond(self.cpus)
        aligner.create_db(query_gene_file, db_prefix)

        # reserve a uniquely named file for the unsorted hits; it is placed
        # in tmp_dir when one is given and in the default location otherwise
        tmp_options = {'prefix': 'comparem_hits_', 'delete': False}
        if tmp_dir:
            tmp_options['dir'] = tmp_dir
        with tempfile.NamedTemporaryFile(**tmp_options) as tmp_handle:
            raw_hits_path = tmp_handle.name

        # search every gene against the database
        self.logger.info(
            'Performing self similarity sequence between genomes (be patient!).'
        )

        # the high-memory mode only adds two keyword settings to the search
        mem_options = {}
        if high_mem:
            mem_options = {'chunk_size': 1, 'block_size': 8}

        aligner.blastp(query_gene_file,
                       db_prefix,
                       evalue,
                       per_identity,
                       per_aln_len,
                       max_hits,
                       sensitive,
                       raw_hits_path,
                       'standard',
                       tmp_dir,
                       **mem_options)

        # sort the raw hits into the final table
        sorted_hits_path = os.path.join(output_dir, 'hits_sorted.tsv')
        self._sort_hit_table(raw_hits_path, sorted_hits_path)