コード例 #1
0
ファイル: bamops.py プロジェクト: mdehollander/anvio
    def store_short_reads_for_splits(self):
        """Write the short reads recovered for the splits to a FASTA file.

        The reads are obtained from `self.get_short_reads_for_splits_dict()` and
        handed to `utils.store_dict_as_FASTA_file` together with
        `self.output_file_path`. The number of entries in the dictionary and the
        output path are then reported through `self.run.info`.
        """
        reads = self.get_short_reads_for_splits_dict()

        # the progress indicator only brackets the actual write
        self.progress.new('Storing reads')
        self.progress.update('...')
        utils.store_dict_as_FASTA_file(reads, self.output_file_path)
        self.progress.end()

        num_reads_stored = pp(len(reads))
        self.run.info('Num reads stored', num_reads_stored)
        self.run.info('FASTA output', self.output_file_path)
コード例 #2
0
ファイル: bamops.py プロジェクト: paczian/anvio
    def store_short_reads_for_splits(self):
        """Store the short reads of the requested splits as a FASTA file.

        Reads come from `self.get_short_reads_for_splits_dict()`; they are
        written to `self.output_file_path` via `utils.store_dict_as_FASTA_file`,
        and a two-line summary (entry count, output path) is sent to `self.run`.
        """
        reads_dict = self.get_short_reads_for_splits_dict()

        progress = self.progress
        progress.new('Storing reads')
        progress.update('...')
        utils.store_dict_as_FASTA_file(reads_dict, self.output_file_path)
        progress.end()

        # summary for the user
        read_count = len(reads_dict)
        self.run.info('Num reads stored', pp(read_count))
        self.run.info('FASTA output', self.output_file_path)
コード例 #3
0
    def store_short_reads_for_splits(self):
        """Write the short reads recovered for the splits to FASTA file(s).

        Runs `self.sanity_check()` first and refuses to continue unless
        `self.sanity_checked` is set afterwards.

        Two output modes, selected by `self.split_R1_and_R2`:

        * falsy: the reads under the 'all' key go into a single file,
          `self.output_file_path` (or 'short_reads.fa' when that is empty).
        * truthy: one file per key of the reads dictionary, named
          '<self.output_file_prefix>_<key>.fa'; the counts for the 'R1' and
          'UNPAIRED' keys are reported at the end.

        In both modes each file is additionally compressed through
        `utils.gzip_compress_file` when `self.gzip` is set, and the reported
        path gets a '.gz' suffix.

        Raises:
            ConfigError: if `self.sanity_checked` is falsy after the sanity check.
        """
        self.sanity_check()

        if not self.sanity_checked:
            raise ConfigError(
                "store_short_reads_for_splits :: Cannot be called before running sanity_check"
            )

        reads_by_type = self.get_short_reads_for_splits_dict()

        self.progress.new("Storing reads")
        self.progress.update("...")

        # single-file mode: everything lives under the 'all' key
        if not self.split_R1_and_R2:
            fasta_path = self.output_file_path or 'short_reads.fa'
            utils.store_dict_as_FASTA_file(reads_by_type['all'], fasta_path)

            if self.gzip:
                utils.gzip_compress_file(fasta_path)
                fasta_path += ".gz"

            self.progress.end()
            self.run.info('Output file for all short reads', fasta_path)
            num_reads = pp(len(reads_by_type['all']))
            self.run.info('Num reads stored', num_reads, mc='green')
            return

        # split mode: one FASTA file per read type, in sorted key order
        for read_type in sorted(reads_by_type):
            fasta_path = '%s_%s.fa' % (self.output_file_prefix, read_type)
            utils.store_dict_as_FASTA_file(reads_by_type[read_type], fasta_path)

            if self.gzip:
                utils.gzip_compress_file(fasta_path)
                fasta_path += ".gz"

            # reported while the progress indicator is still active
            self.run.info('Output file for %s' % read_type,
                          fasta_path,
                          progress=self.progress)

        self.progress.end()

        num_paired = pp(len(reads_by_type['R1']))
        self.run.info('Num paired-end reads stored', num_paired, mc='green', nl_before=1)
        num_unpaired = pp(len(reads_by_type['UNPAIRED']))
        self.run.info('Num unpaired reads stored', num_unpaired, mc='green')
コード例 #4
0
ファイル: bamops.py プロジェクト: meren/anvio
    def store_short_reads_for_splits(self):
        """Store short reads for the splits as one or several FASTA files.

        After `self.sanity_check()` has run (and `self.sanity_checked` is
        set), the reads returned by `self.get_short_reads_for_splits_dict()`
        are written out:

        * when `self.split_R1_and_R2` is set, every key of the dictionary gets
          its own '<self.output_file_prefix>_<key>.fa' file and the sizes of
          the 'R1' and 'UNPAIRED' entries are reported;
        * otherwise the 'all' entry is written to `self.output_file_path`,
          falling back to 'short_reads.fa'.

        Files are gzip-compressed via `utils.gzip_compress_file` when
        `self.gzip` is set, in which case '.gz' is appended to the reported path.

        Raises:
            ConfigError: if `self.sanity_checked` is falsy after the sanity check.
        """
        self.sanity_check()

        if not self.sanity_checked:
            raise ConfigError("store_short_reads_for_splits :: Cannot be called before running sanity_check")

        short_reads = self.get_short_reads_for_splits_dict()
        run, progress = self.run, self.progress

        progress.new("Storing reads")
        progress.update("...")

        if self.split_R1_and_R2:
            # one output file per read type, processed in sorted key order
            for read_type in sorted(short_reads):
                path = '%s_%s.fa' % (self.output_file_prefix, read_type)
                utils.store_dict_as_FASTA_file(short_reads[read_type], path)

                if self.gzip:
                    utils.gzip_compress_file(path)
                    path = path + ".gz"

                run.info('Output file for %s' % read_type, path, progress=self.progress)

            progress.end()

            paired_count = pp(len(short_reads['R1']))
            run.info('Num paired-end reads stored', paired_count, mc='green', nl_before=1)
            unpaired_count = pp(len(short_reads['UNPAIRED']))
            run.info('Num unpaired reads stored', unpaired_count, mc='green')
            return

        # single output file holding every read
        path = self.output_file_path or 'short_reads.fa'
        utils.store_dict_as_FASTA_file(short_reads['all'], path)

        if self.gzip:
            utils.gzip_compress_file(path)
            path = path + ".gz"

        progress.end()
        run.info('Output file for all short reads', path)
        run.info('Num reads stored', pp(len(short_reads['all'])), mc='green')
コード例 #5
0
    def process(self):
        """Convert the input GenBank file into FASTA, gene-call and function files.

        Every record of `self.input_genbank_path` (read through gzip when the
        path ends with '.gz') contributes its sequence to the FASTA output,
        keyed by the record name. Features of type "CDS" become external gene
        calls unless they are dropped because:

        * their location string contains a term from
          `self.location_terms_to_exclude`,
        * their first "note" qualifier contains a term from
          `self.note_terms_to_exclude`,
        * they carry a "transl_except" qualifier, or
        * they carry a "pseudo" or "pseudogene" qualifier.

        Kept genes are numbered with `self.gene_callers_id`, which is
        incremented once per kept gene. Genes whose product is not a
        hypothetical protein are also added to the functions table.

        Returns:
            dict with the keys 'external_gene_calls',
            'gene_functional_annotation' and 'path' (the FASTA output path).
            The first two are None when no gene call was kept.

        Raises:
            ConfigError: if the input file cannot be opened or handed to the
                parser, or if it yields no records.
        """
        self.sanity_check()

        output_fasta = {}
        output_gene_calls = {}
        output_functions = {}
        num_genbank_records_processed = 0
        num_genes_found = 0
        num_genes_reported = 0
        num_genes_with_functions = 0

        # the different spellings of "no real annotation" seen in product
        # qualifiers; all of them are collapsed into "hypothetical protein"
        hypothetical_products = (
            "hypothetical",
            "hypothetical protein",
            "conserved hypothetical",
            "conserved hypotheticals",
            "Conserved hypothetical protein",
        )

        genbank_handle = None
        try:
            try:
                if self.input_genbank_path.endswith('.gz'):
                    genbank_handle = gzip.open(self.input_genbank_path, 'rt')
                else:
                    genbank_handle = open(self.input_genbank_path, "r")

                genbank_file_object = SeqIO.parse(genbank_handle, "genbank")
            except Exception as e:
                raise ConfigError(
                    "Someone didn't like your unput 'genbank' file :/ Here's what they said "
                    "about it: '%s'." % e) from e

            for genbank_record in genbank_file_object:
                num_genbank_records_processed += 1
                output_fasta[genbank_record.name] = str(genbank_record.seq)

                for gene in genbank_record.features:
                    # focusing on features annotated as "CDS" by NCBI's PGAP
                    if gene.type != "CDS":
                        continue

                    num_genes_found += 1
                    location = str(gene.location)

                    # dumping gene if its location contains any of the configured
                    # exclusion terms (self.location_terms_to_exclude is set
                    # outside this method; per the original author's note these
                    # flag gene calls spanning multiple contigs or running off
                    # a contig)
                    if any(exclusion_term in location
                           for exclusion_term in self.location_terms_to_exclude):
                        continue

                    if "note" in gene.qualifiers:
                        note = str(gene.qualifiers["note"][0])

                        # dumping gene if its note contains any configured exclusion term
                        if any(exclusion_term in note
                               for exclusion_term in self.note_terms_to_exclude):
                            continue

                    # dumping if overlapping translation frame
                    if "transl_except" in gene.qualifiers:
                        continue

                    # dumping if gene declared a pseudogene
                    if "pseudo" in gene.qualifiers or "pseudogene" in gene.qualifiers:
                        continue

                    # cleaning up gene coordinates to more easily parse: drop the
                    # opening bracket and everything from the closing bracket on,
                    # then split "start:end"
                    location = location.replace("[", "")
                    location = re.sub('](.*)', '', location)
                    location = location.split(":")

                    start = location[0]  # start coordinate
                    end = location[1]  # end coordinate

                    # setting direction to "f" or "r" (anything but strand 1 is "r")
                    direction = "f" if gene.strand == 1 else "r"

                    # for accession, storing protein id if it has one, else the
                    # locus tag, else "None"
                    if "protein_id" in gene.qualifiers:
                        accession = gene.qualifiers["protein_id"][0]
                    elif "locus_tag" in gene.qualifiers:
                        accession = gene.qualifiers["locus_tag"][0]
                    else:
                        accession = "None"

                    # storing gene product annotation if present
                    if "product" in gene.qualifiers:
                        function = gene.qualifiers["product"][0]
                        if function in hypothetical_products:
                            function = "hypothetical protein"
                    else:
                        function = "hypothetical protein"

                    # if present, adding gene name to product annotation, so long
                    # as it is not a hypothetical. This is an equality test on
                    # purpose: a substring test against the literal would also
                    # skip real products such as "protein".
                    if "gene" in gene.qualifiers and function != "hypothetical protein":
                        gene_name = str(gene.qualifiers["gene"][0])
                        function = function + " (" + gene_name + ")"

                    output_gene_calls[self.gene_callers_id] = {
                        'contig': genbank_record.name,
                        'start': start,
                        'stop': end,
                        'direction': direction,
                        'partial': 0,
                        'call_type': 1,
                        'source': self.source,
                        'version': self.version
                    }
                    num_genes_reported += 1

                    # not writing gene out to functions table if no annotation
                    if "hypothetical protein" not in function:
                        output_functions[self.gene_callers_id] = {
                            'source': self.source,
                            'accession': accession,
                            'function': function,
                            'e_value': 0
                        }
                        num_genes_with_functions += 1

                    # increment the gene callers id for the next gene
                    self.gene_callers_id += 1
        finally:
            # records are read lazily from the handle, so it can only be
            # closed once the iteration is over (or has failed)
            if genbank_handle is not None:
                genbank_handle.close()

        if num_genbank_records_processed == 0:
            raise ConfigError(
                "It seems there was no records in your input genbank file :/ Are you sure you "
                "gave the right file path that actually resolves to a genbank formatted "
                "text file?")

        self.run.info('Num GenBank entries processed',
                      num_genbank_records_processed)
        self.run.info('Num gene records found', num_genes_found)
        self.run.info('Num genes reported', num_genes_reported, mc='green')
        self.run.info('Num genes with functions',
                      num_genes_with_functions,
                      mc='green',
                      nl_after=1)

        # time to write these down:
        utils.store_dict_as_FASTA_file(output_fasta,
                                       self.output_fasta_path,
                                       wrap_from=None)
        self.run.info('FASTA file path', self.output_fasta_path)

        if output_gene_calls:
            utils.store_dict_as_TAB_delimited_file(output_gene_calls,
                                                   self.output_gene_calls_path,
                                                   headers=[
                                                       "gene_callers_id",
                                                       "contig", "start",
                                                       "stop", "direction",
                                                       "partial", "call_type",
                                                       "source", "version"
                                                   ])
            self.run.info('External gene calls file',
                          self.output_gene_calls_path)

            utils.store_dict_as_TAB_delimited_file(output_functions,
                                                   self.output_functions_path,
                                                   headers=[
                                                       'gene_callers_id',
                                                       'source', 'accession',
                                                       'function', 'e_value'
                                                   ])
            self.run.info('TAB-delimited functions',
                          self.output_functions_path)
        else:
            # nothing to write: signal the absence of both files to the caller
            self.output_gene_calls_path = None
            self.output_functions_path = None
            self.run.warning(
                "Anvi'o couldn't find any gene calles in the GenBank file, hence you will get "
                "no output files for external gene calls or functions :/ We hope you can "
                "survive this terrible terrible news :(")

        self.run.info_single('Mmmmm ☘ ', nl_before=1, nl_after=1)

        return {
            'external_gene_calls': self.output_gene_calls_path,
            'gene_functional_annotation': self.output_functions_path,
            'path': self.output_fasta_path
        }