def store_short_reads_for_splits(self):
    """Write the short reads collected for the splits into one FASTA file.

    The reads come from `self.get_short_reads_for_splits_dict()` and are
    handed as-is to `utils.store_dict_as_FASTA_file`, targeting
    `self.output_file_path`. The number of entries stored and the output
    path are then reported through `self.run.info`.
    """
    reads = self.get_short_reads_for_splits_dict()

    # the progress indicator only spans the actual write to disk
    progress = self.progress
    progress.new('Storing reads')
    progress.update('...')
    utils.store_dict_as_FASTA_file(reads, self.output_file_path)
    progress.end()

    # summary lines for the user
    report = self.run.info
    report('Num reads stored', pp(len(reads)))
    report('FASTA output', self.output_file_path)
def store_short_reads_for_splits(self):
    """Store the short reads for the splits as a FASTA file and report on it.

    Fetches the reads dictionary via `self.get_short_reads_for_splits_dict()`,
    writes it to `self.output_file_path` with `utils.store_dict_as_FASTA_file`
    while a progress indicator is shown, and finally prints the number of
    stored entries and the output path via `self.run.info`.
    """
    short_reads = self.get_short_reads_for_splits_dict()

    self.progress.new('Storing reads')
    self.progress.update('...')

    utils.store_dict_as_FASTA_file(short_reads, self.output_file_path)

    self.progress.end()

    # pretty-printed entry count first, then where the file ended up
    num_stored = pp(len(short_reads))
    self.run.info('Num reads stored', num_stored)
    self.run.info('FASTA output', self.output_file_path)
def store_short_reads_for_splits(self):
    """Write the short reads for the splits to FASTA file(s) and report them.

    Runs `self.sanity_check()` first and refuses to continue unless
    `self.sanity_checked` is set afterwards.

    Two output modes, selected by `self.split_R1_and_R2`:

    * falsy: the 'all' entry of the reads dictionary goes to
      `self.output_file_path` (or 'short_reads.fa' when that is empty);
    * truthy: every key of the reads dictionary (visited in sorted order)
      goes to its own '<output_file_prefix>_<read type>.fa' file, and the
      sizes of the 'R1' and 'UNPAIRED' entries are reported.

    When `self.gzip` is truthy each FASTA file is compressed with
    `utils.gzip_compress_file` and reported with a '.gz' suffix.

    Raises:
        ConfigError: if `self.sanity_checked` is falsy after the sanity check.
    """
    self.sanity_check()

    if not self.sanity_checked:
        raise ConfigError("store_short_reads_for_splits :: Cannot be called before running sanity_check")

    reads_by_type = self.get_short_reads_for_splits_dict()

    self.progress.new("Storing reads")
    self.progress.update("...")

    if not self.split_R1_and_R2:
        # single-file mode: everything lives under the 'all' key
        fasta_path = self.output_file_path or 'short_reads.fa'
        utils.store_dict_as_FASTA_file(reads_by_type['all'], fasta_path)

        if self.gzip:
            utils.gzip_compress_file(fasta_path)
            fasta_path = fasta_path + ".gz"

        self.progress.end()

        self.run.info('Output file for all short reads', fasta_path)
        self.run.info('Num reads stored', pp(len(reads_by_type['all'])), mc='green')
        return

    # split mode: one FASTA file per read type
    for read_type in sorted(reads_by_type.keys()):
        fasta_path = '%s_%s.fa' % (self.output_file_prefix, read_type)
        utils.store_dict_as_FASTA_file(reads_by_type[read_type], fasta_path)

        if self.gzip:
            utils.gzip_compress_file(fasta_path)
            fasta_path = fasta_path + ".gz"

        # reported while the progress indicator is still active, hence `progress=`
        self.run.info('Output file for %s' % read_type, fasta_path, progress=self.progress)

    self.progress.end()

    self.run.info('Num paired-end reads stored', pp(len(reads_by_type['R1'])), mc='green', nl_before=1)
    self.run.info('Num unpaired reads stored', pp(len(reads_by_type['UNPAIRED'])), mc='green')
def store_short_reads_for_splits(self):
    """Store short reads for the splits as (optionally gzipped) FASTA output.

    After `self.sanity_check()` has been run (and `self.sanity_checked` is
    confirmed), the reads dictionary from
    `self.get_short_reads_for_splits_dict()` is written out either as one
    file per read type (when `self.split_R1_and_R2` is truthy) or as a single
    file holding the 'all' entry. Output paths and read counts are reported
    through `self.run.info`.

    Raises:
        ConfigError: if `self.sanity_checked` is falsy after the sanity check.
    """

    def write_fasta(reads, path):
        """Write `reads` to `path`, gzip it if requested, return the final path."""
        utils.store_dict_as_FASTA_file(reads, path)
        if not self.gzip:
            return path
        utils.gzip_compress_file(path)
        return path + ".gz"

    self.sanity_check()
    if not self.sanity_checked:
        raise ConfigError("store_short_reads_for_splits :: Cannot be called before running sanity_check")

    short_reads = self.get_short_reads_for_splits_dict()

    self.progress.new("Storing reads")
    self.progress.update("...")

    if self.split_R1_and_R2:
        # one '<prefix>_<read type>.fa' file per key, visited in sorted order
        read_types = list(short_reads.keys())
        read_types.sort()

        for read_type in read_types:
            target_path = '%s_%s.fa' % (self.output_file_prefix, read_type)
            final_path = write_fasta(short_reads[read_type], target_path)
            self.run.info('Output file for %s' % read_type, final_path, progress=self.progress)

        self.progress.end()

        num_paired = pp(len(short_reads['R1']))
        self.run.info('Num paired-end reads stored', num_paired, mc='green', nl_before=1)

        num_unpaired = pp(len(short_reads['UNPAIRED']))
        self.run.info('Num unpaired reads stored', num_unpaired, mc='green')
    else:
        # fall back to a default file name when no output path was given
        target_path = self.output_file_path or 'short_reads.fa'
        final_path = write_fasta(short_reads['all'], target_path)

        self.progress.end()

        self.run.info('Output file for all short reads', final_path)
        self.run.info('Num reads stored', pp(len(short_reads['all'])), mc='green')
def process(self):
    """Convert the input GenBank file into FASTA, gene calls, and functions files.

    Every record of `self.input_genbank_path` (gzip-compressed when the path
    ends with '.gz') contributes its sequence to the FASTA output. Every "CDS"
    feature that survives the exclusion filters below becomes an external gene
    call, and, unless its product is a hypothetical protein, an entry in the
    functions table. `self.gene_callers_id` is used as the id of each reported
    gene and is incremented once per reported gene.

    Returns:
        dict: with keys 'external_gene_calls', 'gene_functional_annotation'
        and 'path' (the FASTA file). The first two are None when no gene call
        was reported; `self.output_gene_calls_path` and
        `self.output_functions_path` are reset to None in that case.

    Raises:
        ConfigError: if the input file cannot be opened / handed to the
            parser, or if it yields no GenBank records at all.
    """
    self.sanity_check()

    output_fasta = {}
    output_gene_calls = {}
    output_functions = {}

    num_genbank_records_processed = 0
    num_genes_found = 0
    num_genes_reported = 0
    num_genes_with_functions = 0

    # all the different ways products get listed as hypothetical; they are
    # normalized to a single spelling so they can be kept out of the functions
    # table below
    hypothetical_products = (
        "hypothetical",
        "hypothetical protein",
        "conserved hypothetical",
        "conserved hypotheticals",
        "Conserved hypothetical protein",
    )

    genbank_handle = None
    try:
        if self.input_genbank_path.endswith('.gz'):
            genbank_handle = io.TextIOWrapper(gzip.open(self.input_genbank_path, 'r'))
        else:
            genbank_handle = open(self.input_genbank_path, "r")

        genbank_file_object = SeqIO.parse(genbank_handle, "genbank")
    except Exception as e:
        # do not leak the handle if the parser could not be set up
        if genbank_handle is not None:
            genbank_handle.close()

        raise ConfigError(
            "Someone didn't like your unput 'genbank' file :/ Here's what they said "
            "about it: '%s'." % e) from e

    # NOTE(review): records appear to be parsed lazily while iterating, so format
    # errors would surface from the loop below rather than from the `try` above --
    # confirm against the Biopython version in use.
    with genbank_handle:
        for genbank_record in genbank_file_object:
            num_genbank_records_processed += 1
            output_fasta[genbank_record.name] = str(genbank_record.seq)

            for gene in genbank_record.features:
                # focusing on features annotated as "CDS" by NCBI's PGAP
                if gene.type != "CDS":
                    continue

                num_genes_found += 1

                qualifiers = gene.qualifiers
                location = str(gene.location)

                # dumping gene if "location" section contains any of the excluded
                # terms: "join" means the gene call spans multiple contigs; "<" or
                # ">" means the gene call runs off a contig
                if any(exclusion_term in location for exclusion_term in self.location_terms_to_exclude):
                    continue

                # dumping gene if its "note" section contains any of the excluded terms
                if "note" in qualifiers:
                    note = str(qualifiers["note"][0])
                    if any(exclusion_term in note for exclusion_term in self.note_terms_to_exclude):
                        continue

                # dumping if overlapping translation frame
                if "transl_except" in qualifiers:
                    continue

                # dumping if gene declared a pseudogene
                if "pseudo" in qualifiers or "pseudogene" in qualifiers:
                    continue

                # cleaning up gene coordinates to more easily parse: drop the
                # opening bracket and everything from the closing bracket on,
                # then split "start:end"
                location = location.replace("[", "")
                location = re.sub(r'](.*)', '', location)
                location = location.split(":")
                start = location[0]  # start coordinate
                end = location[1]  # end coordinate

                # setting direction to "f" or "r":
                direction = "f" if gene.strand == 1 else "r"

                # for accession, storing protein id if it has one, else the locus
                # tag, else "None"
                if "protein_id" in qualifiers:
                    accession = qualifiers["protein_id"][0]
                elif "locus_tag" in qualifiers:
                    accession = qualifiers["locus_tag"][0]
                else:
                    accession = "None"

                # storing gene product annotation if present
                if "product" in qualifiers:
                    function = qualifiers["product"][0]
                    if function in hypothetical_products:
                        function = "hypothetical protein"
                else:
                    function = "hypothetical protein"

                # if present, adding gene name to product annotation (so long as
                # not a hypothetical; sometimes these names are useful, sometimes
                # they are not). This must be an equality test: the previous
                # `function not in "hypothetical protein"` was a substring test
                # that also skipped products such as "protein".
                if "gene" in qualifiers and function != "hypothetical protein":
                    gene_name = str(qualifiers["gene"][0])
                    function = function + " (" + gene_name + ")"

                output_gene_calls[self.gene_callers_id] = {
                    'contig': genbank_record.name,
                    'start': start,
                    'stop': end,
                    'direction': direction,
                    'partial': 0,
                    'call_type': 1,
                    'source': self.source,
                    'version': self.version,
                }
                num_genes_reported += 1

                # not writing gene out to functions table if no annotation (any
                # product that contains the phrase is treated as unannotated)
                if "hypothetical protein" not in function:
                    output_functions[self.gene_callers_id] = {
                        'source': self.source,
                        'accession': accession,
                        'function': function,
                        'e_value': 0,
                    }
                    num_genes_with_functions += 1

                # increment the gene callers id for the next gene
                self.gene_callers_id += 1

    if num_genbank_records_processed == 0:
        raise ConfigError(
            "It seems there was no records in your input genbank file :/ Are you sure you "
            "gave the right file path that actually resolves to a genbank formatted "
            "text file?")

    self.run.info('Num GenBank entries processed', num_genbank_records_processed)
    self.run.info('Num gene records found', num_genes_found)
    self.run.info('Num genes reported', num_genes_reported, mc='green')
    self.run.info('Num genes with functions', num_genes_with_functions, mc='green', nl_after=1)

    # time to write these down:
    utils.store_dict_as_FASTA_file(output_fasta, self.output_fasta_path, wrap_from=None)
    self.run.info('FASTA file path', self.output_fasta_path)

    if output_gene_calls:
        utils.store_dict_as_TAB_delimited_file(
            output_gene_calls,
            self.output_gene_calls_path,
            headers=[
                "gene_callers_id", "contig", "start", "stop", "direction",
                "partial", "call_type", "source", "version"
            ])
        self.run.info('External gene calls file', self.output_gene_calls_path)

        utils.store_dict_as_TAB_delimited_file(
            output_functions,
            self.output_functions_path,
            headers=['gene_callers_id', 'source', 'accession', 'function', 'e_value'])
        self.run.info('TAB-delimited functions', self.output_functions_path)
    else:
        # nothing to write: make that visible to callers through the returned paths
        self.output_gene_calls_path = None
        self.output_functions_path = None
        self.run.warning(
            "Anvi'o couldn't find any gene calles in the GenBank file, hence you will get "
            "no output files for external gene calls or functions :/ We hope you can "
            "survive this terrible terrible news :(")

    self.run.info_single('Mmmmm ☘ ', nl_before=1, nl_after=1)

    return {
        'external_gene_calls': self.output_gene_calls_path,
        'gene_functional_annotation': self.output_functions_path,
        'path': self.output_fasta_path,
    }