def write_to_genbank(self, filename: str = None, directory: str = None, record: SeqRecord = None) -> None:
    """ Writes a genbank file containing only the information contained
        within the Region.

        Arguments:
            filename: the name of the file to write, if not provided it
                      defaults to "<parent record id>.region<NNN>.gbk"
            directory: an optional directory to prefix the filename with
            record: the biopython SeqRecord to cut the region from, if not
                    provided the parent record is converted and used instead

        Returns:
            None
    """
    if not filename:
        filename = "%s.region%03d.gbk" % (self.parent_record.id, self.get_region_number())
    if directory:
        filename = os.path.join(directory, filename)

    if record is None:
        record = self.parent_record.to_biopython()
    assert isinstance(record, SeqRecord)
    with warnings.catch_warnings():
        # any warnings raised while slicing the record are deliberately ignored
        warnings.simplefilter("ignore")
        cluster_record = record[self.location.start:self.location.end]

    # the annotations are copied across explicitly from the full record
    cluster_record.annotations["date"] = record.annotations.get("date", '')
    cluster_record.annotations["source"] = record.annotations.get("source", '')
    cluster_record.annotations["organism"] = record.annotations.get("organism", '')
    cluster_record.annotations["taxonomy"] = record.annotations.get("taxonomy", [])
    cluster_record.annotations["data_file_division"] = record.annotations.get("data_file_division", 'UNK')
    cluster_record.annotations["comment"] = record.annotations.get("comment", '')

    # update the antiSMASH annotation to include some cluster details
    comment_end_marker = "##antiSMASH-Data-END"
    cluster_comment = ("NOTE: This is a single cluster extracted from a larger record!\n"
                       "Orig. start :: {start}\n"
                       "Orig. end :: {end}\n"
                       "{end_marker}").format(start=self.location.start,
                                              end=self.location.end,
                                              end_marker=comment_end_marker)
    original = cluster_record.annotations["comment"]
    cluster_record.annotations["comment"] = original.replace(comment_end_marker, cluster_comment)

    # our cut-out clusters are always linear
    cluster_record.annotations["topology"] = "linear"

    # renumber clusters, superclusters and regions to reflect changes,
    # a region without superclusters, clusters or subregions must not break
    # the min() calls, so each falls back to an offset of zero
    first_supercluster = min((sc.get_supercluster_number() for sc in self.superclusters), default=0)
    first_cluster = min((cluster.get_cluster_number() for cluster in self.get_unique_clusters()), default=0)
    first_subregion = min((sub.get_subregion_number() for sub in self.subregions), default=0)
    for feature in cluster_record.features:
        if feature.type == "region":
            supers = feature.qualifiers.get("candidate_cluster_numbers")
            if not supers:
                continue
            feature.qualifiers["candidate_cluster_numbers"] = [str(int(num) - first_supercluster) for num in supers]
        elif feature.type == SuperCluster.FEATURE_TYPE:
            new = str(int(feature.qualifiers["candidate_cluster_number"][0]) - first_supercluster)
            feature.qualifiers["candidate_cluster_number"] = [new]
            new_clusters = [str(int(num) - first_cluster) for num in feature.qualifiers["protoclusters"]]
            feature.qualifiers["protoclusters"] = new_clusters
        elif feature.type in ["protocluster", "protocluster_core"]:
            new = str(int(feature.qualifiers["protocluster_number"][0]) - first_cluster)
            # the renumbered value has to replace the qualifier it was read
            # from, otherwise the old number is kept alongside a stray qualifier
            feature.qualifiers["protocluster_number"] = [new]
        elif feature.type == "subregion":
            new = str(int(feature.qualifiers["subregion_number"][0]) - first_subregion)
            feature.qualifiers["subregion_number"] = [new]

    seqio.write([cluster_record], filename, 'genbank')
def run_prodigal(record: Record, options: ConfigType) -> None:
    """ Run prodigal to annotate prokaryotic sequences

        Every gene prodigal reports is added to the record as a CDS feature.

        Arguments:
            record: the record to find genes in and to add the CDS features to
            options: the config object, with 'genefinding_tool' and,
                     optionally, 'prodigal.basedir' locating the executable

        Returns:
            None

        Raises:
            RuntimeError: if prodigal reports an error on stderr
    """
    if "basedir" in options.get('prodigal', ''):
        basedir = options.prodigal.basedir
    else:
        basedir = ""
    with TemporaryDirectory(change=True):
        # a leading '-' in the filename could be mistaken for an option
        name = record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([record.to_biopython()], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # metagenomic mode is used when requested and for short records
        if options.genefinding_tool == "prodigal-m" or len(record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal).stderr
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            raise RuntimeError("prodigal error: %s" % err)

        found = 0
        # the context manager ensures the results file is closed again
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe a gene
                if not line.startswith('>'):
                    continue
                try:
                    # a wrong number of fields also raises ValueError, so it
                    # is reported the same way as non-numeric coordinates
                    name, start_chunk, end_chunk, prodigal_strand = line[1:].rstrip().split("_")
                    start = int(start_chunk)
                    end = int(end_chunk)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r', line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1
                if start > end:
                    strand = -1
                    start, end = end, start

                # the reported start is 1-based, FeatureLocation is 0-based
                loc = FeatureLocation(start - 1, end, strand=strand)
                translation = record.get_aa_translation_from_location(loc)
                feature = CDSFeature(loc, locus_tag='ctg%s_%s' % (record.record_index, name),
                                     translation=translation, translation_table=record.transl_table)
                record.add_cds_feature(feature)
                found += 1
        logging.debug("prodigal found %d CDS features", found)
def write(seq_records, options):
    """Write all records to "<first record id>.final.embl" in EMBL format.

    The file is placed in options.outputfoldername. The debug message is
    always logged, but the file is only written for nucleotide input.
    """
    first_id = seq_records[0].id
    embl_path = path.join(options.outputfoldername, "%s.final.embl" % first_id)
    logging.debug("Writing seq_records to %r", embl_path)
    if options.input_type != 'nucl':
        return
    seqio.write(seq_records, embl_path, 'embl')
def write_to_genbank(self, filename=None, directory=None, record=None):
    """ Writes a genbank file containing only the information contained
        within the Cluster.

        The filename defaults to "<parent record id>.cluster<NNN>.gbk" and is
        prefixed with the directory, if one is given. If no biopython record
        is provided, the parent record is converted and used.
    """
    if not filename:
        filename = "%s.cluster%03d.gbk" % (self.parent_record.id, self.get_cluster_number())
    if directory:
        filename = os.path.join(directory, filename)

    if record is None:
        record = self.parent_record.to_biopython()
    assert isinstance(record, SeqRecord)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cluster_record = record[self.location.start:self.location.end]

    # annotation keys copied from the full record, each with its fallback
    copied_annotations = (
        ("date", ''),
        ("source", ''),
        ("organism", ''),
        ("taxonomy", []),
        ("data_file_division", 'UNK'),
    )
    for key, fallback in copied_annotations:
        cluster_record.annotations[key] = record.annotations.get(key, fallback)

    # our cut-out clusters are always linear
    cluster_record.annotations["topology"] = "linear"

    seqio.write([cluster_record], filename, 'genbank')
def write_search_fasta(record: Record) -> str:
    """ Writes the record, in FASTA format, to a file named after the record's
        id in the current directory.

        Arguments:
            record: the record to write

        Returns:
            the name of the file created
    """
    fasta_name = "{name}.fasta".format(name=record.id)
    with open(fasta_name, 'w') as fasta_handle:
        bio_records = [record.to_biopython()]
        seqio.write(bio_records, fasta_handle, 'fasta')
    return fasta_name
def write(seq_records, options):
    """Write the final GenBank output, plus one file per cluster.

    For nucleotide input all records go into "<id>.final.gbk" and each
    cluster of every plain SeqRecord is cut out into its own
    "<id>.cluster<NNN>.gbk", numbered consecutively across all records.
    For any other input type the records are converted to protein records
    and written to "<id>.final.gp".
    """
    basename = seq_records[0].id

    if options.input_type != 'nucl':
        seq_records = seq_record_convert_nucl_to_prot(seq_records, options)
        protein_path = path.join(options.outputfoldername, "%s.final.gp" % basename)
        logging.debug("Writing seq_records to %r", protein_path)
        seqio.write(seq_records, protein_path, 'genbank')
        return

    full_path = path.join(options.outputfoldername, "%s.final.gbk" % basename)
    logging.debug("Writing %s seq_records to %r", len(seq_records), full_path)
    seqio.write(seq_records, full_path, 'genbank')

    cluster_index = 1
    for rec in seq_records:
        # For compatibility with the database importer, only records read
        # from a file (class SeqRecord) are split into clusters. Records from
        # a database (class DBSeqRecord) do not support being sliced and
        # would throw an exception.
        if rec.__class__.__name__ != 'SeqRecord':
            continue
        for cluster in utils.get_cluster_features(rec):
            location = cluster.location
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                cluster_rec = rec[location.start:location.end]
            cluster_path = path.join(options.outputfoldername,
                                     "%s.cluster%03d.gbk" % (basename, cluster_index))
            seqio.write([cluster_rec], cluster_path, 'genbank')
            cluster_index += 1
def write(seq_records, options):
    """Write the final GenBank output, plus one annotated file per cluster.

    For nucleotide input every cluster is cut out of its record, given the
    record's core annotations and written to "<id>.cluster<NNN>.gbk", then
    all records are written to "<id>.final.gbk". For any other input type
    the records are converted to protein records and written to
    "<id>.final.gp".
    """
    basename = seq_records[0].id
    out_dir = options.outputfoldername

    if options.input_type == 'nucl':
        output_name = path.join(out_dir, "%s.final.gbk" % basename)
        # annotation keys copied from the full record, each with its fallback
        copied_annotations = (
            ("date", ''),
            ("source", ''),
            ("organism", ''),
            ("taxonomy", []),
            ("data_file_division", 'UNK'),
        )
        for rec in seq_records:
            for cluster in utils.get_cluster_features(rec):
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    cluster_rec = rec[cluster.location.start:cluster.location.end]

                for key, fallback in copied_annotations:
                    cluster_rec.annotations[key] = rec.annotations.get(key, fallback)

                # our cut-out clusters are always linear
                cluster_rec.annotations["topology"] = "linear"

                cluster_file = "%s.cluster%03d.gbk" % (basename, utils.get_cluster_number(cluster))
                seqio.write([cluster_rec], path.join(out_dir, cluster_file), 'genbank')
    else:
        seq_records = seq_record_convert_nucl_to_prot(seq_records, options)
        output_name = path.join(out_dir, "%s.final.gp" % basename)

    logging.debug("Writing seq_records to %r", output_name)
    seqio.write(seq_records, output_name, 'genbank')
def run_glimmer(seq_record, options):
    """Run glimmer3 to annotate prokaryotic sequences

    Runs long-orfs, extract, build-icm and glimmer3 in sequence on the
    record, then appends a CDS feature to seq_record.features for every
    prediction in the glimmer3 results. If any of the tools reports a
    failure, the error is logged and no features are added.
    """
    basedir = utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        # a leading '-' in the filename could be mistaken for an option
        name = seq_record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        longorfs_file = '%s.longorfs' % name
        icm_file = '%s.icm' % name
        result_file = '%s.predict' % name

        # run long-orfs
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        long_orfs = [path.join(basedir, 'long-orfs')]
        long_orfs.extend(['-l', '-n', '-t', '1.15', '--trans_table', '11',
                          fasta_file, longorfs_file])
        _, err, _ = execute(long_orfs)
        if 'ERROR' in err:
            logging.error("Locating long orfs failed: %r", err)
            return

        # run extract
        extract = [path.join(basedir, 'extract'), '-t', fasta_file, longorfs_file]
        out, err, _ = execute(extract)
        if out == '':
            logging.error("Failed to extract genes from model, aborting: %r", err)
            return

        # run build-icm, fed with the output of extract
        build_icm = [path.join(basedir, 'build-icm'), '-r', icm_file]
        _, err, _ = execute(build_icm, input=out)
        if err != '':
            logging.error("Failed to build gene model: %r", err)
            return

        # run glimmer3
        glimmer = [path.join(basedir, 'glimmer3')]
        glimmer.extend(['-l', '-o', '50', '-g', '90', '-q', '3000', '-t', '30',
                        '--trans_table', '11', fasta_file, icm_file, name])
        _, err, _ = execute(glimmer)
        if 'ERROR' in err:
            logging.error("Failed to run glimmer3: %r", err)
            return

        # the context manager ensures the results file is closed again
        with open(result_file, 'r') as results:
            for line in results:
                # skip first line
                if line.startswith('>'):
                    continue
                try:
                    # a wrong number of fields also raises ValueError
                    orf_name, start, end, strand, score = line.split()
                    start = int(start)
                    end = int(end)
                    # the value itself is unused, but must still be a number
                    int(strand)
                except ValueError:
                    logging.error('Malformatted glimmer output line %r', line.rstrip())
                    # without skipping the line, the unparsed values would be
                    # used to build the location below
                    continue

                # a start after the end marks a gene on the reverse strand
                if start > end:
                    bpy_strand = -1
                    start, end = end, start
                else:
                    bpy_strand = 1

                # the reported start is 1-based, FeatureLocation is 0-based
                loc = FeatureLocation(start - 1, end, strand=bpy_strand)
                feature = SeqFeature(location=loc, id=orf_name, type="CDS",
                                     qualifiers={
                                         'locus_tag': ['ctg%s_%s' % (options.record_idx, orf_name)],
                                         'note': ['Glimmer score: %s' % score],
                                     })
                seq_record.features.append(feature)
def run_prodigal(seq_record, options):
    """Run prodigal to annotate prokaryotic sequences

    Appends a CDS feature to seq_record.features for every gene prodigal
    reports. If prodigal reports an error, it is logged and no features
    are added.
    """
    # default to finding prodigal on the path, so basedir is always defined
    # even if the prodigal options exist without a basedir
    basedir = ""
    if "prodigal" in options and "basedir" in options.prodigal:
        basedir = options.prodigal.basedir

    with TemporaryDirectory(change=True):
        utils.fix_record_name_id(seq_record, options)
        # a leading '-' in the filename could be mistaken for an option
        name = seq_record.id.lstrip('-')
        if not name:
            name = "unknown"
        fasta_file = '%s.fasta' % name
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')

        # run prodigal
        prodigal = [path.join(basedir, 'prodigal')]
        prodigal.extend(['-i', fasta_file, '-f', 'sco', '-o', result_file])
        # metagenomic mode is used when requested and for short records
        if options.genefinding == "prodigal-m" or len(seq_record.seq) < 20000:
            prodigal.extend(['-p', 'meta'])

        err = execute(prodigal)[1]
        if 'Error' in err:
            logging.error("Failed to run prodigal: %r", err)
            return

        # the context manager ensures the results file is closed again
        with open(result_file, 'r') as results:
            for line in results:
                # only lines starting with '>' describe a gene
                if not line.startswith('>'):
                    continue
                try:
                    # a wrong number of fields also raises ValueError, so it
                    # is reported the same way as non-numeric coordinates
                    name, start, end, prodigal_strand = line[1:].rstrip().split("_")
                    start = int(start)
                    end = int(end)
                except ValueError:
                    logging.error('Malformatted prodigal output line %r', line.rstrip())
                    continue

                strand = 1 if prodigal_strand == "+" else -1
                if start > end:
                    strand = -1
                    start, end = end, start

                # the reported start is 1-based, FeatureLocation is 0-based
                loc = FeatureLocation(start - 1, end, strand=strand)
                feature = SeqFeature(location=loc, id=name, type="CDS",
                                     qualifiers={
                                         'locus_tag': ['ctg%s_%s' % (options.record_idx, name)],
                                     })
                seq_record.features.append(feature)
def test_write_calls_biopython(self):
    """Test writing Bio.SeqIO records

    The seqio wrapper must hand its arguments, unchanged, to Bio.SeqIO.write.
    """
    records = ['fake']
    expected_trace = " Called Bio.SeqIO.write(['fake'], DummyHandle('test.gbk'), 'genbank')"
    # replace the real writer so the call is only recorded by the tracker
    mock("Bio.SeqIO.write", tracker=self.tt, returns=[])
    seqio.write(records, self.handle, "genbank")
    assert_same_trace(self.tt, expected_trace)
def run_glimmerhmm(seq_record, options):
    """Run GlimmerHMM on a record and append the predicted genes as CDS features.

    The record is written to a FASTA file in a temporary directory and the
    'glimmerhmm' executable is run on it with the training folder named by
    options.glimmerhmm_train_folder. The tab-separated output is parsed
    into genes, each with one or more exons, and a CDS SeqFeature is
    appended to seq_record.features for every gene. The locus tags are
    built from options.record_idx and a generated ORF name.

    If stderr contains 'ERROR', the error is logged and nothing is added.
    Finding no genes is logged as an error, but is not otherwise treated
    as a failure.
    """
    # NOTE(review): basedir is assigned but never used in this function
    basedir = utils.get_genefinding_basedir(options)
    with TemporaryDirectory(change=True):
        # Write FASTA file and run GlimmerHMM
        utils.fix_record_name_id(seq_record, options)
        name = seq_record.id
        # strip any leading dashes from the name used for the filenames
        while len(name) > 0 and name[0] == '-':
            name = name[1:]
        if name == "":
            name = "unknown"
        fasta_file = '%s.fasta' % name
        # NOTE(review): result_file is never used, the predictions are read from stdout
        result_file = '%s.predict' % name
        with open(fasta_file, 'w') as handle:
            seqio.write([seq_record], handle, 'fasta')
        glimmerhmm = ['glimmerhmm']
        glimmerhmm.extend([
            fasta_file,
            utils.get_full_path(__file__, "train_%s" % options.glimmerhmm_train_folder),
            "-g"
        ])
        out, err, retcode = execute(glimmerhmm)
        if err.find('ERROR') > -1:
            logging.error("Failed to run GlimmerHMM: %r" % err)
            return

        # Parse GlimmerHMM predictions
        resultstext = out
        if "CDS" not in resultstext:
            logging.error("GlimmerHMM gene prediction failed: no genes found.")
        resultstext = resultstext.replace("\r", " ")
        lines = resultstext.split("\n")
        # the first two lines and the last line are dropped
        # (presumably headers and a trailing empty line, verify against real output)
        lines = lines[2:-1]
        orfnames = []   # one generated name per gene
        positions = []  # one list of [start, end] pairs per gene
        strands = []    # one strand (1 or -1) per gene
        x = 0           # index of the current line within 'lines'
        orfnr = 0       # number of genes completed so far
        starts = []     # exon starts collected for the gene in progress
        ends = []       # exon ends collected for the gene in progress
        for line in lines:
            columns = line.split("\t")
            # lines without any tabs are ignored, but still counted by x
            if len(columns) > 1:
                if x == 0:
                    # first line: note the strand and, unless it is an mRNA
                    # line, collect its coordinates
                    if columns[6] == "+":
                        bpy_strand = 1
                    else:
                        bpy_strand = -1
                    if "mRNA" not in line:
                        starts.append(int(columns[3]))
                        ends.append(int(columns[4]))
                elif x == (len(lines) - 1) or "mRNA" in lines[x + 1]:
                    # last line of a gene: either the final line overall or
                    # the line before the next mRNA line
                    if columns[6] == "+":
                        bpy_strand = 1
                    else:
                        bpy_strand = -1
                    strands.append(bpy_strand)
                    starts.append(int(columns[3]))
                    ends.append(int(columns[4]))
                    # the name is "orf", then (5 - orfnr) zeros, then the number
                    orfnames.append("orf" + (5 - orfnr) * "0" + str(orfnr))
                    orfnr += 1
                    if len(starts) == 1:
                        # single exon gene, coordinates of 0 are raised to 1
                        if starts[0] == 0:
                            starts[0] = 1
                        if ends[0] == 0:
                            ends[0] = 1
                        # starts are shifted down by one when stored
                        positions.append([[starts[0] - 1, ends[0]]])
                    else:
                        pos = []
                        # exons of reverse strand genes are stored in reverse order
                        if bpy_strand == -1:
                            starts.reverse()
                            ends.reverse()
                        for i in starts:
                            if i == 0:
                                i = 1
                            # the matching end is found via the first
                            # occurrence of the start value in 'starts'
                            if ends[starts.index(i)] == 0:
                                ends[starts.index(i)] = 1
                            pos.append([i - 1, ends[starts.index(i)]])
                        positions.append(pos)
                    # reset the exon coordinates for the next gene
                    starts = []
                    ends = []
                elif "mRNA" not in line:
                    # an exon in the middle of a gene
                    starts.append(int(columns[3]))
                    ends.append(int(columns[4]))
            x += 1
        if len(orfnames) == 0:
            logging.error("GlimmerHMM gene prediction failed. Please check the " \
                "format of your input FASTA file.")

        # Create seq_record features for identified genes
        idx = 0  # index into the parallel 'strands' and 'positions' lists
        for orfname in orfnames:
            bpy_strand = strands[idx]
            genepositions = positions[idx]
            # For genes with only one CDS
            if len(genepositions) == 1:
                gstart, gend = genepositions[0]
                loc = FeatureLocation(gstart, gend, strand=bpy_strand)
                feature = SeqFeature(
                    location=loc,
                    id=orfname,
                    type="CDS",
                    qualifiers={
                        'locus_tag': ['ctg%s_%s' % (options.record_idx, orfname)]
                    })
                seq_record.features.append(feature)
            # For genes with multiple exons
            else:
                # NOTE(review): gstart and gend are not used in this branch
                gstart, gend = min(genepositions[0]), max(genepositions[-1])
                sublocations = []
                for exonstart, exonend in genepositions:
                    exonloc = FeatureLocation(exonstart, exonend, strand=bpy_strand)
                    sublocations.append(exonloc)
                loc = CompoundLocation(sublocations)
                feature = SeqFeature(
                    location=loc,
                    id=orfname,
                    type="CDS",
                    qualifiers={
                        'locus_tag': ['ctg%s_%s' % (options.record_idx, orfname)]
                    })
                seq_record.features.append(feature)
            idx += 1
def write_to_genbank(self, filename: str = None, directory: str = None, record: SeqRecord = None) -> None:
    """ Writes a genbank file containing only the information contained
        within the Region.

        Arguments:
            filename: the name of the file to write, if not provided it
                      defaults to "<parent record id>.region<NNN>.gbk"
            directory: an optional directory to prefix the filename with
            record: the biopython SeqRecord to cut the region from, if not
                    provided the parent record is converted and used instead

        Returns:
            None
    """
    if not filename:
        filename = "%s.region%03d.gbk" % (self.parent_record.id, self.get_region_number())
    if directory:
        filename = os.path.join(directory, filename)

    if record is None:
        record = self.parent_record.to_biopython()
    assert isinstance(record, SeqRecord)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cluster_record = record[self.location.start:self.location.end]

    # annotation keys copied from the full record, each with its fallback
    copied_annotations = (
        ("date", ''),
        ("source", ''),
        ("organism", ''),
        ("taxonomy", []),
        ("data_file_division", 'UNK'),
        ("comment", ''),
    )
    for key, fallback in copied_annotations:
        cluster_record.annotations[key] = record.annotations.get(key, fallback)
    # biopython does not persist the molecule_type annotation in slices,
    # despite it being required for output to the genbank format
    cluster_record.annotations["molecule_type"] = record.annotations["molecule_type"]

    # update the antiSMASH annotation to include some cluster details
    comment_end_marker = "##antiSMASH-Data-END"
    comment_template = ("NOTE: This is a single cluster extracted from a larger record!\n"
                        "Orig. start :: {start}\n"
                        "Orig. end :: {end}\n"
                        "{end_marker}")
    cluster_comment = comment_template.format(start=self.location.start,
                                              end=self.location.end,
                                              end_marker=comment_end_marker)
    original = cluster_record.annotations["comment"]
    cluster_record.annotations["comment"] = original.replace(comment_end_marker, cluster_comment)

    # our cut-out clusters are always linear
    cluster_record.annotations["topology"] = "linear"

    # renumber clusters, candidate_clusters and regions to reflect changes
    # also update positions of RiPP component locations
    if self.candidate_clusters:
        first_candidate_cluster = min(sc.get_candidate_cluster_number() for sc in self.candidate_clusters)
        first_cluster = min(cluster.get_protocluster_number() for cluster in self.get_unique_protoclusters())
    else:
        first_candidate_cluster = 0
        first_cluster = 0
    first_subregion = 0
    if self.subregions:
        first_subregion = min(sub.get_subregion_number() for sub in self.subregions)

    def shift(number: str, first: int) -> str:
        """ Renumbers a value so that the first of its kind becomes 1 """
        return str(int(number) - first + 1)

    for feature in cluster_record.features:
        kind = feature.type
        quals = feature.qualifiers
        if kind == Region.FEATURE_TYPE:
            candidates = quals.get("candidate_cluster_numbers")
            if candidates:
                quals["candidate_cluster_numbers"] = [shift(num, first_candidate_cluster)
                                                      for num in candidates]
        elif kind == CandidateCluster.FEATURE_TYPE:
            quals["candidate_cluster_number"] = [shift(quals["candidate_cluster_number"][0],
                                                       first_candidate_cluster)]
            quals["protoclusters"] = [shift(num, first_cluster) for num in quals["protoclusters"]]
        elif kind in ["protocluster", "proto_core"]:
            quals["protocluster_number"] = [shift(quals["protocluster_number"][0], first_cluster)]
        elif kind == "subregion":
            quals["subregion_number"] = [shift(quals["subregion_number"][0], first_subregion)]
        elif kind == "CDS_motif":
            for qual in ("leader_location", "tail_location"):
                if qual not in quals:
                    continue
                loc = location_from_string(quals[qual][0])
                # shift each part so it is relative to the start of the region
                parts = [FeatureLocation(part.start - self.location.start,
                                         part.end - self.location.start,
                                         part.strand)
                         for part in loc.parts]
                quals[qual] = [str(build_location_from_others(parts))]

    seqio.write([cluster_record], filename, 'genbank')