def create_gene_feature(gene_name, feature_location, feature_qualifiers):
    """Create a minimal SeqFeature representing a gene.

    Args:
        gene_name: Name stored under the ``gene`` qualifier as a one-item list.
        feature_location: Location object handed straight to ``SeqFeature``.
        feature_qualifiers: Mapping of additional qualifiers. On a key
            collision (including ``gene``) its value wins. The mapping itself
            is not modified.

    Returns:
        A ``SeqFeature`` of type ``gene`` carrying the merged qualifiers.
    """
    gene_feature = SeqFeature(feature_location, type='gene')
    # Merge with update(): the former `items() + items()` concatenation only
    # works on Python 2, where items() returns lists.
    qualifiers = {'gene': [gene_name]}
    qualifiers.update(feature_qualifiers)
    gene_feature.qualifiers = qualifiers
    return gene_feature
def write_gbk(results, query_dic, out_dir, out_prefix):
    """Write one GenBank file per query sequence from a list of hit dicts.

    ``results`` is sorted in place by ``qid``. Hits are grouped per query,
    turned into ``CDS`` (when ``test_cds`` returns 1) or ``misc_feature``
    features, ordered by start position and numbered with a ``DET_<n>``
    locus tag that keeps counting across all records.

    Args:
        results: List of hit dicts (reads ``qid``, ``qstart``, ``qend``,
            ``strand``, ``tid`` and optionally ``qprot``).
        query_dic: Mapping from query id to an object exposing ``seq``.
        out_dir: Directory receiving ``<record id>.gbk`` (dots in the id are
            replaced by dashes).
        out_prefix: Currently unused by this function.
    """
    results.sort(key=operator.itemgetter('qid'))

    # Bucket the hits by the query sequence they were found on.
    hits_by_query = {}
    for hit in results:
        hits_by_query.setdefault(hit['qid'], []).append(hit)

    n = 0
    for query_id in sorted(hits_by_query):
        rec = SeqRecord(Seq(str(query_dic[query_id].seq)),
                        id=query_id,
                        name=query_id,
                        description='',
                        annotations={"molecule_type": "DNA"})

        for hit in hits_by_query[query_id]:
            feature_type = 'CDS' if test_cds(hit) == 1 else 'misc_feature'
            # qstart is 1-based; Biopython locations are 0-based half-open.
            feature = SeqFeature(FeatureLocation(hit['qstart'] - 1,
                                                 hit['qend'],
                                                 strand=hit['strand']),
                                 type=feature_type,
                                 qualifiers={})
            pairs = [('product', hit['tid'].split('::')[0]),
                     ('note', description(hit))]
            if 'qprot' in hit:
                pairs.append(('translation', hit['qprot']))
            feature.qualifiers = OrderedDict(pairs)
            rec.features.append(feature)

        rec.features = sorted(rec.features,
                              key=lambda feat: feat.location.start)
        # Prepend the running locus tag so it is the first qualifier written.
        for feature in rec.features:
            feature.qualifiers = OrderedDict([('locus_tag', f'DET_{n + 1}')] + list(feature.qualifiers.items()))
            n += 1

        out_file = os.path.join(out_dir, f'{str(rec.id).replace(".", "-")}.gbk')
        with open(out_file, 'w') as out_f:
            SeqIO.write([rec], out_f, 'genbank')
def divide_genome(genome):
    """Split annotated records into one ``SeqRecord`` per ``gene`` feature.

    Features are walked in order. Every ``gene`` feature starts a new record
    whose sequence is ``feat.extract(chrom.seq)`` and whose id and name are
    the gene's first ``locus_tag``. Each following non-gene feature of the
    same parent record is attached to that gene record until the next gene
    starts. Non-gene features that appear before the first gene of a parent
    record have no gene to attach to and are skipped.

    Args:
        genome: Iterable of record objects exposing ``features``, ``seq``,
            ``description`` and ``annotations``.

    Returns:
        List of per-gene ``SeqRecord`` objects in encounter order, including
        the last gene of every parent record.

    Raises:
        KeyError: If a gene has no ``locus_tag`` qualifier or the parent
            record lacks one of the copied annotations.
    """
    copied_annotations = ('topology', 'date', 'taxonomy', 'source',
                          'organism', 'sequence_version',
                          'data_file_division', 'references')
    output_genes = []
    for chrom in genome:
        current = None  # gene record currently being filled for this chrom
        for feat in chrom.features:
            if feat.type == 'gene':
                if current is not None:
                    output_genes.append(current)
                current = SeqRecord(feat.extract(chrom.seq))
                locus_tag = feat.qualifiers['locus_tag'][0]
                current.id = locus_tag
                current.description = chrom.description
                current.name = locus_tag
                current.features = [_feature_at_origin(feat)]
                for key in copied_annotations:
                    current.annotations[key] = chrom.annotations[key]
            elif current is not None:
                current.features.append(_feature_at_origin(feat))
            # else: feature precedes any gene (e.g. a leading "source"
            # feature); previously this crashed on "".features.
        # Flush the final gene of this record; it used to be dropped because
        # records were only emitted when the *next* gene was encountered.
        if current is not None:
            output_genes.append(current)
    return output_genes


def _feature_at_origin(feat):
    """Return a copy of ``feat`` whose location is shifted to start at 0."""
    # NOTE(review): the shift uses the feature's own start (unchanged from the
    # previous code), so child features are not offset relative to their
    # gene - confirm whether gene-relative coordinates were intended.
    location = FeatureLocation(feat.location.start - feat.location.start,
                               feat.location.end - feat.location.start)
    clone = SeqFeature(location=location,
                       type=feat.type,
                       strand=feat.strand,
                       ref=feat.ref,
                       ref_db=feat.ref_db)
    # The qualifier mapping is shared with the source feature, not copied.
    clone.qualifiers = feat.qualifiers
    return clone
def parse(self):
    """Build a single SeqRecord of gene features from a FASTA file.

    Every FASTA entry of ``self._file`` whose description matches the
    ``ref|id|:start-end|description| [gene=..] [locus_tag=..]`` pattern
    becomes one ``gene`` feature. Entries with a non-matching header are
    reported in ``self.errors`` and skipped. The record id and name are
    taken from the first matching entry.

    Returns:
        The populated ``SeqRecord``.
    """
    header_pattern = re.compile(r"ref\|(?P<id>.*?)\|:(?P<start>[0-9]+)-(?P<end>[0-9]+)\|(?P<description>.*?)\|\s*\[gene=(?P<gene>\S+)\]\s*\[locus_tag=(?P<locus_tag>\S+)\]\s*")
    with open(self._file) as handle:
        genbank = SeqRecord(Seq.UnknownSeq(0))
        named = False
        for entry in SeqIO.parse(handle, "fasta"):
            header = entry.description
            match = header_pattern.match(header)
            if match is None:
                self.errors.append("Invalid header: >" + header)
                continue

            fields = match.groupdict()
            if not named:
                # Only the first valid header names the record.
                named = True
                genbank.id = fields["id"]
                genbank.name = fields["id"]

            span = FeatureLocation(int(fields["start"]), int(fields["end"]))
            gene_feature = SeqFeature(span, type="gene")
            gene_feature.qualifiers = {
                "locus_tag": fields["locus_tag"],
                "gene": fields["gene"],
                "note": fields["description"],
                "sequence": entry.seq,
            }
            genbank.features.append(gene_feature)
        return genbank
def get_genome_seqrecord_features(phage_genome):
    """Helper function that builds the SeqRecord feature list for a genome.

    The list starts with a ``source`` feature spanning the whole genome,
    followed by a ``gene`` and a ``CDS`` feature for every cds object, then a
    ``gene`` feature and a default-typed feature for every trna object.

    :param phage_genome: Input a Genome object.
    :type phage_genome: genome
    :returns: features is a list of SeqFeature objects
    """
    whole_genome = FeatureLocation(0, phage_genome.length)
    source_feature = SeqFeature(whole_genome, strand=1, type="source")
    source_feature.qualifiers = OrderedDict(
        [("source", (f"{phage_genome.host_genus} phage "
                     f"{phage_genome.name}"))])

    features = [source_feature]

    # Each object regenerates its ``seqfeature`` on set_seqfeature(), so the
    # attribute has to be read right after every call.
    for cds in phage_genome.cds_features:
        for feature_type in ("gene", "CDS"):
            cds.set_seqfeature(type=feature_type)
            features.append(cds.seqfeature)

    for trna in phage_genome.trna_features:
        trna.set_seqfeature(type="gene")
        features.append(trna.seqfeature)
        trna.set_seqfeature()
        features.append(trna.seqfeature)

    return features
def parse(self):
    """Read ``self._file`` (FASTA) and collect its entries as gene features.

    Headers are expected to look like
    ``ref|<id>|:<start>-<end>|<description>| [gene=<g>] [locus_tag=<t>]``.
    A header that does not match is appended to ``self.errors`` (prefixed
    with ``Invalid header: >``) and the entry is ignored. The first matching
    header supplies the id and name of the returned record.

    Returns:
        A ``SeqRecord`` holding one ``gene`` feature per valid entry.
    """
    pattern = re.compile(
        r"ref\|(?P<id>.*?)\|:(?P<start>[0-9]+)-(?P<end>[0-9]+)\|(?P<description>.*?)\|\s*\[gene=(?P<gene>\S+)\]\s*\[locus_tag=(?P<locus_tag>\S+)\]\s*"
    )
    with open(self._file) as handle:
        genbank = SeqRecord(Seq.UnknownSeq(0))
        awaiting_first = True
        for fasta_record in SeqIO.parse(handle, "fasta"):
            description = fasta_record.description
            hit = pattern.match(description)
            if not hit:
                self.errors.append("Invalid header: >" + description)
                continue

            if awaiting_first:
                awaiting_first = False
                record_id = hit.group("id")
                genbank.id = record_id
                genbank.name = record_id

            start = int(hit.group("start"))
            end = int(hit.group("end"))
            feature = SeqFeature(FeatureLocation(start, end), type="gene")
            feature.qualifiers = dict(
                locus_tag=hit.group("locus_tag"),
                gene=hit.group("gene"),
                note=hit.group("description"),
                sequence=fasta_record.seq,
            )
            genbank.features.append(feature)
        return genbank
def make_protein_feature(feature_name, feature_start, feature_end, feature_type):
    '''
    Build a sequence feature from a start, end, name and type.

    Start and end are converted with ``int()``. The name is only stored
    (under the ``name`` qualifier) when the type is ``"Region"``.
    '''
    span = FeatureLocation(int(feature_start), int(feature_end))
    protein_feature = SeqFeature(span, type=feature_type)
    if feature_type != "Region":
        return protein_feature
    protein_feature.qualifiers = {'name': [feature_name]}
    return protein_feature
def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Create the predicted ClusterBorders.

    Args:
        anchor: Name of the anchor gene, stored in every border's qualifiers.
        clusters: Cluster predictions; the first one is annotated as the best
            prediction, the others as numbered alternatives.
        record: Record whose genes are searched for the border gene names.

    Returns:
        One ``ClusterBorder`` per prediction, in input order (empty when
        ``clusters`` is empty).

    Raises:
        ValueError: If a prediction's first or last gene is not found in
            ``record`` (this previously surfaced as an AttributeError on
            ``None``).
    """
    if not clusters:
        return []
    borders = []
    for i, cluster in enumerate(clusters):
        # cluster borders returned by hmmdetect are based on CDS features
        # in contrast, cluster borders returned by cassis are based on gene features
        # --> hmmdetect derived clusters have exact locations, like the CDSs have
        # --> cassis derived clusters may have fuzzy locations, like the genes have
        left_name = cluster.start.gene
        right_name = cluster.end.gene
        left = None
        right = None
        for gene in record.get_genes():
            if gene.get_name() == left_name:
                left = gene
            if gene.get_name() == right_name:
                right = gene
            if left and right:
                break
        if left is None or right is None:
            raise ValueError(
                "cluster border gene(s) not found in record: %r, %r"
                % (left_name, right_name))

        new_feature = SeqFeature(FeatureLocation(left.location.start,
                                                 right.location.end),
                                 type="cluster_border")
        new_feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [cluster.start.abundance + cluster.end.abundance],
            "motif_score": ["{:.1e}".format(cluster.start.score + cluster.end.score)],
            "gene_left": [cluster.start.gene],
            "promoter_left": [cluster.start.promoter],
            "abundance_left": [cluster.start.abundance],
            "motif_left": [cluster.start.pairing_string],
            "motif_score_left": ["{:.1e}".format(cluster.start.score)],
            "gene_right": [cluster.end.gene],
            "promoter_right": [cluster.end.promoter],
            "abundance_right": [cluster.end.abundance],
            "motif_right": [cluster.end.pairing_string],
            "motif_score_right": ["{:.1e}".format(cluster.end.score)],
            "genes": [cluster.genes],
            "promoters": [cluster.promoters],
        }
        if i == 0:
            new_feature.qualifiers["note"] = [
                "best prediction (most abundant) for anchor gene {}".format(
                    anchor)
            ]
        else:
            new_feature.qualifiers["note"] = [
                "alternative prediction ({}) for anchor gene {}".format(
                    i, anchor)
            ]
        borders.append(ClusterBorder.from_biopython(new_feature))
    return borders
def _get_feature(self, feature_dict):
    """Rebuild a Biopython SeqFeature from its dictionary form.

    Reads the ``location`` (unpacked as ``FeatureLocation`` arguments),
    ``type``, ``id``, ``strand`` and ``quals`` keys of ``feature_dict``.
    """
    rebuilt = SeqFeature(
        FeatureLocation(*feature_dict['location']),
        feature_dict['type'],
        id=feature_dict['id'],
        strand=feature_dict['strand'],
    )
    rebuilt.qualifiers = feature_dict['quals']
    return rebuilt
def feature_intervals_to_features(
    features: List[FeatureInterval],
    strand: Strand,
    force_strand: bool,
    feature_name: Optional[str] = None,
    locus_tag: Optional[str] = None,
) -> Iterable[SeqFeature]:
    """Convert ``FeatureInterval`` objects into BioPython ``SeqFeature`` objects.

    One ``SeqFeature`` is produced per input interval, for later writing to a
    GenBank file. GenBank does not allow children of one parent to sit on
    different strands, so every produced feature carries the parent strand.
    An interval whose own strand differs triggers a
    ``StrandViolationWarning`` and is either forced onto the parent strand or
    skipped.

    Args:
        features: The feature intervals to convert.
        strand: ``Strand`` of the parent; its ``value`` is used for every
            produced feature.
        force_strand: If ``True`` mismatching intervals are forced to
            ``strand``; if ``False`` they are skipped.
        feature_name: Optional name stored as the ``gene`` qualifier.
        locus_tag: Optional value stored as the ``locus_tag`` qualifier.

    Yields:
        A ``SeqFeature`` for each retained interval.
    """
    for feature in features:
        location = feature._location.to_biopython()

        exported = feature.export_qualifiers()
        feature_qualifiers = {}
        for key, vals in exported.items():
            feature_qualifiers[key] = list(vals)
        if feature_name:
            feature_qualifiers["gene"] = [feature_name]
        if locus_tag:
            feature_qualifiers["locus_tag"] = [locus_tag]

        if location.strand != strand.value:
            warn_str = f"Found strand mismatch between gene and feature on feature {feature}. "
            if force_strand:
                warn_str += "Forcing this transcript to the gene orientation."
            else:
                warn_str += "Skipping this transcript."
            warnings.warn(warn_str, StrandViolationWarning)
            if not force_strand:
                continue

        seq_feature = SeqFeature(
            location,
            type=FeatureIntervalFeatures.FEATURE_INTERVAL.value,
            strand=strand.value)
        seq_feature.qualifiers = feature_qualifiers.copy()
        yield seq_feature
def make_seq_feature(start, end, ftype, quals=None):
    '''
    Create a plus-strand sequence feature from a start, end, and a type.

    Additionally you may include other fields, like note, label, evidence,
    citation, as a dict in ``quals``. The qualifiers are copied before the
    ``source`` entry is set to ``['splicemod']``, so neither the caller's
    dict nor a shared default is modified between calls.
    '''
    # A mutable `{}` default would be shared (and mutated) across calls.
    qualifiers = {} if quals is None else dict(quals)
    qualifiers['source'] = ['splicemod']
    seq_feature = SeqFeature(FeatureLocation(start, end),
                             strand=+1,
                             type=ftype)
    seq_feature.qualifiers = qualifiers
    return seq_feature
def _get_feature(self, feature_dict):
    """Turn our dictionary representation back into a Biopython feature.

    The dictionary must provide ``location`` (positional arguments for
    ``FeatureLocation``), ``type``, ``id``, ``strand`` and ``quals``.
    """
    span = FeatureLocation(*feature_dict["location"])
    feature_type = feature_dict["type"]
    feature = SeqFeature(span,
                         feature_type,
                         id=feature_dict["id"],
                         strand=feature_dict["strand"])
    feature.qualifiers = feature_dict["quals"]
    return feature
def _make_fake_feature(start, end, probability=None, pfam_id=None, type_=None):
    """Build a SeqFeature with optional ClusterFinder/PFAM qualifiers.

    The feature always has ``note`` and ``db_xref`` list qualifiers; an entry
    is added to them only when ``probability`` / ``pfam_id`` is given. The
    feature type is set only when ``type_`` is given.
    """
    notes = []
    xrefs = []
    if probability is not None:
        notes.append('ClusterFinder probability: %02.4f' % probability)
    if pfam_id is not None:
        xrefs.append('PFAM: %s' % pfam_id)

    fake = SeqFeature(FeatureLocation(start, end))
    fake.qualifiers = {'note': notes, 'db_xref': xrefs}
    if type_ is not None:
        fake.type = type_
    return fake
def check_sub(feature, sequence) -> List[SeqFeature]:
    """ Recursively checks a GFF feature for any subfeatures and generates any
        appropriate SeqFeature instances from them.

        Args:
            feature: GFF feature exposing ``sub_features``.
            sequence: Record whose ``seq`` is used to compute translations.

        Returns:
            The CDS features built from the deepest level that has CDS
            subfeatures; an empty list when there is nothing to build.
    """
    new_features = []
    locations = []  # type: List[FeatureLocation]
    trans_locations = []  # type: List[FeatureLocation]
    qualifiers = {}  # type: Dict[str, List[str]]
    mismatching_qualifiers = set()  # type: Set[str]
    for sub in feature.sub_features:
        if sub.sub_features:  # If there are sub_features, go deeper
            new_features.extend(check_sub(sub, sequence))
        elif sub.type == 'CDS':
            sub_mismatch = generate_details_from_subfeature(
                sub, qualifiers, locations, trans_locations)
            mismatching_qualifiers.update(sub_mismatch)

    for qualifier in mismatching_qualifiers:
        del qualifiers[qualifier]
    if 'Parent' in qualifiers:
        del qualifiers['Parent']

    # if nothing to work on
    if not new_features and not locations:
        return []

    # Only works in tip of the tree, when there's no new_feature built yet. If there is,
    # it means the script just came out of a check_sub and it's ready to return.
    if not new_features:
        new_loc = locations[0]
        # construct a compound location if required
        if len(locations) > 1:
            locations = sorted(locations, key=lambda x: x.start.real)
            trans_locations = sorted(trans_locations,
                                     key=lambda x: x.start.real)
            if locations[0].strand == 1:
                new_loc = CompoundLocation(locations)
            else:
                new_loc = CompoundLocation(list(reversed(locations)))
                trans_locations = list(reversed(trans_locations))
        # TODO: use new secmet features
        new_feature = SeqFeature(new_loc)
        new_feature.qualifiers = qualifiers
        new_feature.type = 'CDS'
        # Use str() rather than the private Seq._data attribute: _data holds
        # bytes in current Biopython releases, which breaks ''.join().
        trans = ''.join([
            str(n.extract(sequence.seq).translate(stop_symbol=''))
            for n in trans_locations
        ])
        new_feature.qualifiers['translation'] = [str(trans)]
        new_features.append(new_feature)
    return new_features
def _add_gff_line(self, rec, gff_parts, parents, children):
    """Add details from a GFF line to the given SeqRecord.

    Args:
        rec: Record being populated; its ``id`` must equal column 1.
        gff_parts: The nine GFF columns; ``'.'`` marks an empty column.
        parents: Mapping of record id to top-level features (appended to).
        children: Mapping of parent id to child features (appended to).

    Returns:
        The tuple ``(rec, parents, children)``.

    A line with both start and end becomes a feature: it goes to
    ``rec.features`` when it has no ID, to ``children`` when it names a
    Parent, otherwise to ``parents``. A line without a location only
    contributes its qualifiers to ``rec.annotations``.
    """
    gff_parts = [(None if p == '.' else p) for p in gff_parts]
    assert rec.id == gff_parts[0], "ID mismatch: %s %s" % (rec.id,
                                                           gff_parts[0])
    # collect all of the base qualifiers for this item
    quals = collections.defaultdict(list)
    if gff_parts[1]:
        quals["source"].append(gff_parts[1])
    if gff_parts[5]:
        quals["score"].append(gff_parts[5])
    if gff_parts[7]:
        quals["phase"].append(gff_parts[7])
    # An empty attribute column ('.') and empty fragments (e.g. after a
    # trailing ';') are tolerated; only the first '=' separates key and value.
    for attribute in (gff_parts[8] or '').split(';'):
        if not attribute.strip():
            continue
        key, val = attribute.split('=', 1)
        quals[key].extend(val.split(','))
    quals = dict(quals)
    # if we are describing a location, then we are a feature
    if gff_parts[3] and gff_parts[4]:
        # GFF start is 1-based inclusive; FeatureLocation is 0-based.
        location = FeatureLocation(int(gff_parts[3]) - 1, int(gff_parts[4]))
        new_feature = SeqFeature(location,
                                 gff_parts[2],
                                 id=quals.get('ID', [''])[0],
                                 strand=self._strand_map[gff_parts[6]])
        new_feature.qualifiers = quals
        # Handle flat features
        if not new_feature.id:
            rec.features.append(new_feature)
        # features that have parents need to link so we can pick up
        # the relationship
        elif 'Parent' in new_feature.qualifiers:
            for parent in new_feature.qualifiers['Parent']:
                children[parent].append(new_feature)
        # top level features
        else:
            parents[rec.id].append(new_feature)
    # otherwise, associate these annotations with the full record
    else:
        # add these as a list of annotations, checking not to overwrite
        # current values
        for key, vals in quals.items():
            if key in rec.annotations:
                try:
                    rec.annotations[key].extend(vals)
                except AttributeError:
                    # existing annotation is a scalar: promote it to a list
                    rec.annotations[key] = [rec.annotations[key]] + vals
            else:
                rec.annotations[key] = vals
    return rec, parents, children
def attach_features(predictions, seqrecord):
    """Append a CDS feature to ``seqrecord`` for every confident prediction.

    Only predictions listed under ``seqrecord.id`` with a ``raw_score`` of at
    least 1.0 are used; each gets a ``locus_tag`` qualifier holding its
    ``cds_id``.
    """
    for candidate in predictions[seqrecord.id]:
        if not candidate.raw_score >= 1.0:
            continue
        tags = {'locus_tag': [candidate.cds_id]}
        cds = SeqFeature(
            location=candidate.location,
            type='CDS',
            strand=candidate.strand,
            qualifiers=tags,
        )
        cds.qualifiers = tags
        seqrecord.features.append(cds)
def _add_gff_line(self, rec, gff_parts, parents, children):
    """Add details from a GFF line to the given SeqRecord.

    Args:
        rec: Record being populated; its ``id`` must equal column 1.
        gff_parts: The nine GFF columns; ``'.'`` marks an empty column.
        parents: Mapping of record id to top-level features (appended to).
        children: Mapping of parent id to child features (appended to).

    Returns:
        The tuple ``(rec, parents, children)``.

    Lines with a start and an end are stored as features (flat, child or
    top-level depending on their ID/Parent qualifiers); all other lines are
    merged into ``rec.annotations``.
    """
    gff_parts = [(None if p == '.' else p) for p in gff_parts]
    assert rec.id == gff_parts[0], "ID mismatch: %s %s" % (rec.id,
                                                           gff_parts[0])
    # collect all of the base qualifiers for this item
    quals = collections.defaultdict(list)
    if gff_parts[1]:
        quals["source"].append(gff_parts[1])
    if gff_parts[5]:
        quals["score"].append(gff_parts[5])
    if gff_parts[7]:
        quals["phase"].append(gff_parts[7])
    # Tolerate an empty attribute column ('.') and empty fragments left by a
    # trailing ';'. Split on the first '=' only so values may contain '='.
    for attribute in (gff_parts[8] or '').split(';'):
        if not attribute.strip():
            continue
        key, val = attribute.split('=', 1)
        quals[key].extend(val.split(','))
    quals = dict(quals)
    # if we are describing a location, then we are a feature
    if gff_parts[3] and gff_parts[4]:
        # convert the 1-based GFF start to a 0-based location start
        location = FeatureLocation(int(gff_parts[3]) - 1, int(gff_parts[4]))
        new_feature = SeqFeature(location,
                                 gff_parts[2],
                                 id=quals.get('ID', [''])[0],
                                 strand=self._strand_map[gff_parts[6]])
        new_feature.qualifiers = quals
        # Handle flat features
        if not new_feature.id:
            rec.features.append(new_feature)
        # features that have parents need to link so we can pick up
        # the relationship (`in` replaces the Python 2 only has_key())
        elif 'Parent' in new_feature.qualifiers:
            for parent in new_feature.qualifiers['Parent']:
                children[parent].append(new_feature)
        # top level features
        else:
            parents[rec.id].append(new_feature)
    # otherwise, associate these annotations with the full record
    else:
        # add these as a list of annotations, checking not to overwrite
        # current values; iterate items(), not the bare dict (keys only)
        for key, vals in quals.items():
            if key in rec.annotations:
                try:
                    rec.annotations[key].extend(vals)
                except AttributeError:
                    rec.annotations[key] = [rec.annotations[key]] + vals
            else:
                rec.annotations[key] = vals
    return rec, parents, children
def nrpsSmash(dnaSeq):
    """Run ``specific_analysis`` on a DNA string wrapped as one CDS record.

    The whole sequence is annotated as a single plus-strand CDS. After the
    analysis the folder named by ``options.raw_predictions_outputfolder`` is
    removed and the analysis result is returned.
    """
    whole_cds = SeqFeature(FeatureLocation(0, len(dnaSeq)),
                           type="CDS",
                           strand=1)
    whole_cds.qualifiers = {'gene': ['gene']}

    record = SeqRecord(Seq(dnaSeq, IUPAC.unambiguous_dna),
                       id="seq_id",
                       name="seq_name",
                       description="seq_description")
    record.features = [whole_cds]

    options = Namespace()
    options.outputfoldername = "/tmp/nrpspks_predictions_txt"
    # used in NRPSPredictor2.nrpscodepred, check later what to set it to
    options.record_idx = ""
    options.eukaryotic = 0

    result = specific_analysis(record, options)
    # NOTE(review): raw_predictions_outputfolder is not set in this function;
    # presumably specific_analysis assigns it - confirm.
    shutil.rmtree(options.raw_predictions_outputfolder)
    return result
def check_sub(feature, sequence):
    """Recursively build CDS SeqFeatures from the subfeatures of ``feature``.

    Subfeatures that themselves have subfeatures are descended into. Leaf
    ``CDS`` subfeatures at this level are merged into one (possibly compound)
    CDS feature, but only when the recursion produced nothing. The merged
    feature keeps a qualifier only if all CDS parts agree on its value, and
    never keeps ``Parent``; a ``translation`` qualifier is computed from
    ``sequence.seq``.
    """
    collected = []
    cds_parts = []
    shared_quals = {}
    conflicting = []

    for child in feature.sub_features:
        if child.sub_features:  # nested level: let the recursion handle it
            collected.extend(check_sub(child, sequence))
            continue
        if child.type != 'CDS':
            continue
        begin = child.location.start.real
        finish = child.location.end.real
        cds_parts.append(FeatureLocation(begin, finish, strand=child.strand))
        # For split CDSs the merged feature only inherits qualifiers whose
        # key and value are identical on every part.
        for name in child.qualifiers.keys():
            value = child.qualifiers[name]
            if not shared_quals.setdefault(name, value) == value:
                conflicting.append(name)

    for name in conflicting:
        shared_quals.pop(name, None)
    shared_quals.pop('Parent', None)

    # Features returned by the recursion take precedence over this level.
    if collected:
        return collected
    if len(cds_parts) == 0:
        return collected

    if len(cds_parts) > 1:
        cds_parts = sorted(cds_parts, key=lambda part: part.start.real)
        if cds_parts[0].strand == 1:
            merged_location = CompoundLocation(cds_parts)
        else:
            merged_location = CompoundLocation(list(reversed(cds_parts)))
    else:
        merged_location = cds_parts[0]

    merged = SeqFeature(merged_location)
    merged.qualifiers = shared_quals
    merged.type = 'CDS'
    protein = merged.extract(sequence.seq).translate(stop_symbol='')
    merged.qualifiers['translation'] = [str(protein)]
    collected.append(merged)
    return collected
def nrpsSmash(dnaSeq):
    """Wrap ``dnaSeq`` in a one-CDS record and run ``specific_analysis``.

    Returns the analysis result after deleting the folder named by
    ``options.raw_predictions_outputfolder``.
    """
    settings = Namespace()
    settings.outputfoldername = "/tmp/nrpspks_predictions_txt"
    # used in NRPSPredictor2.nrpscodepred, check later what to set it to
    settings.record_idx = ""
    settings.eukaryotic = 0

    # A single plus-strand CDS covering the entire input sequence.
    full_length = FeatureLocation(0, len(dnaSeq))
    cds_feature = SeqFeature(full_length, type="CDS", strand=1)
    cds_feature.qualifiers = {'gene': ['gene']}

    wrapped = SeqRecord(Seq(dnaSeq, IUPAC.unambiguous_dna),
                        id="sauce",
                        name="bbqSauce",
                        description="wtfDude")
    wrapped.features = [cds_feature]

    outcome = specific_analysis(wrapped, settings)
    # NOTE(review): this attribute is not assigned here; presumably
    # specific_analysis sets it - confirm.
    shutil.rmtree(settings.raw_predictions_outputfolder)
    return outcome
def parse_smart_domains(file_name):
    """Parse ``KEY=VALUE`` domain blocks of a text file into Region features.

    A block is a run of non-blank lines closed by a blank (or one-character)
    line. The ``DOMAIN``, ``START``, ``END``, ``TYPE`` and ``STATUS`` keys
    are read. When a block ends, a ``Region`` feature is created if the last
    line of the block was a ``KEY=VALUE`` pair, the block's ``TYPE`` was not
    ``PFAM`` and a ``STATUS`` line was seen. Domains named
    ``low_complexity_region`` are dropped.

    Args:
        file_name: Path of the text file to read.

    Returns:
        List of ``SeqFeature`` objects, each with a ``region_name``
        qualifier.
    """
    # Default text mode already applies universal newlines; the old "rU"
    # mode raises ValueError on Python >= 3.11. `with` also guarantees the
    # handle is closed if parsing fails.
    with open(file_name) as in_handle:
        lines = in_handle.readlines()

    domains = []
    domain_status = False
    domain_type = False
    is_domain = False
    for line in lines:
        line = line.rstrip()
        if len(line) > 1:
            pairs = line.split("=")
            is_domain = True
            if len(pairs) == 2:
                if pairs[0] == "DOMAIN":
                    domain_name = pairs[1]
                elif pairs[0] == "START":
                    domain_start = int(pairs[1])
                elif pairs[0] == "END":
                    domain_end = int(pairs[1])
                elif pairs[0] == "TYPE":
                    domain_type = pairs[1] != "PFAM"
                elif pairs[0] == "STATUS":
                    # Every status is accepted, not only "visible|OK"
                    # (the stricter check was disabled in the original).
                    domain_status = True
            else:
                is_domain = False
        else:
            # A blank line closes the current block.
            if is_domain and domain_type and domain_status:
                d = SeqFeature(FeatureLocation(domain_start, domain_end),
                               type="Region")
                d.qualifiers = {'region_name': [domain_name]}
                if domain_name != 'low_complexity_region':
                    domains.append(d)
            is_domain = False
            domain_type = False
            domain_status = False
    return domains
def store_promoters(promoters: Iterable[Promoter], record: Record) -> None:
    """Add a ``promoter`` feature to ``record`` for every given promoter.

    Each feature carries the promoter's gene names as ``locus_tag`` and its
    sequence as ``seq``; a ``CombinedPromoter`` is additionally noted as
    bidirectional. Features are flagged as created by antiSMASH.
    """
    for promoter in promoters:
        # remember to account for 0-indexed start location
        begin = max(0, promoter.start - 1)
        bio_feature = SeqFeature(FeatureLocation(begin, promoter.end),
                                 type="promoter")

        qualifiers = {
            # already a list with one or two elements
            "locus_tag": promoter.get_gene_names(),
            "seq": [str(promoter.seq)],
        }
        if isinstance(promoter, CombinedPromoter):
            qualifiers["note"] = ["bidirectional promoter"]
        bio_feature.qualifiers = qualifiers

        converted = Feature.from_biopython(bio_feature)
        converted.created_by_antismash = True
        record.add_feature(converted)
def add_cds_feature(
    transcript: TranscriptInterval,
    transcript_qualifiers: Dict[Hashable, List[Hashable]],
    strand: Strand,
    translation_table: TranslationTable,
    update_translations: bool,
) -> SeqFeature:
    """
    Converts a :class:`~biocantor.gene.transcript.TranscriptInterval` that has a CDS to a
    :class:`Bio.SeqFeature.SeqFeature` that represents the spliced CDS interval.

    Args:
        transcript: A :class:`~biocantor.gene.transcript.TranscriptInterval`.
        transcript_qualifiers: Qualifiers dictionary from the transcript level feature. It is copied
            (shallowly), so adding the translation does not modify the caller's dictionary.
        strand: ``Strand`` that this transcript lives on.
        translation_table: Translation table to use.
        update_translations: Should the /translation tag be calculated or re-calculated?
            This is a time consuming process.

    Returns:
        ``SeqFeature`` for the CDS of this transcript.
    """
    location = transcript.cds._location.to_biopython()
    feature = SeqFeature(location,
                         type=GeneIntervalFeatures.CDS.value,
                         strand=strand.value)
    # Copy, as the sibling converters do: writing "translation" into the
    # caller's dict would leak it into any other user of that dict.
    feature.qualifiers = transcript_qualifiers.copy()
    if update_translations:
        # if the sequence has N's, we cannot translate
        try:
            feature.qualifiers["translation"] = [
                str(
                    transcript.get_protein_sequence(
                        translation_table=translation_table))
            ]
        except ValueError:
            pass
    return feature
def parse_prodigal(infile, start_nr=1, name_base='prodigal_', prefix=''):
    """Parse a Prodigal coordinate file into CDS features per scaffold.

    The text is split on ``'\\n#'`` and consumed in (header, data) pairs. The
    scaffold name is the first space-separated word after ``seqhdr=`` in the
    header. The first data line is skipped; every other data line is read as
    ``>nr_start_end_strand``. Genes are named ``<prefix><name_base><counter>``
    with a counter that starts at ``start_nr``.

    Returns:
        Tuple ``(feature_dict, start_nr, all_names)``: features per scaffold
        name, the next unused counter value, and how often each gene name was
        assigned.
    """
    with open(infile) as handle:
        text = handle.read()

    chunks = text.split('\n#')
    feature_dict = {}
    all_names = {}
    for idx in range(0, len(chunks), 2):
        header = chunks[idx]
        data = chunks[idx + 1]

        scaffold_name = header[header.index('seqhdr=') + 7:].strip('"')
        if ' ' in scaffold_name:
            scaffold_name = scaffold_name.split(' ')[0]

        data_lines = data.split('\n')
        if '' in data_lines:
            data_lines.remove('')

        scaffold_features = []
        for line in data_lines[1:]:
            fields = line[1:].split('_')
            start = int(fields[1]) - 1  # To adjust to pythonic index
            end = int(fields[2])
            strand = 1 if fields[3] == '+' else -1

            gene_name = '%s%s%i' % (prefix, name_base, start_nr)
            all_names[gene_name] = all_names.get(gene_name, 0) + 1
            start_nr += 1

            # Convert to SeqFeature
            cds = SeqFeature(FeatureLocation(start, end, strand),
                             type='CDS',
                             strand=strand)
            cds.qualifiers = {'locus_tag': [gene_name]}
            scaffold_features.append(cds)
        feature_dict[scaffold_name] = scaffold_features
    return (feature_dict, start_nr, all_names)
def merge_cds(LocList, TransLocList, QualList, sequence):
    """Merge CDS part locations into one CDS feature with its translation.

    Args:
        LocList: Locations of the CDS parts.
        TransLocList: Locations translated part by part to build the
            ``translation`` qualifier.
        QualList: Qualifier dict used for the new feature. It is not copied,
            so the ``translation`` entry is added to the caller's dict.
        sequence: Record whose ``seq`` provides the nucleotides.

    Returns:
        The merged ``SeqFeature`` of type ``CDS``, or ``None`` when
        ``LocList`` is empty.
    """
    if len(LocList) == 0:
        return None
    if len(LocList) > 1:
        LocList = sorted(LocList, key=lambda x: x.start.real)
        TransLocList = sorted(TransLocList, key=lambda x: x.start.real)
        if LocList[0].strand == 1:
            newLoc = CompoundLocation(LocList)
        else:
            # Reverse-strand parts are listed (and translated) last to first.
            newLoc = CompoundLocation(list(reversed(LocList)))
            TransLocList = list(reversed(TransLocList))
    else:
        newLoc = LocList[0]
    cur_feature = SeqFeature(newLoc)
    cur_feature.qualifiers = QualList
    cur_feature.type = 'CDS'
    # str() instead of the private Seq._data attribute: _data holds bytes in
    # current Biopython releases, which makes ''.join() fail.
    trans = ''.join([
        str(n.extract(sequence.seq).translate(stop_symbol=''))
        for n in TransLocList
    ])
    cur_feature.qualifiers['translation'] = [trans]
    return cur_feature
def pred_coil(seqr, params, fScore=None):
    '''Return a copy of ``seqr`` annotated with per-residue coiled coil predictions.

    A window of ``params.win`` residues slides over the sequence. For each
    window the weighted geometric mean of the per-position scores is
    computed, and every covered residue keeps the best window score together
    with the heptad position ('a'-'g') it had in that window. One
    ``pscoils`` feature per residue is appended to the copy.

    Args:
        seqr: Record exposing ``seq`` and ``features``; it is deep-copied and
            not modified.
        params: Object providing ``win``, ``pow`` (seven weights), ``mat`` and
            whatever ``coilProb`` needs.
        fScore: Scoring callable ``fScore(mat, seq, index, heptad_pos)``;
            defaults to ``seqScore``.

    Returns:
        The annotated deep copy of ``seqr``.
    '''
    seqr = copy.deepcopy(seqr)
    if fScore is None:  # identity test; `== None` can be hijacked by __eq__
        fScore = seqScore
    seq = seqr.seq
    seqLen = len(seqr.seq)
    hept_pos = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    score = [0.0] * seqLen
    hept_seq = ['x'] * seqLen  # 'x' = residue not covered by any window

    for i in range(seqLen - params.win + 1):
        window = min(params.win, seqLen - i)
        this_score = 1.0
        actual_win = 0.0
        for j in range(window):
            pos = j % 7
            actual_win += params.pow[pos]
            this_score *= math.pow(fScore(params.mat, seq, i + j, pos),
                                   params.pow[pos])
        # Normalise by the total weight to get the geometric mean.
        if actual_win > 0:
            this_score = math.pow(this_score, 1 / actual_win)
        else:
            this_score = 0.0
        # Each residue keeps the best score of any window covering it.
        for j in range(window):
            pos = j % 7
            if this_score > score[i + j]:
                score[i + j] = this_score
                hept_seq[i + j] = hept_pos[pos]

    for i in range(seqLen):
        gg, gcc, prob = coilProb(score[i], params)
        # NOTE(review): FeatureLocation(i, i) is zero-length - confirm that
        # (i, i + 1) was not intended.
        seqf = SeqFeature(location=FeatureLocation(i, i), type="pscoils")
        seqf.qualifiers = {
            'gg': gg,
            'gcc': gcc,
            'prob': prob,
            'score': score[i],
            'hept_seq': hept_seq[i]
        }
        seqr.features.append(seqf)
    return seqr
def gene2features(r, gene, gene2position, gene2product, start, end, gcode, partialyes, verbose):
    """Append the gene, mRNA and CDS features of one gene model to record ``r``.

    ``gene2position[gene]`` supplies ``(contig, CDSs, gffstrand, function,
    frames)``. For a reverse-strand gene the ``CDSs`` list is reversed in
    place. A ``gene`` feature is only added when the name ends with ``.t1``;
    an ``mRNA`` feature is always added. The CDS is translated with ``gcode``
    and is not added when the translation contains an internal stop codon, or
    when it is partial (no leading ``M`` and/or no trailing ``*``) and
    ``partialyes`` is false. Warnings go to stderr when ``verbose`` is set.

    Returns:
        The record ``r`` (modified in place).
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        # in-place: this also reverses the list stored in gene2position
        CDSs.reverse()
    '''#add stop codon if not partial seq if strand==1 and CDSs[-1][1]+3 <= len(r.seq): CDSs[-1][1] += 3 elif strand==-1 and CDSs[0][0]-3 > 0: CDSs[0][0] -= 3'''
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene  #".".join(gene.split('.')[:-1])
    #get product
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]
    # the gene-level feature is only emitted for names ending in '.t1'
    if gene.endswith('.t1'):
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1), AfterPosition(end)), strand=strand, type='gene', id=geneid)
        sf.qualifiers = { "locus_tag": geneid, "gene": geneid, "product": product }
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = { "locus_tag": geneid, "gene": geneid, "product": product }  #"protein_id": gene
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    #solve non-triplets issue
    # start/end are only used below to rebuild the location of partial CDSs
    if len(seq) % 3:
        if strand == 1:
            end -= len(seq) % 3
        else:
            start += len(seq) % 3
    ##check for partial sequence - no M as first or no * as last aa
    # NOTE(review): aa[0] / aa[-1] raise IndexError if the translation is
    # empty - confirm callers never pass an empty CDS.
    partial = 0
    #both ends partial
    if aa[0] != "M" and aa[-1] != "*":
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), AfterPosition(end))
    #left end partial
    elif aa[0] != "M" and strand == 1 or aa[-1] != "*" and strand == -1:
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), end)
    #right end partial
    elif aa[-1] != "*" and strand == 1 or aa[0] != "M" and strand == -1:
        partial = 1
        sf.location = FeatureLocation(start - 1, AfterPosition(end))
    #strip stop codon
    aa = aa.strip("*")
    #replace internal stop codons by X
    # (currently such genes are skipped instead; the mRNA feature, and the
    # gene feature if any, were already appended above)
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r
        #aa = aa.replace("*","X")
    sf.qualifiers = { 'transl_table': gcode, "locus_tag": geneid, "gene": geneid, "product": product, "translation": aa }  #"protein_id": gene,
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if not partial are allowed
        if not partialyes:
            return r
        if aa[0] != "M":
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene, ))
            #sys.stderr.write("[Warning] Partial sequence: %s %s\n" % (gene,sf))
    #add to features
    r.features.append(sf)
    return r
def gene_to_feature(
    gene_or_feature: Union[GeneInterval, FeatureIntervalCollection],
    genbank_type: GenbankFlavor,
    force_strand: bool,
    translation_table: TranslationTable,
    update_translations: bool,
) -> Iterable[SeqFeature]:
    """Converts either a :class:`~biocantor.gene.collections.GeneInterval` or a
    :class:`~biocantor.gene.collections.FeatureIntervalCollection` to a :class:`Bio.SeqFeature.SeqFeature`.

    :class:`Bio.SeqFeature.SeqFeature` are BioPython objects that will then be used to write to a GenBank file.
    There is one :class:`Bio.SeqFeature.SeqFeature` for every feature, or row group, in the output file.
    There will be one contiguous interval at the Gene level.

    While :class:`~biocantor.gene.collections.GeneInterval` always has its interval on the plus strand,
    GenBank files assume that a Gene has an explicit strand. Therefore, this function picks the most common
    strand and forces it on all of its children.

    Args:
        gene_or_feature: A :class:`~biocantor.gene.collections.GeneInterval` or
            :class:`~biocantor.gene.collections.FeatureIntervalCollection`.
        genbank_type: Are we writing an prokaryotic or eukaryotic style GenBank file?
        force_strand: Boolean flag; if ``True``, then strand on children is forced, if ``False``, then improper
            strands are instead skipped.
        translation_table: Translation table to use.
        update_translations: Should the /translation tag be calculated or re-calculated?
            This is a time consuming process.

    Yields:
        ``SeqFeature``s, one for the gene, one for each child transcript, and one for each transcript's CDS
        if it exists.
    """
    location = gene_or_feature._location.to_biopython()
    # update the strand by picking the most common
    strands = [child.strand for child in gene_or_feature]
    strand = max(strands, key=strands.count)

    qualifiers = {
        key: list(vals)
        for key, vals in gene_or_feature.export_qualifiers().items()
    }

    # do our best to ensure there is a /gene tag
    symbol = None
    if isinstance(gene_or_feature, GeneInterval):
        if gene_or_feature.gene_symbol:
            symbol = gene_or_feature.gene_symbol
        elif gene_or_feature.gene_id:
            symbol = gene_or_feature.gene_id
        feature_type = GeneFeatures.GENE.value
    else:
        if gene_or_feature.feature_collection_name:
            symbol = gene_or_feature.feature_collection_name
        elif gene_or_feature.feature_collection_id:
            symbol = gene_or_feature.feature_collection_id
        feature_type = FeatureCollectionFeatures.FEATURE_COLLECTION.value

    if symbol:
        qualifiers[feature_type] = [symbol]
    if gene_or_feature.locus_tag:
        # Wrapped in a list like every other qualifier value here and in the
        # child features; a bare string breaks consumers that index [0].
        qualifiers[KnownQualifiers.LOCUS_TAG.value] = [
            gene_or_feature.locus_tag
        ]

    feature = SeqFeature(location, type=feature_type, strand=strand.value)
    feature.qualifiers = qualifiers
    yield feature

    if isinstance(gene_or_feature, GeneInterval):
        yield from transcripts_to_feature(
            gene_or_feature.transcripts,
            strand,
            genbank_type,
            force_strand,
            translation_table,
            symbol,
            gene_or_feature.locus_tag,
            update_translations,
        )
    else:
        yield from feature_intervals_to_features(
            gene_or_feature.feature_intervals, strand, force_strand, symbol,
            gene_or_feature.locus_tag)
def transcripts_to_feature(
    transcripts: List[TranscriptInterval],
    strand: Strand,
    genbank_type: GenbankFlavor,
    force_strand: bool,
    translation_table: TranslationTable,
    gene_symbol: Optional[str] = None,
    locus_tag: Optional[str] = None,
    update_translations: bool = False,
) -> Iterable[SeqFeature]:
    """Turn each :class:`~biocantor.gene.transcripts.TranscriptInterval` into GenBank-ready
    :class:`Bio.SeqFeature.SeqFeature` objects.

    Each transcript becomes one joined-interval feature describing its exonic structure. GenBank does not
    allow the transcripts of a gene to disagree on strand, so a transcript whose strand differs from
    ``strand`` triggers a :class:`StrandViolationWarning` and is then either forced onto the gene strand or
    skipped, depending on ``force_strand``.

    In eukaryotic mode a coding transcript produces an mRNA feature followed by a CDS feature, and a
    non-coding transcript produces a biotype feature. In prokaryotic mode a coding transcript produces only
    the CDS feature, and a non-coding transcript produces a biotype feature.

    Args:
        transcripts: A list of :class:`~biocantor.gene.transcript.TranscriptInterval`.
        strand: ``Strand`` that this gene lives on.
        genbank_type: Are we writing a prokaryotic or eukaryotic style GenBank file?
        force_strand: If ``True``, mismatching transcripts are forced onto the gene strand; if ``False``,
            they are skipped.
        translation_table: Translation table to use.
        gene_symbol: An optional gene symbol, written as the /gene qualifier.
        locus_tag: An optional locus tag, written as the /locus_tag qualifier.
        update_translations: Should the /translation tag be calculated or re-calculated?
            This is a time consuming process.

    Yields:
        ``SeqFeature``s, one for each transcript and then one for each CDS of the transcript, if it exists.
    """
    for tx in transcripts:
        bio_location = tx.chunk_relative_location.to_biopython()

        quals = {name: list(values) for name, values in tx.export_qualifiers().items()}
        if gene_symbol is not None:
            quals[KnownQualifiers.GENE.value] = [gene_symbol]
        if locus_tag is not None:
            quals[KnownQualifiers.LOCUS_TAG.value] = [locus_tag]

        if bio_location.strand != strand.value:
            if force_strand:
                outcome = "Forcing this transcript to the gene orientation."
            else:
                outcome = "Skipping this transcript."
            warnings.warn(
                f"Found strand mismatch between gene and transcript on transcript {tx}. " + outcome,
                StrandViolationWarning,
            )
            if not force_strand:
                continue

        declared_type = tx.transcript_type
        if declared_type is not None and TranscriptFeatures.has_value(declared_type.name):
            feat_type = TranscriptFeatures(declared_type.name)
        # biotypes might be wrong, only trust the CDS interval
        elif tx.is_coding:
            feat_type = TranscriptFeatures.CODING_TRANSCRIPT
        else:
            feat_type = TranscriptFeatures.MISC_RNA

        coding = feat_type == TranscriptFeatures.CODING_TRANSCRIPT

        if coding and genbank_type == GenbankFlavor.PROKARYOTIC:
            # coding gene in prokaryotic mode: no transcript-level row, go straight to the CDS
            yield add_cds_feature(tx, quals, strand, translation_table, update_translations)
            continue

        # transcript-level row: mRNA for eukaryotic coding genes, a biotype feature otherwise
        tx_feature = SeqFeature(bio_location, type=feat_type.value, strand=strand.value)
        tx_feature.qualifiers = quals.copy()
        # NCBI does not like protein_id on transcript level features
        tx_feature.qualifiers.pop("protein_id", None)
        yield tx_feature

        # only eukaryotic coding genes get the third (CDS) layer
        if genbank_type == GenbankFlavor.EUKARYOTIC and coding:
            yield add_cds_feature(tx, quals, strand, translation_table, update_translations)
right_remains_end=currentsearch[2] rightremains_list=[rightremainseq,right_remains_start,right_remains_end] remainsearches.append([rightremainseq,right_remains_start,right_remains_end]) featureslist=[] for feat in gbhitslist: #print feat mystrand=feat[3]["frame"][0]*feat[3]["frame"][1] feature = SeqFeature(FeatureLocation(feat[0]-1,feat[1]), strand=mystrand,type=feat[2]) feature.qualifiers=feat[3] featureslist.append(feature) for f in featureslist: gbfile.features.append(f) outfilepath=contig_dir_path #print(outfilepath) tempfh=open(outfilepath,"w") SeqIO.write([gbfile],tempfh,"genbank") #print "treated "+str(contigcounter)+ " contigs" print "treated "+str(genomerowcounter)+ "genome rows"
if r0063 is not None: luxpl = next(get_features('Lux pL promoter')) luxpl.location = r0063.location gb_archive.features.remove(r0063) # add LVA ssrA tag ssra_match = SSRA_TAG.search(gb_archive.seq) if ssra_match is not None: ssra = SeqFeature(type="CDS") ssra.location = FeatureLocation(*ssra_match.span(), strand=1) ssra.qualifiers = { "label": ["ssrA tag (LVA)"], "product": [ "C-terminal peptide that mediates degradation in bacteria through the ClpXP and ClpAP proteases (McGinness et al., 2006)" ], "translation": "AANDENYALVA", "note": [ "mutant LVA variant that confers accelerated degradation under some conditions (Andersen et al., 1998)", "color: #cc99b2", ], } gb_archive.features.append(ssra) # Replace E0040m with well annotated GFP e0040m = next(get_features("E0040m"), None) if e0040m is not None: if any(get_features("GFP")): gb_archive.features.remove(next(get_features("GFP"))) e0040m.qualifiers.update(gfp.qualifiers) # Replace E1010m with well annotated mRFP
def gene2features(r, gene, gene2position, gene2product, start, end, gcode, partialyes, verbose):
    """Append gene, mRNA and CDS features for `gene` to the record `r`.

    Parameters:
      r             record whose `.seq` is translated and whose `.features` is extended
      gene          gene / transcript identifier; a 'gene' feature is only added
                    when it ends with '.t1'
      gene2position dict: gene -> (contig, CDSs, gffstrand, function, frames)
      gene2product  dict: gene -> product name ("hypothetical protein" if missing)
      start, end    gene span; used as FeatureLocation(start-1, end)
                    (presumably 1-based inclusive coordinates - TODO confirm with caller)
      gcode         translation table passed to `translate`
      partialyes    if false, CDSs lacking a start (M) or stop (*) are not added
      verbose       if true, warnings are written to stderr

    Returns `r`. The gene and mRNA features are appended before the CDS is
    validated, so they remain in `r` even when the CDS itself is skipped.

    Note: for minus-strand genes the CDSs list taken from gene2position is
    reversed in place.
    """
    _contig, CDSs, gffstrand, function, _frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        CDSs.reverse()
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene #".".join(gene.split('.')[:-1])
    #get product
    product = gene2product.get(geneid, "hypothetical protein")
    if gene.endswith('.t1'):
        sf = SeqFeature(FeatureLocation(BeforePosition(start-1), AfterPosition(end)),
                        strand=strand, type='gene', id=geneid)
        sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product}
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product} #"protein_id": gene
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    #nothing to annotate if the CDS translates to nothing (or only to a stop);
    #without this guard aa[0] below raises IndexError
    if not aa.strip("*"):
        if verbose:
            sys.stderr.write("[Warning] Empty translation for: %s. Skipped!\n" % gene)
        return r
    #solve non-triplets issue
    remainder = len(seq) % 3
    if remainder:
        if strand == 1:
            end -= remainder
        else:
            start += remainder
    ##check for partial sequence - no M as first or no * as last aa
    nostart = aa[0] != "M"
    nostop = aa[-1] != "*"
    partial = nostart or nostop
    #the replacement locations carry the strand explicitly: a FeatureLocation
    #built without it has strand None, and a minus-strand CDS would be written
    #as if it were on the plus strand
    #both ends partial
    if nostart and nostop:
        sf.location = FeatureLocation(BeforePosition(start-1), AfterPosition(end), strand=strand)
    #left end partial
    elif (nostart and strand == 1) or (nostop and strand == -1):
        sf.location = FeatureLocation(BeforePosition(start-1), end, strand=strand)
    #right end partial
    elif (nostop and strand == 1) or (nostart and strand == -1):
        sf.location = FeatureLocation(start-1, AfterPosition(end), strand=strand)
    #strip stop codon
    aa = aa.strip("*")
    #skip entries with internal stop codons
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r
    sf.qualifiers = {'transl_table': gcode, "locus_tag": geneid, "gene": geneid,
                     "product": product, "translation": aa} #"protein_id": gene,
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if not partial are allowed
        if not partialyes:
            return r
        if aa[0] != "M":
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene,))
    #add to features
    r.features.append(sf)
    return r
def annotate_geneclusters(seq_record, options):
    """Re-annotate gene clusters in the seq_record.

    ClusterFinder clusters (from find_cf_clusters) are merged with the cluster
    features already present on the record:

    * A ClusterFinder cluster overlapping an existing cluster either yields a
      separate "cluster_border" feature (when options.borderpredict is set and
      the existing cluster's midpoint lies inside the ClusterFinder cluster) or,
      without borderpredict, extends the existing cluster to the ClusterFinder
      borders. In both modes the existing cluster gets a 'probability' qualifier.
    * A ClusterFinder cluster overlapping nothing becomes a new "cluster"
      feature, unless options.borderpredict_only is set. Clusters are renumbered
      when at least one new cluster was added.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    # Integrate ClusterFinder clusters with existing cluster features
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)

    for cf_cluster in cf_clusters:
        overlaps = False
        cf_type = "cf_putative"
        for cluster in cluster_features:
            if not utils.features_overlap(cf_cluster, cluster):
                continue
            overlaps = True

            # Get signature genes from antiSMASH-predicted cluster
            features_in_cluster = utils.get_cluster_cds_features(cluster, seq_record)
            cluster_sig_genes = [gene for gene in secmet_cds_features if gene in features_in_cluster]

            # Predict gene cluster borders using ClusterFinder
            if options.borderpredict:
                # Floor division keeps the midpoint an integer: with "/" Python 3
                # yields a float, which FeatureLocation membership tests reject.
                midpoint = (cluster.location.end + cluster.location.start) // 2
                if midpoint in cf_cluster.location:
                    # Make sure that antiSMASH signature genes are still included in the cluster
                    for sig_gene in cluster_sig_genes:
                        startpoint = min([sig_gene.location.start, sig_gene.location.end])
                        endpoint = max([sig_gene.location.start, sig_gene.location.end])
                        if cf_cluster.location.start > startpoint:
                            cf_cluster.location = FeatureLocation(startpoint, cf_cluster.location.end)
                        if cf_cluster.location.end < endpoint:
                            cf_cluster.location = FeatureLocation(cf_cluster.location.start, endpoint)
                    cluster_border = SeqFeature(cf_cluster.location, type="cluster_border")
                    cluster_border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_cluster.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(cluster_border)
            # Without border prediction, grow the existing cluster to the ClusterFinder borders
            elif cf_cluster.location.start < cluster.location.start and cf_cluster.location.end > cluster.location.end:
                cluster.location = cf_cluster.location
            elif cf_cluster.location.start < cluster.location.start:
                cluster.location = FeatureLocation(cf_cluster.location.start, cluster.location.end)
            elif cf_cluster.location.end > cluster.location.end:
                cluster.location = FeatureLocation(cluster.location.start, cf_cluster.location.end)
            cluster.qualifiers['probability'] = ["%01.4f" % cf_cluster.probability]

        if not overlaps and not ('borderpredict_only' in options and options.borderpredict_only):
            # Derive the product type from the "Type: " entries of the sec_met qualifiers
            cf_cluster_CDSs = utils.get_cluster_cds_features(cf_cluster, seq_record)
            for CDS in cf_cluster_CDSs:
                if 'sec_met' in CDS.qualifiers:
                    type_sec_met_qualifiers = [feat for feat in CDS.qualifiers['sec_met'] if "Type: " in feat]
                    for qualifier in type_sec_met_qualifiers:
                        if "cf_fatty_acid" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_fatty_acid"
                            elif cf_type == "cf_saccharide":
                                cf_type = "cf_fatty_acid-saccharide"
                        if "cf_saccharide" in qualifier:
                            if cf_type == "cf_putative":
                                cf_type = "cf_saccharide"
                            elif cf_type == "cf_fatty_acid":
                                cf_type = "cf_fatty_acid-saccharide"
            new_cluster = SeqFeature(cf_cluster.location, type="cluster")
            new_cluster.qualifiers['product'] = [cf_type]
            new_cluster.qualifiers['probability'] = ["%01.4f" % cf_cluster.probability]
            newclusters.append(new_cluster)

    if newclusters:
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)
_NRPS_CONDENSATION_DOMAINS = ("Condensation_LCL", "Condensation_DCL",
                              "Condensation_Starter", "Cglyc",
                              "Condensation_Dual")


def _classify_nrpspks(domainlist, nrKSdomains, modKSscore, traKSscore,
                      eneKSscore, iterKSscore):
    """Return the NRPS/PKS subtype name for one gene from its domain names and KS subtype counts."""
    has_condensation = any(name in domainlist for name in _NRPS_CONDENSATION_DOMAINS)
    has_amp = "AMP-binding" in domainlist
    has_nrps = has_condensation or has_amp
    has_ks = "PKS_KS" in domainlist
    has_at = "PKS_AT" in domainlist
    has_pks = has_ks or has_at

    if "Cglyc" in domainlist and "Epimerization" in domainlist and has_amp and not has_pks:
        return "Glycopeptide NRPS"
    if has_condensation and has_amp and not has_pks:
        return "NRPS"
    # A hybrid needs an NRPS domain AND a PKS domain. The previous test was
    # "condensation or AMP-binding and (KS or AT)", which Python parses as
    # "condensation or (AMP-binding and (KS or AT))" and therefore labelled
    # any protein with a condensation domain as hybrid, PKS domains or not.
    if has_nrps and has_pks:
        return "Hybrid PKS-NRPS"
    if (not has_nrps and has_ks and not has_at and "Trans-AT_docking" in domainlist
            and traKSscore > modKSscore and traKSscore > iterKSscore and traKSscore > eneKSscore):
        return "Type I Trans-AT PKS"
    if not has_nrps and has_ks and has_at:
        if (iterKSscore > modKSscore and iterKSscore > traKSscore
                and iterKSscore > eneKSscore and nrKSdomains < 3):
            return "Type I Iterative PKS"
        if (eneKSscore > modKSscore and eneKSscore > traKSscore
                and eneKSscore > iterKSscore and nrKSdomains < 3):
            return "Type I Enediyne PKS"
        if ((modKSscore > eneKSscore and modKSscore > traKSscore and modKSscore > iterKSscore)
                or nrKSdomains > 3):
            return "Type I Modular PKS"
        return "PKS-like protein"
    if has_nrps and not has_pks:
        return "NRPS-like protein"
    return "PKS/NRPS-like protein"


def name_nrpspks(seq_record, pksnrpsvars, withinclustergenes, options):
    """Classify NRPS/PKS genes and record their subtype, motifs and domains.

    For every feature in withinclustergenes that has a non-empty entry in
    pksnrpsvars.domaindict:
      * the NRPS/PKS subtype is determined and stored both in
        pksnrpsvars.nrpspkstypedict[gene_id] and in the feature's 'sec_met'
        qualifier;
      * one feature of type options.FeatureTags.pksnrpsmotifs_tag is added to
        seq_record for every motif listed in pksnrpsvars.motifdict;
      * one "NRPS/PKS Domain" entry per domain is appended to 'sec_met'.

    pksnrpsvars.nrpspkstypedict is reset at the start of the call.
    """
    pksnrpsvars.nrpspkstypedict = {}
    for feature in withinclustergenes:
        k = utils.get_gene_id(feature)
        if k not in pksnrpsvars.domaindict:
            continue
        #structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.]
        nrpspksdomains = pksnrpsvars.domaindict[k]
        if not nrpspksdomains:
            continue
        domainlist = [domain[0] for domain in nrpspksdomains]
        nrKSdomains = domainlist.count("PKS_KS")

        # Count the predicted KS subtypes of this gene
        kstypes = []
        if k in pksnrpsvars.ksdomaindict:
            kstypes = [ksdomain[0] for ksdomain in pksnrpsvars.ksdomaindict[k]]
        nrpspkstype = _classify_nrpspks(domainlist,
                                        nrKSdomains,
                                        modKSscore=kstypes.count("Modular-KS"),
                                        traKSscore=kstypes.count("Trans-AT-KS"),
                                        eneKSscore=kstypes.count("Enediyne-KS"),
                                        iterKSscore=kstypes.count("Iterative-KS"))
        sec_met = feature.qualifiers.setdefault('sec_met', [])
        sec_met.append("NRPS/PKS subtype: " + nrpspkstype)

        #Write motifs to seq_record
        motifFeatures = []
        if k in pksnrpsvars.motifdict:
            if 'transl_table' in feature.qualifiers:
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            counter = 1
            for motif in pksnrpsvars.motifdict[k]:
                # motif[1] and motif[2] are scaled by 3 to get nucleotide offsets,
                # counted from the feature's start on the forward strand and
                # from its end otherwise
                if feature.location.strand == 1:
                    start = feature.location.start + (3 * motif[1])
                    end = feature.location.start + (3 * motif[2])
                else:
                    end = feature.location.end - (3 * motif[1])
                    start = feature.location.end - (3 * motif[2])
                loc = FeatureLocation(start, end, strand=feature.strand)
                motifFeature = SeqFeature(loc, type=options.FeatureTags.pksnrpsmotifs_tag)
                quals = defaultdict(list)
                quals['label'].append(str(motif[0]))
                if 'locus_tag' in feature.qualifiers:
                    quals['locus_tag'] = feature.qualifiers['locus_tag']
                else:
                    quals['locus_tag'] = [k]
                quals['motif'] = [motif[0]]
                quals['asDomain_id'] = [
                    'nrpspksmotif_' + '_'.join(quals['locus_tag']) + '_' + '{:04d}'.format(counter)
                ]
                counter += 1
                quals['evalue'] = [str("{:.2E}".format(float(motif[3])))]
                quals['score'] = [str(motif[4])]
                quals['aSTool'] = ["pksnrpsmotif"]
                quals['detection'] = ["hmmscan"]
                quals['database'] = ["abmotifs"]
                quals['translation'] = [
                    str(motifFeature.extract(seq_record).seq.translate(table=transl_table))
                ]
                quals['note'].append("NRPS/PKS Motif: " + motif[0] + " (e-value: " +
                                     str(motif[3]) + ", bit-score: " + str(motif[4]) + ")")
                motifFeature.qualifiers = quals
                motifFeatures.append(motifFeature)

        for domain in nrpspksdomains:
            sec_met.append("NRPS/PKS Domain: %s (%s-%s). E-value: %s. Score: %s;" %
                           (domain[0], str(domain[1]), str(domain[2]),
                            str(domain[3]), str(domain[4])))
        seq_record.features.extend(motifFeatures)
        pksnrpsvars.nrpspkstypedict[k] = nrpspkstype
def _annotate(seq_record, options, results):
    """Annotate seq_record with CDS_motifs for the result.

    Every HSP in `results` whose bitscore is above the minimum score and whose
    e-value is below the maximum e-value (both derived from `options`), and
    whose query id maps to a feature of seq_record, is added to
    seq_record.features as a new feature of type
    options.FeatureTags.fullhmmer_tag.
    """
    logging.debug("generating feature objects for PFAM hits")
    min_score = _min_score(options)
    max_evalue = _max_evalue(options)
    feature_by_id = utils.get_feature_dict(seq_record)

    for r in results:
        # running number of the accepted hits of this result, used in asDomain_id
        i = 1
        for hsp in r.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue
            # "in" instead of dict.has_key(), which no longer exists in Python 3
            if hsp.query_id not in feature_by_id:
                continue
            feature = feature_by_id[hsp.query_id]

            start, end = _calculate_start_end(feature, hsp)
            loc = FeatureLocation(start, end, strand=feature.strand)
            newFeature = SeqFeature(location=loc, type=options.FeatureTags.fullhmmer_tag)

            quals = defaultdict(list)
            quals['label'].append(r.id)
            if 'locus_tag' in feature.qualifiers:
                quals['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                quals['locus_tag'] = [hsp.query_id]
            quals['domain'] = [hsp.hit_id]
            quals['asDomain_id'] = ['fullhmmer_'+'_'.join(quals['locus_tag'])+'_'+'{:04d}'.format(i)]
            i += 1
            quals['evalue'] = [str("{:.2E}".format(float(hsp.evalue)))]
            quals['score'] = [str(hsp.bitscore)]
            quals['aSTool'] = ["fullhmmer"]
            quals['detection'] = ["hmmscan"]
            quals['database'] = [path.basename(r.target)]
            if 'transl_table' in feature.qualifiers:
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            quals['translation'] = [str(newFeature.extract(seq_record.seq).translate(table=transl_table))]
            quals['note'].append("%s-Hit: %s. Score: %s. E-value: %s. Domain range: %s..%s." % \
                                 (path.basename(r.target), hsp.hit_id, hsp.bitscore, hsp.evalue,
                                  hsp.hit_start, hsp.hit_end))
            quals['description'] = [hsp.hit_description]

            # the db_xref is only added when the hit name has a known PFAM id
            try:
                pfamid = name_to_pfamid[hsp.hit_id]
            except KeyError:
                pass
            else:
                quals['db_xref'].append("PFAM: %s" % pfamid)

            newFeature.qualifiers = quals
            seq_record.features.append(newFeature)
def _finish_group(group, suffix, number, strand, warn_length):
    """Set the qualifiers and the spanning location of a completed group and return it."""
    names = set(x.qualifiers["description"][0].split(suffix)[0] for x in group.sub_features)
    group.qualifiers = {
        "description": "_".join(sorted(names)),
        "ID": ["ILEXPARARR" + str(number)],
    }
    group.location = FeatureLocation(
        start=min(x.location.start for x in group.sub_features),
        end=max(x.location.end for x in group.sub_features),
        strand=strand)
    assert group.location.start < group.location.end
    # report unusually long groups
    if (group.location.end - group.location.start) > warn_length:
        print(group.qualifiers["ID"])
    return group


def agrupar_sitios(gff_in="/data/organismos/ILEX_PARA2/regulation/ncbi_IP4.gff3.reg",
                   gff_out="/data/organismos/ILEX_PARA2/regulation/grouped.gff",
                   max_gap=1500,
                   warn_length=5000):
    """Group neighbouring regulatory features of each record and write them as GFF.

    For every record parsed from `gff_in`, and separately for each strand, the
    features are sorted by start. A feature joins the current group when its
    start lies less than `max_gap` bases from the group's furthest end, or when
    it overlaps the last feature of the group; otherwise the group is closed
    and a new one is started. Each closed group becomes a
    "grouped_transcription_regulatory_region" feature spanning its members,
    with a sequential "ILEXPARARR<n>" ID. The IDs of groups longer than
    `warn_length` are printed. The grouped records are written to `gff_out`.

    The defaults reproduce the paths and thresholds that used to be hard-coded.
    """
    group_type = "grouped_transcription_regulatory_region"
    regiones = list(GFF.parse(gff_in))
    ids = 1
    groups = {}
    for c in tqdm(regiones):
        groups[c.id] = []
        for strand in [1, -1]:
            fs = sorted((f for f in c.features if f.strand == strand),
                        key=lambda x: x.location.start)
            if not fs:
                continue
            # NOTE(review): the id passed here is a feature object, not a string;
            # kept as it was because the written ID comes from the "ID" qualifier.
            group = SeqFeature(id=c.features[0], type=group_type,
                               location=c.features[0].location)
            group.sub_features = [fs[0]]
            for f in fs[1:]:
                end = max(x.location.end for x in group.sub_features)
                last = group.sub_features[-1].location
                # two half-open integer ranges intersect iff the larger start is
                # below the smaller end (same result as intersecting range sets,
                # without materialising every position)
                overlaps_last = max(f.location.start, last.start) < min(f.location.end, last.end)
                if abs(f.location.start - end) < max_gap or overlaps_last:
                    group.sub_features.append(f)
                    continue
                groups[c.id].append(
                    _finish_group(group, " regulatory region", ids, strand, warn_length))
                ids += 1
                group = SeqFeature(id=c.features[0], type=group_type, location=f.location)
                group.sub_features = [f]
            # Close the trailing group. Its strand is the strand being processed:
            # the old code read it from the inner loop variable, which is unbound
            # (or left over from a previous strand/record) when there is only a
            # single feature on this strand.
            # NOTE(review): this trailing group strips " binding site" from the
            # descriptions while groups closed inside the loop strip
            # " regulatory region"; kept as found - confirm which is intended.
            groups[c.id].append(
                _finish_group(group, " binding site", ids, strand, warn_length))
            ids += 1

    records = [
        SeqRecord(id=k, name="", description="", seq=Seq(""), features=v)
        for k, v in groups.items()
    ]
    # the context manager guarantees the output is flushed and closed
    with open(gff_out, "w") as handle:
        GFF.write(tqdm(records), handle)