def create_features_from_contig(
        seqrecord,
        source,
        type_map={x: x for x in NCBI.ftypes},
        extract_annotation_feature=lambda feature: feature):
    """Build ``Feature`` documents for the annotated features of a contig.

    Args:
        seqrecord: Record whose ``features`` attribute is iterated.
        source: Stored on each document's ``source`` attribute when truthy.
        type_map: Mapping from annotation type to the type stored on the
            document. Features whose extracted type is not a key are
            skipped. The default mapping is shared between calls and is
            never mutated here.
        extract_annotation_feature: Callable applied to every feature; the
            object it returns supplies the qualifiers, id and location.

    Returns:
        A ``(features, gene_ids)`` tuple: the list of created documents and
        a dict mapping each document's ``locus_tag`` to its ``_id``.
    """
    features = []
    gene_ids = {}
    for feature in seqrecord.features:
        f = extract_annotation_feature(feature)
        if f.type not in type_map:
            continue
        quals = f.qualifiers

        # Identifier precedence: every qualifier further down this list
        # overrides the previous ones when present.
        fid = quals["description"][0] if "description" in quals else f.id
        for key in ("ID", "product", "gene_id", "gene", "protein_id"):
            if key in quals:
                fid = quals[key][0]
        if "tRNA_anti-codon" in quals:
            fid = fid + " -> " + quals["tRNA_anti-codon"][0]

        # The filter above is on the *extracted* feature's type, while the
        # stored type has always been looked up with the *original*
        # feature's type.  Keep that when possible, but fall back to the
        # extracted type instead of raising KeyError when the original
        # type is not mapped.
        if feature.type in type_map:
            mapped_type = type_map[feature.type]
        else:
            mapped_type = type_map[f.type]

        fdoc = Feature(_id=ObjectId(),
                       identifier=fid,
                       location=Location(start=f.location.start,
                                         end=f.location.end,
                                         strand=f.location.strand),
                       type=mapped_type)

        if "locus_tag" in quals:
            locus_tag = quals["locus_tag"][0]
            fdoc.identifier = locus_tag
            fdoc.locus_tag = locus_tag
            fdoc.alias.append(fdoc.locus_tag)
        else:
            fdoc.locus_tag = fid
        gene_ids[fdoc.locus_tag] = fdoc._id

        if "gene" in quals:
            fdoc.alias.append(quals["gene"][0])
        if "protein_id" in quals:
            fdoc.alias.append(quals["protein_id"][0])
        if "old_locus_tag" in quals:
            fdoc.alias = fdoc.alias + quals["old_locus_tag"]
        if source:
            fdoc.source = source
        features.append(fdoc)
    return (features, gene_ids)
def load_hmm(organism, hmm_file, transform_query_regexp=None,
             transform_hit_regexp=None):
    """Store the HSPs of an ``hmmer3-text`` report as protein domain features.

    For every HSP, the proteins of ``organism`` whose ``alias`` equals the
    (optionally rewritten) query id receive a new feature of type
    ``SO_TERMS["polypeptide_domain"]`` carrying the alignment. An existing
    domain with the same identifier, start and end is removed first.

    Args:
        organism: Value matched against ``Protein.organism``.
        hmm_file: Path to the report; must exist.
        transform_query_regexp: Optional pattern; group 1 of its
            case-insensitive search on the query id replaces the query id.
        transform_hit_regexp: Optional pattern; group 1 of its
            case-insensitive search on the hit id replaces the hit name used
            for the duplicate check.

    Raises:
        AssertionError: If ``hmm_file`` does not exist.
        AttributeError: If a supplied pattern does not match an id
            (``re.search`` returns ``None``).
    """
    assert os.path.exists(hmm_file), "file not found: %s" % hmm_file
    for query in tqdm(bpsio.parse(hmm_file, 'hmmer3-text')):
        for hit in query:
            for hsp in hit:
                gene = query.id
                if transform_query_regexp:
                    gene = re.search(transform_query_regexp, query.id,
                                     re.IGNORECASE).group(1)

                hit_name = hit.id
                # Guard on the hit pattern itself: guarding on the query
                # pattern would pass None to re.search when only the query
                # pattern is given, and ignore a hit pattern given alone.
                if transform_hit_regexp:
                    hit_name = re.search(transform_hit_regexp, hit_name,
                                         re.IGNORECASE).group(1)

                proteins = Protein.objects(
                    organism=organism,
                    alias=gene).no_cache().timeout(False)
                for protein in proteins:
                    duplicates = [
                        d for d in protein.domains()
                        if (d.identifier == hit_name)
                        and (d.location.start == hsp.query_start)
                        and (d.location.end == hsp.query_end)
                    ]
                    if duplicates:
                        protein.features.remove(duplicates[0])

                    annotations = hsp.aln_annotation
                    # NOTE(review): the stored identifier is the raw
                    # hsp.hit.id, not hit_name, so the duplicate check above
                    # only matches when both are equal -- confirm intent.
                    hsp_feature = Feature(
                        _id=ObjectId(),
                        location=Location(start=hsp.query_start,
                                          end=hsp.query_end),
                        aln=SimpleAlignment(
                            evalue=hsp.evalue,
                            aln_query=AlnLine(name=hsp.query_id,
                                              seq=str(hsp.aln[0].seq),
                                              start=hsp.query_start,
                                              end=hsp.query_end),
                            aln_hit=AlnLine(name=hsp.hit.id,
                                            seq=str(hsp.aln[1].seq),
                                            start=hsp.hit_start,
                                            end=hsp.hit_end),
                            aln_cd=annotations["CS"]
                            if "CS" in annotations else "",
                            aln_pp=annotations["PP"]
                            if "PP" in annotations else ""),
                        identifier=hsp.hit.id,
                        type=SO_TERMS["polypeptide_domain"])
                    protein.features.append(hsp_feature)
                    protein.save()
def load_from_interpro(self, organism, interprot_gff):
    """Load the features of a tab-separated GFF file into the proteins.

    Each data line yields one ``Feature`` (identifier taken from the
    ``Name`` attribute, type taken from the source column) that is appended
    to the protein of ``organism`` whose ``gene`` equals the first column.
    ``Ontology_term`` and ``Dbxref`` values are appended, lower-cased, to
    the protein's ``ontologies``. Reading stops at the first line starting
    with ``>``; lines starting with ``##``, blank lines and rows whose
    feature column is ``polypeptide`` are skipped.

    Args:
        organism: Value matched against ``Protein.organism``.
        interprot_gff: Path of the file to read.

    Raises:
        KeyError: If a data line has no ``Name`` attribute.
        ValueError: If a data line has fewer than eight columns or
            non-integer coordinates.
    """
    # The context manager guarantees the handle is closed, also on errors.
    with open(interprot_gff) as handle:
        for raw_line in tqdm(handle):
            if raw_line.startswith(">"):
                break
            if raw_line.startswith("##"):
                continue
            # Drop the line terminator so it cannot end up inside the last
            # attribute value (e.g. the feature identifier).
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue
            # "EC=" inside a value would be mistaken for a key/value separator.
            line = line.replace("EC=", "EC ")

            columns = line.split("\t")
            (locus_tag, source, feature_type, start, end, score, strand,
             frame) = columns[:8]
            attr_text = " ".join(columns[8:])
            if feature_type == "polypeptide":
                continue
            start, end = int(start), int(end)

            if "signature_desc=" in attr_text:
                # Escape separators inside the free-text description so the
                # split on ";" / "=" below keeps it in one piece.
                desc = attr_text.split("signature_desc=")[1].split(
                    ";Name=")[0]
                attr_text = attr_text.replace(
                    desc, desc.replace("=", "%3D").replace(";", "%3B"))

            attributes = {}
            for item in attr_text.split(";"):
                pieces = item.split("=")
                # Skip empty fragments such as the one after a trailing ";".
                if len(pieces) < 2:
                    continue
                attributes[pieces[0]] = pieces[1]

            feature = Feature(_id=ObjectId(),
                              location=Location(start=start, end=end),
                              identifier=attributes["Name"],
                              type=source)
            prot = Protein.objects(organism=organism, gene=locus_tag).get()

            if "signature_desc" in attributes:
                feature.qualifiers = {
                    "description": attributes["signature_desc"]
                }
            for key in ("Ontology_term", "Dbxref"):
                if key in attributes:
                    for ont in attributes[key].split(","):
                        ont = ont.replace('"', "").strip()
                        prot.ontologies.append(ont.lower())

            prot.features.append(feature)
            prot.save()
def feature_from_hsp(hsp, feature_type):
    """Wrap an HSP into a ``Feature`` located on the query coordinates.

    Args:
        hsp: Object exposing ``query_start``/``query_end``,
            ``hit_start``/``hit_end``, ``evalue``, ``query.id``, ``hit.id``,
            a two-row ``aln`` and an ``aln_annotation`` mapping.
        feature_type: Value stored as the feature ``type``.

    Returns:
        A new ``Feature`` whose identifier is the hit id and whose ``aln``
        holds both alignment rows plus the ``"similarity"`` annotation
        (empty string when that annotation is absent).
    """
    span = Location(start=hsp.query_start, end=hsp.query_end)

    query_row = AlnLine(name=hsp.query.id,
                        seq=str(hsp.aln[0].seq),
                        start=hsp.query_start,
                        end=hsp.query_end)
    hit_row = AlnLine(name=hsp.hit.id,
                      seq=str(hsp.aln[1].seq),
                      start=hsp.hit_start,
                      end=hsp.hit_end)

    if "similarity" in hsp.aln_annotation:
        midline = hsp.aln_annotation["similarity"]
    else:
        midline = ""

    alignment = SimpleAlignment(evalue=hsp.evalue,
                                aln_query=query_row,
                                aln_hit=hit_row,
                                aln_mid=midline)
    return Feature(_id=ObjectId(),
                   location=span,
                   aln=alignment,
                   identifier=hsp.hit.id,
                   type=feature_type)
def _process_prot(self, prot, r, i):
    """Attach one resistance-variant row to ``prot`` and flag it as resistant.

    The position is parsed from the first comma-separated token of
    ``r.Substitution`` (first and last character dropped, converted to a
    0-based int). When that fails and the value is ``"Deletions"`` the
    feature spans the whole sequence; any other unparsable value is logged
    and the row is skipped.

    Args:
        prot: Protein document that receives the feature.
        r: Row exposing ``Substitution``, ``Antibiotic``, ``Reference`` and
            the ``"Core gene"`` key.
        i: Value appended to the feature identifier.
    """
    try:
        pos = int(r.Substitution.split(",")[0][1:-1]) - 1
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing problems are expected here; anything else (e.g.
        # KeyboardInterrupt) must propagate.
        if r.Substitution == "Deletions":
            start = 0
            end = len(prot.seq) - 1
        else:
            _log.warning("error parsing subtitution position: %s -> %s" %
                         (r["Core gene"], r["Substitution"]))
            return
    else:
        start = pos
        end = pos

    quals = {
        "drug": r.Antibiotic,
        "change": r.Substitution,
        "gene": r["Core gene"]
    }
    if r.Reference:
        quals["reference"] = r.Reference

    fvariant = Feature(_id=ObjectId(),
                       location=Location(start=start, end=end),
                       type="Aanensen2016",
                       identifier="Aanensen2016_ " + str(i),
                       qualifiers=quals)
    prot.features.append(fvariant)

    if self.user == "demo":
        # Set the search flags before saving so one save persists both the
        # new feature and the flags.
        prot.search.resistance = True
        prot.search[r.Antibiotic.lower()] = True
        prot.save()
    else:
        prot.save()
        # Other users keep their flags under their own "search.<user>" key.
        # NOTE(review): if self.db is a PyMongo database, Collection.update
        # is gone in PyMongo 4 -- confirm the driver version in use.
        self.db.proteins.update({"_id": prot.id}, {
            "$set": {
                "search." + self.user + ".resistance": True,
                "search." + self.user + "." + r.Antibiotic.lower(): True
            }
        })
def from_ref_seq(
        name,
        ann_path,
        seqs=None,
        tax=None,
        tmp_dir=None,
        extract_annotation_feature=lambda feature: feature.sub_features[0]
        if feature.type == "gene" and hasattr(feature, "sub_features") and len(
            feature.sub_features) else feature,
        accept_protein_feature=lambda f: (
            (f.type == "CDS") and ("translation" in f.qualifiers)),
        extract_sequence=lambda c, f: f.qualifiers["translation"][0]
        if "translation" in f.qualifiers else f.extract(c).seq.translate(),
        cpus=1):
    """Import an annotated genome: collection, contigs and proteins.

    A sequence collection is created from the first contig and saved, every
    contig is stored through ``BioDocFactory.create_contig`` and every
    accepted feature carrying a ``locus_tag`` becomes a ``Protein`` that is
    bulk-inserted.

    Args:
        name: Genome name; also stored as each protein's ``organism``.
        ann_path: Annotation file handed to ``sp``.
        seqs: Optional FASTA path; when given its sequences are passed to
            ``sp`` keyed by record id.
        tax: Forwarded to ``BioDocFactory.create_genome`` when ``has_tax``
            is truthy.
        tmp_dir: Working directory; defaults to ``/tmp/<name>/``.
        extract_annotation_feature: Callable choosing the feature to read
            annotations from.
        accept_protein_feature: Predicate selecting protein features.
        extract_sequence: Callable ``(contig, feature)`` returning the
            protein sequence.
        cpus: Not used by the visible code.

    Returns:
        The saved sequence collection.

    Raises:
        Exception: If a protein sequence is longer than 30000 residues.
        KeyError: If a protein's locus tag was not registered by a contig.
    """
    if seqs:
        seqs = {r.id: r.seq for r in bpio.parse(seqs, "fasta")}
    iter_seqs = list(sp(ann_path, seqs=seqs) if seqs else sp(ann_path))

    # Only the first contig is needed to create the collection.
    for contig in iter_seqs:
        if has_tax:
            seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
        else:
            seqCol = BioDocFactory.create_genome(name, contig)
        seqCol.save()
        break

    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)

    contig_type_map = {
        "rRNA": "rRNA",
        "ncRNA": "ncRNA",
        NCBI.f_mRNA: NCBI.f_mRNA,
        "gene": "gene",
        NCBI.f_CDS: NCBI.f_CDS,
        "tRNA": "tRNA",
        "tmRNA": "tmRNA",
    }

    gene_ids = {}
    with tqdm(iter_seqs) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            # Sequences above this length are blanked before the contig is
            # stored.
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig,
                seqCol,
                type_map=contig_type_map,
                extract_annotation_feature=extract_annotation_feature,
            )
            gene_ids.update(gene_ids2)
            contigDoc.save()

    batch_size = 1000
    prots = []
    with tqdm(
            _protein_iter(
                iter_seqs,
                accept_feature=accept_protein_feature,
                extract_annotation_feature=extract_annotation_feature,
                extract_sequence=extract_sequence)) as pbar:
        for (protein, cds_f) in pbar:
            if "locus_tag" not in cds_f.qualifiers:
                continue
            protDoc = BioDocFactory.create_protein(protein, cds_f)
            if len(protDoc.seq) > 30000:
                raise Exception("No existen proteinas tan largas...")
            if protDoc.seq.count("*") > 1:
                print(
                    f"{cds_f.qualifiers['locus_tag'][0]}: Too many stop codons!"
                )
                continue
            if protDoc.seq.count("+") > 1:
                print(
                    f"{cds_f.qualifiers['locus_tag'][0]}: + signs found...!"
                )
                continue

            protDoc.gene_id = gene_ids[cds_f.qualifiers["locus_tag"][0]]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            for f in protein.features:
                protDoc.features.append(
                    Feature(identifier=f.qualifiers["Ontology_term"][0],
                            type=f.type,
                            location=Location(start=int(f.location.start),
                                              end=int(f.location.end))))
            prots.append(protDoc)

            # Flush on the size of the pending batch. The progress-bar
            # counter is only refreshed periodically while iterating, so it
            # is not a reliable trigger and could request an empty insert.
            if len(prots) >= batch_size:
                Protein.objects.insert(prots)
                prots = []
    if prots:
        Protein.objects.insert(prots)

    # _common_annotations(name, tmp_dir, cpu=cpus)
    return seqCol
def load_in_sndg(self, organism="H37Rv"):
    """Load the variants in ``self._df`` as ``tbdream`` protein features.

    Steps, in order:
      1. Remove every existing ``tbdream`` feature from the organism's
         proteins.
      2. For ``"resistance"`` and each name in ``TBDream.drugs``: reset
         ``search.<name>`` to ``False`` on all proteins and register the
         druggability parameter on the sequence collection when missing.
      3. For every ``rv`` group of ``self._df`` that matches a protein by
         gene (case-insensitive), append one feature per row and set the
         protein's ``search.resistance`` and ``search[<drug>]`` flags.

    Rows whose position cannot be determined are logged and skipped.

    Args:
        organism: Organism / sequence-collection name to update.
    """
    from SNDG.BioMongo.Model.Protein import Protein
    from SNDG.BioMongo.Model.Feature import Feature, Location
    from SNDG.BioMongo.Model.SeqCollection import SeqCollection
    from SNDG.BioMongo.Model.SeqColDruggabilityParam import SeqColDruggabilityParamTypes, SeqColDruggabilityParam
    from bson.objectid import ObjectId

    search_params = [("resistance", "Associated with resistance",
                      "variant-db", SeqColDruggabilityParamTypes.value,
                      ["true", "false"], "true", "equal", "avg")]
    search_params = search_params + [
        (x, "Associated with " + x + " resistance", "variant-db",
         SeqColDruggabilityParamTypes.value, ["true", "false"], "true",
         "equal", "avg") for x in TBDream.drugs
    ]

    # Start from a clean slate so re-running does not duplicate features.
    Protein.objects(organism=organism).update(
        __raw__={"$pull": {"features": {"type": "tbdream"}}})
    collection = SeqCollection.objects(name=organism).get()

    for (name, description, target, _type, options, defaultValue,
         defaultOperation, defaultGroupOperation) in search_params:
        Protein.objects(organism=organism).update(
            __raw__={"$set": {"search." + name: False}})
        if not collection.has_druggability_param(name):
            dp = SeqColDruggabilityParam(name=name,
                                         description=description,
                                         target=target,
                                         type=_type,
                                         uploader="demo")
            dp.options = options
            dp.defaultValue = defaultValue
            dp.defaultOperation = defaultOperation
            dp.defaultGroupOperation = defaultGroupOperation
            collection.druggabilityParams.append(dp)
            collection.save()

    for rv, rows in self._df.groupby("rv"):
        matches = list(Protein.objects(organism=organism, gene__iexact=rv))
        if not matches:
            continue
        prot = matches[0]
        for _, r in rows.iterrows():
            mut = None
            if r.change:
                change = str(r.change[0]) + "/" + str(r.change[1])
                mut = SeqUtils.seq1(r.change[1])
            else:
                change = r.AminoAcid

            if math.isnan(r.codon):
                # No codon: fall back to the AminoAcid column as position.
                try:
                    pos = int(r.AminoAcid)
                except (TypeError, ValueError, OverflowError):
                    _log.warning("couldnt find the variant position")
                    continue
            else:
                pos = int(r.codon)

            # RTotalIsolates is expected as "<resistant>/<total>"; anything
            # else leaves both derived values empty.
            try:
                res, t = r.RTotalIsolates.strip().split("/")
                r_div_total_coef = int(res) * 1.0 / int(t)
                r_div_total = r.RTotalIsolates.strip()
            except (AttributeError, TypeError, ValueError,
                    ZeroDivisionError):
                r_div_total = None
                r_div_total_coef = None

            quals = {
                "drug": r.Drug,
                "change": change,
                "gene": r.GeneID,
                "pattern": r.ResistancePattern,
                "additional": r.AdditionalMutations,
                "r_div_total": r_div_total,
                "r_div_total_coef": r_div_total_coef,
                "mic": r.MIC
            }
            if mut:
                quals["mut"] = mut

            fvariant = Feature(_id=ObjectId(),
                               location=Location(start=pos, end=pos),
                               type="tbdream",
                               identifier="TBDream id " + r.ID,
                               qualifiers=quals)
            prot.features.append(fvariant)
            prot.search.resistance = True
            prot.search[r.Drug] = True
            prot.save()