def annotate_gene(self, reaction, gene_name):
    """Attach ``reaction`` to every protein matching ``gene_name``.

    Proteins of ``self.organism`` are looked up by exact ``gene`` first and,
    when that returns nothing, by case-insensitive ``alias``. Every match gets
    the reaction info and gene properties added and is saved. A name with no
    match is logged and appended to ``self.unmapped_genes``.
    """
    try:
        matches = Protein.objects(organism=self.organism, gene=gene_name)
        if len(matches) == 0:
            # Fall back to a case-insensitive alias lookup.
            matches = Protein.objects(organism=self.organism,
                                      alias__iexact=gene_name)

        if len(matches) == 0:
            _log.warn("%s not found" % gene_name)
            self.unmapped_genes.append(gene_name)
            return

        for protein in matches:
            if not hasattr(protein, "reactions"):
                protein.reactions = []
            self.add_reaction_info_to_gene(reaction, protein)
            self.add_properties_to_gene(protein, gene_name)
            protein.save()
    except DoesNotExist:
        self.unmapped_genes.append(gene_name)
        _log.warn("%s not found" % gene_name)
    except MultipleObjectsReturned:
        self.unmapped_genes.append(gene_name)
        _log.warn(gene_name)
def update_proteins(annotation_dir, proteome, seq_col_name, tax_id,
                    identity=0.9, cpus=multiprocessing.cpu_count(),
                    db_init=None):
    """Annotate proteins of ``seq_col_name`` from a blastp search.

    Runs blastp of ``proteome`` against a species FASTA (unless the result
    table already exists), then for every query whose best HSP has
    ``ident_pct`` above ``identity``: fills an empty protein description from
    the matched FASTA record and applies the dbxrefs found in ``Mapping`` for
    the matched UniProt accession.
    """
    # NOTE(review): the setup below is disabled, yet the live code still reads
    # ``out``, ``species_fasta``, ``species_tax`` and ``tax_data``. Unless those
    # names exist at module level this function raises NameError -- confirm.
    # if db_init:
    #     from SNDG.Sequence.ProteinAnnotator import PABase
    #     PABase.sqldb.initialize(db_init)
    # mkdir(annotation_dir)
    # out = annotation_dir + "/species_blast.tbl"
    #
    # tax = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    # species_tax = None
    # for tax in Tax.parents(tax):
    #     if tax.node_rank == "genus":
    #         species_tax = tax
    #         break
    # tax_data = "/data/xomeq/tax/"
    # species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    if not os.path.exists(out):
        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(
                str(species_tax.ncbi_taxon_id), tax_data)
        cmd = "blastp -query %s -db %s -evalue 0.00001 -outfmt 6 -max_hsps 1 -qcov_hsp_perc 0.9 -num_threads %i -out %s"
        execute(cmd % (proteome, species_fasta, cpus, out))

    # accession (second "|" field of the record id) -> description without
    # its first word
    species_desc = {}
    for record in bpio.parse(species_fasta, "fasta"):
        accession = record.id.split("|")[1]
        species_desc[accession] = " ".join(record.description.split()[1:])

    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            best_hit = query[0]
            if not (best_hit[0].ident_pct > identity):
                continue

            hit_id = best_hit.id
            unip = hit_id.split("|")[1] if "|" in hit_id else hit_id
            dbxrefs = [
                x.db + "||" + x.value
                for x in Mapping.select().where(Mapping.uniprot == unip)
            ]

            p = Protein.objects(gene=query.id,
                                organism=seq_col_name).no_cache().get()
            if not p.description and unip in species_desc:
                p.description = (species_desc[unip].split("OS=")[0] +
                                 " | homology with: " + unip)
                p.save()
            if dbxrefs:
                p = SearchLoader.update_protein_with_dbxref(
                    query.id, dbxrefs, seq_col_name)
                p.save()
def process(
        self,
        data_csv="/data/projects/23staphylo/raw/metadata/saureus_resist_snps.csv",
        genome="SaureusN315",
        user="******"):
    """Reload the "Aanensen2016" features of ``genome`` from a CSV table.

    Existing features of type "Aanensen2016" are pulled from every protein of
    ``genome``. Each row of ``self.create_df(data_csv)`` is then passed to
    ``self._process_prot`` together with every protein whose gene equals the
    row's "Core gene" (case-insensitive). Genes without a protein are reported
    on stdout unless their name contains "RNA". ``user`` is not read here.
    """
    Protein.objects(organism=genome).update(
        __raw__={"$pull": {"features": {"type": "Aanensen2016"}}})

    table = self.create_df(data_csv)
    for row_idx, row in table.iterrows():
        gene = row["Core gene"].strip()
        assert gene

        matched = False
        for prot in Protein.objects(organism=genome, gene__iexact=gene):
            matched = True
            self._process_prot(prot, row, row_idx)

        if matched or "RNA" in row["Core gene"]:
            continue
        if row["Core gene"]:
            print("Core gene not found %s" % gene)
        else:
            print("%s not found" % gene)
def _process_proteins(self, genome):
    """Refresh the keywords of every protein of ``genome`` that has reactions.

    The new keyword list is the de-duplicated, stripped and lower-cased union
    of ``self._protein_keywords``, ``self._add_drugability_props_to_protein``
    and the protein's current keywords. ``self.update_ont_org_idx`` is called
    once after all proteins are saved.
    """
    selection = dict(organism=genome.name, reactions__0__exists=True)
    total = Protein.objects(**selection).count()
    cursor = Protein.objects(**selection).no_cache().timeout(False)

    for protein in tqdm(cursor, total=total):
        candidates = (self._protein_keywords(protein) +
                      self._add_drugability_props_to_protein(protein) +
                      protein.keywords)
        unique = set()
        for keyword in candidates:
            if keyword.strip():
                unique.add(keyword.strip().lower())
        protein.keywords = list(unique)
        protein.save()

    self.update_ont_org_idx()
def common_annotations(collection_name, tmp_dir, cpu=1, remove_tmp=False):
    """Run the common annotation steps that are still missing for a collection.

    A step is requested only when no protein of ``collection_name`` already
    carries the corresponding feature type:

    * PDB search  -> ``SO_TERMS["polypeptide_structural_motif"]``
    * HMM search  -> ``SO_TERMS["polypeptide_domain"]``

    The PDB flag previously lacked the negation used for the HMM flag, so the
    PDB step was requested only when its features already existed.
    """
    def _has_feature(so_term):
        # True when at least one protein of the collection has that feature.
        return bool(Protein.objects(__raw__={
            "organism": collection_name,
            "features.type": so_term
        }).count())

    process_pdb = not _has_feature(SO_TERMS["polypeptide_structural_motif"])
    process_hmm = not _has_feature(SO_TERMS["polypeptide_domain"])
    _common_annotations(collection_name, tmp_dir, cpu, remove_tmp,
                        process_pdb, process_hmm)
def update_protein_with_dbxref(protein_gene, annotations, organism):
    """Merge "db||value" annotations into a protein and return it unsaved.

    The protein is fetched by ``gene`` and ``organism``. For each annotation
    the text after "||" is added to:

    * ontologies and keywords, when the annotation starts with "ec"/"go"
      (case-insensitive);
    * keywords and alias, for the listed cross-reference databases;
    * keywords, alias and gene, for the gene-name style prefixes.

    Ontologies and keywords are then lower-cased and de-duplicated, alias is
    de-duplicated, and ``protein_gene`` is kept as the first gene entry.
    """
    alias_sources = ("EcoGene", "Ensembl_PRO", "Ensembl_TRS", "WormBase_PRO",
                     "WormBase_TRS", "UniGene", "GeneDB", "EuPathDB")
    gene_sources = ("Gene_Name", "Gene_OrderedLocusName", "Gene_ORFName")

    p = Protein.objects(gene=protein_gene, organism=organism).no_cache().get()
    for ann in annotations:
        word = ann.split("||")[1]
        lowered = ann.lower()
        # NOTE(review): "ecogene..." also starts with "ec", so EcoGene ids end
        # up in ontologies as well -- confirm this is intended.
        if lowered.startswith("ec") or lowered.startswith("go"):
            p.ontologies.append(word)
            p.keywords.append(word)
        for source in alias_sources:
            if ann.startswith(source):
                p.keywords.append(word)
                p.alias.append(word)
        for source in gene_sources:
            if ann.startswith(source):
                p.keywords.append(word)
                p.alias.append(word)
                p.gene.append(word)

    p.ontologies = list(set([x.lower() for x in p.ontologies]))
    p.keywords = list(set([x.lower() for x in p.keywords]))
    p.alias = list(set(p.alias))
    other_genes = [x for x in set(p.gene) if x != protein_gene]
    p.gene = [protein_gene] + other_genes
    return p
def clean_structures(self, organism):
    """Delete redundant models of every protein of ``organism``.

    Two clean-ups are applied per protein:

    1. A model is deleted when more than 80% of its query range is covered by
       the feature of an experimental structure (``p.cristals()``).
    2. Of two models spanning exactly the same query start and end, the first
       one of the pair is deleted.

    The pair generation used to be ``combinations(range(len(models), 2))``,
    which calls ``combinations`` without its ``r`` argument and raises
    ``TypeError``; it now enumerates all index pairs.
    """
    proteins = list(
        Protein.objects(organism=organism).no_cache().timeout(False))
    for p in proteins:
        experimentals = p.cristals()
        models = p.models()

        if experimentals and models:
            for model in models:
                model_query = model.templates[0].aln_query
                model_range = set(range(model_query.start, model_query.end))
                for exp in experimentals:
                    hit = [
                        f for f in p.features
                        if f.identifier.startswith(exp.name)
                    ][0]
                    exp_range = set(
                        range(hit.location.start, hit.location.end))
                    covered = len(model_range & exp_range) * 1.0 / len(model_range)
                    if covered > 0.8:
                        model.delete()
                        # Already removed: no need to compare (and delete)
                        # again against the remaining crystals.
                        break

        # NOTE(review): run for every protein with several models, whether or
        # not it has crystals -- confirm against the intended nesting.
        if len(models) > 1:
            for i, j in combinations(range(len(models)), 2):
                first = models[i].templates[0].aln_query
                second = models[j].templates[0].aln_query
                if first.start == second.start and first.end == second.end:
                    models[i].delete()
def common_annotations(collection_name,
                       tmp_dir,
                       cpu=1,
                       remove_tmp=False,
                       pfam_db="/data/databases/xfam/Pfam-A.hmm",
                       pdbs_path="/data/databases/pdb/pdb_seqres.txt"):
    """Run the common annotation steps that are still missing for a collection.

    The PDB step is requested when no protein of ``collection_name`` has a
    ``SO_TERMS["polypeptide_structural_motif"]`` feature, and the HMM step
    when none has a ``SO_TERMS["polypeptide_domain"]`` feature. Everything is
    delegated to ``_common_annotations``.
    """
    structural_hits = Protein.objects(__raw__={
        "organism": collection_name,
        "features.type": SO_TERMS["polypeptide_structural_motif"]
    }).count()
    domain_hits = Protein.objects(__raw__={
        "organism": collection_name,
        "features.type": SO_TERMS["polypeptide_domain"]
    }).count()

    process_pdb = not structural_hits
    process_hmm = not domain_hits
    _common_annotations(collection_name, tmp_dir, cpu, remove_tmp,
                        process_pdb, process_hmm, pfam_db, pdbs_path)
def build_index(self, query=None, error_output="/tmp/index_struct.log"):
    """Rebuild the druggability search index of the collection's proteins.

    :param query: optional raw Mongo filter; the collection name is always
        added as ``organism``. ``None`` or an empty dict selects every protein
        of the collection. The caller's dict is not modified.
    :param error_output: file that receives one line per failed protein
        (name, organism, id, error).

    Proteins with at least one crystal or model are annotated through
    ``self.annotate_protein_with_structures``; the rest lose their
    druggability keywords, get "non_druggable" and have every
    ``StructuromeIndexer.search_params`` entry reset to ``None``.
    Failures are logged and do not stop the run.
    """
    self.update_collection_params()

    if not query:
        proteins = Protein.objects(
            organism=self.collection.name).no_cache().timeout(False)
        prot_count = Protein.objects(organism=self.collection.name).count()
    else:
        # Copy so that adding "organism" does not leak into the caller's dict.
        query = dict(query)
        query["organism"] = self.collection.name
        proteins = Protein.objects(__raw__=query).no_cache().timeout(False)
        prot_count = Protein.objects(__raw__=query).count()

    structure_keywords = ("poorly_druggable", "druggable",
                          "highly_druggable", "has_structure")

    with tqdm(proteins, total=prot_count) as pbar:
        for protein in pbar:
            pbar.set_description(protein.name)
            try:
                if not protein.search:
                    protein.search = ProteinDruggabilitySearch()
                self.initDrugabilitySearch(protein.search)

                cristals, models = self.get_protein_structures(protein)
                if len(cristals + models) > 0:
                    self.annotate_protein_with_structures(
                        cristals, models, protein)
                else:
                    protein.keywords = [
                        x for x in protein.keywords
                        if x not in structure_keywords
                    ]
                    protein.keywords.append("non_druggable")
                    for x in StructuromeIndexer.search_params:
                        protein.search[x[0]] = None
                protein.save()
            except Exception as ex:
                error_line = ",".join([
                    protein.name, protein.organism,
                    str(protein.id), str(ex)
                ])
                _log.warn(error_line)
                with open(error_output, "a") as h:
                    # One entry per line; entries used to be concatenated.
                    h.write(error_line + "\n")
def create_proteome(tmp_dir, collection_name):
    """Return the path of ``<tmp_dir>/proteins.fasta``, writing it if needed.

    The file is (re)written only when it is missing or empty; it then holds
    one record per protein of ``collection_name``, identified by the protein's
    first gene name and with an empty description.
    """
    protein_fasta = tmp_dir + "/proteins.fasta"
    already_built = (os.path.exists(protein_fasta)
                     and os.path.getsize(protein_fasta))
    if not already_built:
        with open(protein_fasta, "w") as handle:
            for protein in Protein.objects(
                    organism=collection_name).no_cache():
                record = SeqRecord(id=protein.gene[0],
                                   description="",
                                   seq=Seq(protein.seq))
                bpio.write(record, handle, "fasta")
    return protein_fasta
def properties_from_feature(self,
                            organism,
                            feature_type,
                            value_fn,
                            property_name=None,
                            url_fn=None):
    """Derive a ``BioProperty`` from the first feature of a given type.

    For every protein of ``organism`` having a ``feature_type`` feature, the
    first such feature is turned into a property (``value_fn(feature)``, plus
    ``url_fn(feature)`` when given) that replaces any previous property of the
    same ``_type`` (and same ``property`` when ``property_name`` is set).

    When ``property_name`` is "human_offtarget", ``search.human_offtarget`` is
    first unset on all proteins, set to ``1 - feature.aln.identity`` on the
    processed ones and finally defaulted to 1 where still missing.
    """
    is_offtarget = property_name == "human_offtarget"
    proteins = Protein.objects(__raw__={
        "organism": organism,
        "features.type": feature_type
    }).no_cache().timeout(False)

    if is_offtarget:
        res = self.db.proteins.update(
            {
                "organism": organism,
                "search.human_offtarget": {"$exists": True}
            },
            {"$unset": {"search.human_offtarget": ""}},
            multi=True)
        _log.info(res)

    for p in proteins:
        matching = [x for x in p.features if x.type == feature_type]
        if not matching:
            continue
        feature = matching[0]

        bp = BioProperty(
            _type=feature_type,
            property=property_name if property_name else feature_type,
            value=value_fn(feature),
        )
        if url_fn:
            bp.url = url_fn(feature)

        # Drop the properties this call is about to replace.
        kept = []
        for existing in p.properties:
            replaced = existing._type == feature_type and (
                (not property_name) or existing.property == property_name)
            if not replaced:
                kept.append(existing)
        p.properties = kept
        p.properties.append(bp)

        if is_offtarget:
            p.search.human_offtarget = 1 - feature.aln.identity
        p.save()

    if is_offtarget:
        res = self.db.proteins.update(
            {
                "organism": organism,
                "search.human_offtarget": {"$exists": False}
            },
            {"$set": {"search.human_offtarget": 1}},
            multi=True)
        _log.info(res)
def load_pdb_domains(organism,
                     blast_file,
                     feature_type="SO:0001079",
                     min_identity=0.9,
                     min_query_coverage=0.9,
                     min_hit_coverage=0.9):
    """Add PDB hits of domain-level BLAST queries as protein features.

    Query ids are expected as ``<gene>_<pfam>_<start>_<end>``. For each
    protein of ``organism`` with that gene, and each hit of the query, the
    first HSP is added as a ``feature_type`` feature when:

    * the protein has a domain with the same Pfam accession (version
      ignored) whose start and end are within 10 positions of the query's;
    * no equivalent feature (same first and last two "_" fields of the
      identifier) is already present;
    * ``identity(hsp) >= min_identity`` and the domain coverage is at least
      ``min_query_coverage``.

    Feature coordinates are shifted by the domain start. ``min_hit_coverage``
    is accepted but not used. The loop now iterates the progress bar itself;
    it used to iterate ``queries`` directly, so the bar never advanced.
    """
    def _short_id(fident):
        # first field + last two fields of a "_"-separated identifier
        parts = fident.split("_")
        return "_".join(parts[0:1] + parts[-2:])

    queries = list(bpsio.parse(blast_file, 'blast-xml'))
    features_added = 0
    with tqdm(queries) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            pfam, dnstart, dnend = query.id.split("_")[-3:]
            dnstart, dnend = int(dnstart), int(dnend)
            gene = "_".join(query.id.split("_")[:-3])
            proteins = Protein.objects(organism=organism,
                                       gene=gene).no_cache().timeout(False)
            change = False
            for protein in proteins:
                for hit in query:
                    hsp = hit[0]
                    dn = [
                        x for x in protein.domains()
                        if (abs(x.location.start - dnstart) < 10) and
                        (abs(x.location.end - dnend) < 10) and
                        x.identifier.split(".")[0] == pfam.split(".")[0]
                    ]
                    if not dn:
                        continue
                    dn = dn[0]

                    already_loaded = [
                        x for x in protein.features
                        if x.type == feature_type and
                        _short_id(x.identifier) == _short_id(hit.id)
                    ]
                    if already_loaded:
                        continue

                    domain_positions = set(
                        range(dn.location.start, dn.location.end))
                    query_positions = set(range(dnstart, dnend))
                    dncover = 1.0 * len(domain_positions & query_positions) / (
                        dn.location.end - dn.location.start)
                    if (min_identity <= identity(hsp)) and (
                            dncover >= min_query_coverage):
                        hsp_feature = BioDocFactory.feature_from_hsp(
                            hsp, feature_type)
                        # HSP coordinates are relative to the domain.
                        hsp_feature.location.start += dnstart
                        hsp_feature.location.end += dnstart
                        features_added = features_added + 1
                        change = True
                        protein.features.append(hsp_feature)
                if change:
                    protein.save()
    _log.info("Features added: " + str(features_added))
def update_genome_props(self):
    """Reset the variant-db resistance parameters of ``self.organism``.

    One parameter named "resistance" plus one per drug in ``Saureus.drugs``
    (lower-cased) is handled. The collection's "variant-db" druggability
    params uploaded by ``self.user`` are pulled, each ``search.<param>`` is set
    to ``False`` on every protein (prefixed with "<user>." unless the user is
    "demo"), and missing druggability params are re-created on the
    collection, which is saved at the end.
    """
    user2 = "" if self.user == "demo" else self.user + "."

    described = [("resistance", "Associated with resistance")]
    for drug in Saureus.drugs:
        described.append(
            (drug.lower(), "Associated with " + drug + " resistance"))

    SeqCollection.objects(name=self.organism).update(
        __raw__={
            "$pull": {
                "druggabilityParams": {
                    "target": "variant-db",
                    "uploader": self.user
                }
            }
        })
    collection = SeqCollection.objects(name=self.organism).get()

    for name, description in described:
        Protein.objects(organism=self.organism).update(
            __raw__={"$set": {"search." + user2 + name: False}})
        if collection.has_druggability_param(name):
            continue
        dp = SeqColDruggabilityParam(
            name=name,
            description=description,
            target="variant-db",
            type=SeqColDruggabilityParamTypes.value,
            uploader=self.user)
        dp.options = ["true", "false"]
        dp.defaultValue = "true"
        dp.defaultOperation = "equal"
        dp.defaultGroupOperation = "avg"
        collection.druggabilityParams.append(dp)

    collection.save()
def load_priam_hits(self, seq_collection_name, path_genomeEnzymes):
    """Add EC numbers from a tab-separated enzyme table to proteins.

    Each line holds the EC number in its first column and, in the second, a
    protein id followed by optional space-separated text. Every protein of
    ``seq_collection_name`` having that id as alias gets "ec:<number>"
    appended to its ontologies and keywords and is saved.

    The file is now closed deterministically and blank lines are skipped
    (they used to raise ``IndexError``).
    """
    with open(path_genomeEnzymes) as handle:
        for line in handle:
            if not line.strip():
                continue
            arr_line = line.split("\t")
            ec = "ec:" + arr_line[0]
            prot_id = arr_line[1].split(" ")[0]
            for prot in Protein.objects(organism=seq_collection_name,
                                        alias=prot_id):
                prot.ontologies.append(ec)
                prot.keywords.append(ec)
                prot.save()
def load_from_emapper(self, organism, emapperv2_file):
    """Add the EC and GO terms of an eggNOG-mapper file to proteins.

    ``EMapper.read_file`` fills ``em.data`` with one record per locus tag;
    the comma-separated "EC" and "GOs" fields of each record are appended to
    the ontologies of the protein whose gene is that locus tag (EC prefixed
    with "ec:", GO lower-cased).

    Empty entries are skipped: ``"".split(",")`` yields ``[""]``, which used
    to add the meaningless ontologies "ec:" and "".
    """
    from SNDG.Annotation.EMapper import EMapper

    em = EMapper()
    em.read_file(emapperv2_file)
    for locus_tag, record in em.data.items():
        prot = Protein.objects(organism=organism, gene=locus_tag).get()
        for ec in record["EC"].split(","):
            ec = ec.strip()
            if ec:
                prot.ontologies.append("ec:" + ec)
        for go in record["GOs"].split(","):
            go = go.strip()
            if go:
                prot.ontologies.append(go.lower())
        prot.save()
def load_hmm(organism,
             hmm_file,
             transform_query_regexp=None,
             transform_hit_regexp=None):
    """Load HMMER3 text results as polypeptide-domain features.

    :param transform_query_regexp: optional pattern; its first group, matched
        case-insensitively on the query id, is used as the protein alias.
    :param transform_hit_regexp: optional pattern applied the same way to the
        hit id to obtain the domain name used for de-duplication.

    For every HSP, each protein of ``organism`` with the alias gets a new
    ``SO_TERMS["polypeptide_domain"]`` feature; an existing domain with the
    same name, start and end is removed first.

    The hit transform used to be guarded by ``transform_query_regexp``, so it
    was skipped when only the hit pattern was given and crashed when only the
    query pattern was given.
    """
    assert os.path.exists(hmm_file)
    for query in tqdm(bpsio.parse(hmm_file, 'hmmer3-text')):
        for hit in query:
            for hsp in hit:
                gene = query.id
                if transform_query_regexp:
                    gene = re.search(transform_query_regexp, query.id,
                                     re.IGNORECASE).group(1)
                hit_name = hit.id
                if transform_hit_regexp:
                    hit_name = re.search(transform_hit_regexp, hit_name,
                                         re.IGNORECASE).group(1)

                proteins = Protein.objects(
                    organism=organism, alias=gene).no_cache().timeout(False)
                for protein in proteins:
                    dn = [
                        d for d in protein.domains()
                        if (d.identifier == hit_name) and
                        (d.location.start == hsp.query_start) and
                        (d.location.end == hsp.query_end)
                    ]
                    if dn:
                        # Replace the previously loaded copy of this domain.
                        protein.features.remove(dn[0])

                    annotation = hsp.aln_annotation
                    alignment = SimpleAlignment(
                        evalue=hsp.evalue,
                        aln_query=AlnLine(name=hsp.query_id,
                                          seq=str(hsp.aln[0].seq),
                                          start=hsp.query_start,
                                          end=hsp.query_end),
                        aln_hit=AlnLine(name=hsp.hit.id,
                                        seq=str(hsp.aln[1].seq),
                                        start=hsp.hit_start,
                                        end=hsp.hit_end),
                        aln_cd=annotation["CS"] if "CS" in annotation else "",
                        aln_pp=annotation["PP"] if "PP" in annotation else "")
                    hsp_feature = Feature(
                        _id=ObjectId(),
                        location=Location(start=hsp.query_start,
                                          end=hsp.query_end),
                        aln=alignment,
                        identifier=hsp.hit.id,
                        type=SO_TERMS["polypeptide_domain"])
                    protein.features.append(hsp_feature)
                    protein.save()
def load_blast_features(organism,
                        blast_file,
                        feature_type,
                        min_identity=0,
                        min_query_coverage=0,
                        min_hit_coverage=0):
    """Add the first HSP of BLAST hits as ``feature_type`` protein features.

    For each query, the proteins of ``organism`` whose gene is the query id
    are processed. A hit passing the identity and hit-coverage thresholds is
    added once when it also passes the query-coverage threshold; otherwise it
    is added once per protein domain that the HSP covers by at least
    ``min_query_coverage``. Proteins are saved when something was added, and
    the total is logged.
    """
    queries = list(bpsio.parse(blast_file, 'blast-xml'))

    def check_overlap(features, new_feature, max_aa_overlap):
        # Not called in this function; ``max_aa_overlap`` is not read.
        return any(
            (1.0 * len(new_feature & f) / len(f)) > 0.8 for f in features)

    features_added = 0
    for query in tqdm(queries):
        gene = query.id
        proteins = Protein.objects(organism=organism,
                                   gene=gene).no_cache().timeout(False)
        change = False
        for protein in proteins:
            for hit in query:
                hsp = hit[0]
                if not (identity(hsp) >= min_identity):
                    continue
                if not (hit_coverage(hit, hsp) >= min_hit_coverage):
                    continue

                if coverage(query, hsp) >= min_query_coverage:
                    protein.features.append(
                        BioDocFactory.feature_from_hsp(hsp, feature_type))
                    features_added += 1
                    change = True
                    continue

                # Whole-query coverage too low: accept per covered domain.
                hsp_positions = set(range(hsp.query_start, hsp.query_end))
                for dn in protein.domains():
                    domain_positions = set(
                        range(dn.location.start, dn.location.end))
                    dncover = 1.0 * len(domain_positions & hsp_positions) / (
                        dn.location.end - dn.location.start)
                    if dncover >= min_query_coverage:
                        protein.features.append(
                            BioDocFactory.feature_from_hsp(hsp, feature_type))
                        features_added += 1
                        change = True
            if change:
                protein.save()
    _log.info("Features added: " + str(features_added))
def main(argv=None):  # IGNORE:C0111
    """Command-line entry point: annotate the models of an organism.

    :param argv: argument list to parse; ``None`` makes argparse read
        ``sys.argv[1:]``. (The parameter used to be ignored.)

    Every model returned by ``StructureAnotator.iterator`` is annotated with
    the domains of the protein whose alias is the model's first template
    query name, then saved. Models without such a protein are now logged and
    skipped; previously execution fell through and used an undefined or
    stale ``protein``.
    """
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version,
                                                     program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by user_name on %s.
  Copyright 2015 BIA. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    parser = ArgumentParser(description=program_license,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    parser.add_argument("-n", "--name", required=True)
    parser.add_argument("-dir", "--structs_dir", required=True)
    parser.add_argument("-db_structure", "--db_structure",
                        help="Mongo structure db", default='pdb')
    parser.add_argument("-db_genome", "--db_genome",
                        help="Mongo proteins db", default='xomeq')
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument('-V', '--version', action='version',
                        version=program_version_message)

    args = parser.parse_args(argv)

    BioMongoDB(args.db_genome)
    db = pymongo.MongoClient(args.db_host)[args.db_structure]

    sa = StructureAnotator(args.structs_dir + "/")
    total = sa.total(db, args.name, {})

    with tqdm(sa.iterator(db, args.name, {}), total=total) as pbar:
        for model in pbar:
            pbar.set_description(model.name)
            template = model.templates[0]
            try:
                protein = Protein.objects(
                    organism=args.name,
                    alias=template.aln_query.name).get()
            except DoesNotExist:
                _log.warn(template.aln_query.name + " does not exists")
                # Nothing to annotate this model with.
                continue
            sa.annotate_model(model, protein.domains())
            model.save()
def load_from_interpro(self, organism, interprot_gff):
    """Load features and ontology terms from an InterProScan GFF3 file.

    Reading stops at the first line starting with ">"; "##" directive lines
    and "polypeptide" rows are skipped. Every other row becomes a ``Feature``
    (identifier = ``Name`` attribute, type = source column) appended to the
    protein of ``organism`` whose gene is the row's first column; the row's
    ``Ontology_term`` and ``Dbxref`` values are added, lower-cased, to the
    protein's ontologies.

    The file is now opened in a ``with`` block so the handle is closed even
    when a row fails to parse.
    """
    with open(interprot_gff) as handle:
        for l in tqdm(handle):
            if l.startswith(">"):
                break
            if l.startswith("##"):
                continue
            l = l.replace("EC=", "EC ")
            columns = l.split("\t")
            (locus_tag, source, feature, start, end, score, strand,
             frame) = columns[:8]
            attributes = " ".join(columns[8:])
            if feature == "polypeptide":
                continue
            start, end = int(start), int(end)

            if "signature_desc=" in attributes:
                # Escape "=" and ";" inside the free-text description so the
                # key=value split below is not confused by them.
                repl = attributes.split("signature_desc=")[1].split(
                    ";Name=")[0]
                attributes = attributes.replace(
                    repl,
                    repl.replace("=", "%3D").replace(";", "%3B"))
            attributes = {
                x.split("=")[0]: x.split("=")[1]
                for x in attributes.split(";")
            }

            feature = Feature(_id=ObjectId(),
                              location=Location(start=start, end=end),
                              identifier=attributes["Name"],
                              type=source)
            prot = Protein.objects(organism=organism, gene=locus_tag).get()
            if "signature_desc" in attributes:
                feature.qualifiers = {
                    "description": attributes["signature_desc"]
                }
            for key in ("Ontology_term", "Dbxref"):
                if key in attributes:
                    for ont in attributes[key].split(","):
                        ont = ont.replace('"', "").strip()
                        prot.ontologies.append(ont.lower())

            prot.features.append(feature)
            prot.save()
def props_from_dbxref(self, name):
    """Expose the dbxrefs of each protein of ``name`` as a "links" property.

    Proteins without dbxrefs are left untouched. For the others a
    ``BioProperty`` (``_type`` "dbxref", ``property`` "links") holding the
    dbxrefs is appended, and the part after ":" of every dbxref starting with
    "uniprot" (case-insensitive) is merged into the de-duplicated alias list.
    """
    for p in Protein.objects(organism=name).no_cache().timeout(False):
        if not p.dbxrefs:
            continue
        links = BioProperty(_type="dbxref", property="links",
                            value=p.dbxrefs)
        uniprot_ids = [
            ref.split(":")[1] for ref in p.dbxrefs
            if ref.lower().startswith("uniprot")
        ]
        p.alias += uniprot_ids
        p.alias = list(set(p.alias))
        p.properties.append(links)
        p.save()
def organism_iterator(self, organism, seq_map=None):
    """Yield one ``SeqRecord`` per stored contig of ``organism``.

    The sequence comes from ``seq_map[contig.name].seq`` when ``seq_map`` is
    given, otherwise from the stored contig. Each stored feature becomes a
    ``SeqFeature`` whose qualifiers carry the locus tag and, when a protein
    with ``gene == feature.identifier`` exists, its description, gene
    symbols and upper-cased "ec:"/"go:" ontologies.
    """
    for dbcontig in Contig.objects(organism=organism).no_cache():
        seq = str(seq_map[dbcontig.name].seq) if seq_map else dbcontig.seq
        contig = SeqRecord(id=dbcontig.name, seq=Seq(seq))

        for dbfeature in dbcontig.features:
            qualifiers = {"locus_tag": [dbfeature.locus_tag]}
            found = list(
                Protein.objects(organism=organism,
                                gene=dbfeature.identifier))
            if found:
                prot = found[0]
                qualifiers["description"] = [prot.description]
                qualifiers["gene_symbol"] = prot.gene
                qualifiers["Note"] = [prot.description]
                for prefix, key in (("ec:", "EC"), ("go:", "GO")):
                    terms = [
                        x.upper() for x in prot.ontologies
                        if x.startswith(prefix)
                    ]
                    if terms:
                        qualifiers[key] = terms

            location = FeatureLocation(start=dbfeature.location.start,
                                       end=dbfeature.location.end,
                                       strand=dbfeature.location.strand)
            contig.features.append(
                SeqFeature(id=dbfeature.identifier,
                           type=dbfeature.type,
                           qualifiers=qualifiers,
                           location=location))
        yield contig
def correct_chokes(self, name):
    """Recompute the chokepoint flags of the proteins of organism ``name``.

    A metabolite is kept as a chokepoint when all the reactions consuming it
    (or all the reactions producing it) share a single reaction name. Each
    protein with reactions then gets:

    * ``search.chokepoint``: whether any of its reactions is such a reaction;
    * ``search.chokepoint_type``: "double", "production" or "consuming";
    * a "chokepoint" property listing the metabolites involved.

    Proteins that are not chokepoints lose the type and the property.

    The metabolite maps are now filtered into new dicts; entries used to be
    deleted while iterating ``.items()``, which raises ``RuntimeError`` on
    Python 3. Reaction lookups use sets instead of lists.
    """
    metabolites_in = defaultdict(list)
    metabolites_out = defaultdict(list)
    for p in self.db.proteins.find({
            "organism": name,
            "reactions.0": {"$exists": True}
    }):
        for r in p["reactions"]:
            for m in r["products"]:
                metabolites_out[m["name"]].append(r["name"])
            for m in r["substrates"]:
                metabolites_in[m["name"]].append(r["name"])

    # Keep only metabolites handled by a single distinct reaction.
    metabolites_in = {
        m: rs for m, rs in metabolites_in.items() if len(set(rs)) <= 1
    }
    metabolites_out = {
        m: rs for m, rs in metabolites_out.items() if len(set(rs)) <= 1
    }

    choke_reactions_in = set()
    for rs in metabolites_in.values():
        choke_reactions_in.update(rs)
    choke_reactions_out = set()
    for rs in metabolites_out.values():
        choke_reactions_out.update(rs)

    # reaction name -> chokepoint metabolites it consumes or produces
    reaction_metabolites = defaultdict(list)
    for metabolite_map in (metabolites_in, metabolites_out):
        for m, rs in metabolite_map.items():
            for r in rs:
                reaction_metabolites[r].append(m)

    for p in Protein.objects(
            organism=name,
            reactions__0__exists=True).no_cache().timeout(False):
        cout = any(r.name in choke_reactions_out for r in p.reactions)
        cin = any(r.name in choke_reactions_in for r in p.reactions)
        p.search.chokepoint = cout | cin

        if p.search.chokepoint:
            if cout & cin:
                p.search.chokepoint_type = "double"
            else:
                p.search.chokepoint_type = "production" if cout else "consuming"

            metabolites = []
            for x in p.reactions:
                if x.name in reaction_metabolites:
                    metabolites += reaction_metabolites[x.name]

            prop = [x for x in p.properties if x.property == "chokepoint"]
            if prop:
                prop[0].metabolites = metabolites
            else:
                p.properties.append(
                    BioProperty(_type="pathways",
                                property="chokepoint",
                                metabolites=metabolites,
                                type=p.search.chokepoint_type))
        else:
            del p.search.chokepoint_type
            p.properties = [
                x for x in p.properties if x.property != "chokepoint"
            ]
        p.save()
def create_protein(cls, seqrecord, feature, exons=None):
    """Build an unsaved ``Protein`` document from a sequence feature.

    :param seqrecord: record whose full ``seq`` is stored as the protein
        sequence.
    :param feature: feature whose qualifiers provide the locus tag, name,
        description, cross references, EC numbers and GO terms.
    :param exons: accepted for compatibility; not used.
    :return: the populated ``Protein`` (not saved).

    The name is the locus tag unless ``gene_symbol_source`` is present: an
    integer source gives an empty name, anything else gives ``gene_symbol``.
    The root GO terms GO:0008150, GO:0003674 and GO:0005575 are discarded.

    Changes from the previous version: ``Ontology_term`` GO terms are now
    added to those from ``db_xref``/``GO`` (they used to replace them), the
    bare ``except`` is narrowed, and the mutable default of ``exons`` is gone.
    """
    root_gos = ("GO:0008150", "GO:0003674", "GO:0005575")
    qualifiers = feature.qualifiers

    alias = cls.alias(feature)
    locus_tag = qualifiers["locus_tag"][0]
    protein_name = locus_tag
    seq = str(seqrecord.seq)

    if "gene_symbol_source" in qualifiers:
        try:
            int(qualifiers["gene_symbol_source"][0])
            protein_name = ""
        except (ValueError, TypeError, IndexError):
            # Non-numeric source: a gene symbol is available.
            protein_name = qualifiers['gene_symbol'][0]

    p = Protein(seq=seq, name=protein_name)

    protein_description = ""
    for key in ("description", "Note", "product"):
        if key in qualifiers:
            protein_description = qualifiers[key][0]
            break

    bp = BioProperty(
        _type="annotation",
        description="homolog proteins and sources used for the annotation")
    if "protein_id" in qualifiers:
        bp.ncbi_protein_id = qualifiers["protein_id"][0]
    if "db_xref" in qualifiers:
        bp.ncbi_db_xref = qualifiers["db_xref"][0]
    if "Dbxref" in qualifiers:
        bp.ncbi_db_xref = qualifiers["Dbxref"][0]
    if "top_cog_hit" in qualifiers:
        bp.cog = qualifiers["top_cog_hit"][0]
    if "gene_symbol_source" in qualifiers:
        bp.source = qualifiers["gene_symbol_source"][0]
    if "gene_product_name_source" in qualifiers:
        bp.source = qualifiers["gene_product_name_source"][0]

    ecs = []
    for key in ("EC", "EC_number", "Dbxref"):
        if key in qualifiers:
            ecs = ecs + [
                x.lower() if x.lower().startswith("ec") else "ec:" + x.lower()
                for x in qualifiers[key] if "." in x
            ]

    gos = []
    for key in ("db_xref", "GO", "Ontology_term"):
        if key in qualifiers:
            gos = gos + [
                x.lower() for x in qualifiers[key]
                if "GO:" in x and x not in root_gos
            ]

    p.gene = [locus_tag, protein_name] if locus_tag != protein_name else [locus_tag]
    p.name = protein_name
    p.description = protein_description
    p.ontologies = list(set(ecs + gos))
    p.properties = [bp]
    p.alias = alias
    return p
model_filename = model_file.split("/")[-1] model_name = model_filename.split(".pdb")[0] hunt_pockets(model_file) build_assessments(os.path.dirname(model_file), os.path.basename(model_filename)) seq_name = "_".join(model_name.split("_")[:-4]) org_model_name = model_name template_name = "_".join(model_name.split("_")[-4:]) if ModeledStructure.objects(organism=organism, name=model_name).count(): continue prot = list(Protein.objects(organism=organism, gene=seq_name)) if len(prot) == 0: _log.warn("Not found: " + seq_name) continue aln = [ hit[0] for hit in list( bpsio.read( basepath + "/" + seq_name + "/profile_search.xml", "blast-xml")) if hit.id == template_name ][0] with open(model_file + ".json") as h: assessments = json.load(h) pockets = []
def from_TriTrypDB(name, gff, fasta, tax, tmp_dir=None):
    """Import a TriTrypDB genome (GFF + FASTA) as collection ``name``.

    Creates and saves the genome document, one contig document per annotated
    sequence (sequences longer than 15,000,000 are stored empty) and one
    ``Protein`` per item yielded by ``tritryp_protein_iter``; proteins are
    inserted in batches. ``_common_annotations`` is run at the end using
    ``tmp_dir`` (default ``/tmp/<name>/``).

    The EC pattern now accepts a multi-character last field (it lacked the
    ``+`` the other fields have) and is written as a raw string; the
    duplicated "rRNA" key of the type map was removed.
    """
    from BCBio import GFF
    import re

    ec_pattern = re.compile(r'^[0-9]+\.[0-9\-]+\.[0-9\-]+\.[0-9\-]+$')
    root_gos = ["GO:0008150", "GO:0003674", "GO:0005575"]

    genome = {x.id: x for x in sp(fasta)}
    annotation = list(GFF.parse(gff, base_dict=genome))
    contig = annotation[0]
    seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
    seqCol.save()
    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)

    type_map = {
        "rRNA": "rRNA",
        "ncRNA": "ncRNA",
        NCBI.f_mRNA: "gene",
        "exon": "exon",
        "gene": "gene",
        NCBI.f_CDS: NCBI.f_CDS,
        "tRNA": "tRNA",
        "tmRNA": "tmRNA",
        "snoRNA": "snoRNA",
        "three_prime_UTR": "three_prime_UTR",
        "five_prime_UTR": "five_prime_UTR"
    }

    gene_ids = {}
    with tqdm(annotation) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig, seqCol, type_map=type_map)
            gene_ids.update(gene_ids2)
            contigDoc.save()

    prots = []
    with tqdm(tritryp_protein_iter(annotation)) as pbar:
        for (protein, cds_f) in pbar:
            protDoc = Protein(seq=str(protein.seq), name=protein.id)

            protein_description = ""
            for key in ("description", "Note", "product"):
                if key in cds_f.qualifiers:
                    protein_description = cds_f.qualifiers[key][0]
                    break
            protDoc.description = protein_description

            gos = []
            if "Ontology_term" in cds_f.qualifiers:
                gos = [
                    x.lower() for x in cds_f.qualifiers["Ontology_term"]
                    if "GO:" in x and x not in root_gos
                ]
            # The first word of the Note is taken as an EC number when it
            # looks like one.
            note = cds_f.qualifiers["Note"][0].split(
                " ")[0] if "Note" in cds_f.qualifiers else ""
            ecs = ["ec:" + note] if ec_pattern.match(note) else []

            protDoc.gene = [protein.id]
            protDoc.ontologies = list(set(ecs + gos))
            protDoc.alias = [protein.id]
            if len(protDoc.seq) > 30000:
                raise Exception("No existen proteinas tan largas...")
            protDoc.gene_id = gene_ids[protein.id]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            prots.append(protDoc)

            # Flush a batch every 1000 iterations of the progress bar.
            if pbar.n and ((pbar.n % 1000) == 0):
                Protein.objects.insert(prots)
                prots = []
    if prots:
        Protein.objects.insert(prots)

    _common_annotations(name, tmp_dir)
# pockets_json = model_file + ".pocket.json" # if os.path.exists(pockets_json): # rss = StructureAnotator.pocket_residue_set(pockets_json, model.get_atoms()) # strdoc.pockets = rss # strdoc.save() # except Exception as ex: # _log.error(ex) # # print ("OK!") sa = StructureAnotator(basepath, struct_path=lambda wd, modeldoc: glob("/".join([ wd, modeldoc.templates[0].aln_query.name, modeldoc. templates[0].aln_query.name, modeldoc.name, "*.pdb" ]))[0]) total = sa.total(db, organism, {}) with tqdm(sa.iterator(db, organism, {}), total=total) as pbar: for model in pbar: pbar.set_description(model.name) template = model.templates[0] try: protein = Protein.objects(organism=organism, alias=template.aln_query.name).get() except: print template.aln_query.name + " does not exists" continue sa.annotate_model(model, protein.domains()) model.save()
def annotate_variants_with_prots(organism_name, dbs, drugs, force=False):
    """Copy protein annotations onto variants and flag known resistance.

    :param dbs: feature types (variant databases) to look up on the protein.
    :param drugs: list of strings, example TBDream.drugs or Saureus.drugs.
    :param force: overwrite ``search`` on variants that already have one.

    Handles every protein of ``organism_name`` that has a feature with a
    ``strain`` qualifier, together with the variants of its genes.

    NOTE(review): the statement nesting was inferred from a flattened source
    and the final ``vd.save()`` is disabled, so nothing is persisted here --
    confirm both.
    """
    strain_proteins = Protein.objects(
        __raw__={
            "organism": organism_name,
            "features.qualifiers.strain": {"$exists": 1}
        }).no_cache()

    for idx, p in enumerate(strain_proteins):
        print(idx)
        pvariants = list(
            Variant.objects(organism=organism_name, gene__in=p.gene))
        for vd in pvariants:
            if (vd.search == None) or force:
                psearch = p.search
                del psearch.structures
                vd.search = psearch

            vd.ontologies = p.ontologies
            for reaction in p.reactions:
                for pathway in reaction.pathways:
                    vd.ontologies.append(pathway)
            vd.ontologies = list(set(vd.ontologies))

            for drug in drugs:
                vd.search[drug] = False
            vd.search["resistance"] = False

            for db in dbs:
                for sample_allele in vd.sample_alleles:
                    aa_pos = sample_allele.aa_pos

                    # Any database feature at the same position.
                    at_position = [
                        f for f in p.features
                        if (f.type == db) and (f.location.start == aa_pos)
                    ]
                    if at_position:
                        if at_position[0]._data["qualifiers"]:
                            vd.search["resistance"] = True

                    # Same position and same change (or both frameshifts).
                    same_change = [
                        f for f in p.features
                        if (f.type == db) and (f.location.start == aa_pos) and
                        ("mut" in f._data["qualifiers"]) and (
                            (f._data["qualifiers"]["mut"] ==
                             sample_allele.aa_alt) or
                            (f._data["qualifiers"]["change"].lower() ==
                             "frameshift" and sample_allele.variant_type ==
                             "frameshift_variant"))
                    ]
                    if same_change:
                        sample_allele.feature = same_change[0]
                        matched_qualifiers = sample_allele.feature._data[
                            "qualifiers"]
                        if matched_qualifiers:
                            vd.search[matched_qualifiers["drug"]] = True
            # vd.save()
def annotate_variants(organism_name, strain_name, database, parse_change):
    """Flag proteins whose strain variants are reported in ``database``.

    :param parse_change: function that transforms
        dbvar.qualifiers["change"] into aa_ref, aa_alt.

    Two search properties are reset to ``False`` on every protein of the
    organism and registered on the collection when missing:
    ``<strain>_<database>`` (same change reported) and
    ``<strain>_<database>_pos`` (same position reported). Proteins with
    features of the strain are then compared against their ``database``
    features and saved when they have any.

    NOTE(review): the nesting of the comparison loop was inferred from a
    flattened source -- in particular whether the frameshift check requires
    matching positions; confirm.
    """
    collection = SeqCollection.objects(name=organism_name).get()
    change_prop = strain_name + "_" + database
    position_prop = strain_name + "_" + database + "_pos"

    described = (
        (change_prop, "Variant in strain " + strain_name +
         " is reported in " + database),
        (position_prop, "The position of the variant the strain " +
         strain_name + " is reported in " + database),
    )
    for prop, description in described:
        Protein.objects(organism=organism_name).update(
            __raw__={"$set": {"search." + prop: False}})
        if not collection.has_druggability_param(prop):
            dp = SeqColDruggabilityParam(
                name=prop,
                description=description,
                target="variant-strain",
                type=SeqColDruggabilityParamTypes.value,
                uploader="demo")
            dp.options = ["true", "false"]
            dp.defaultValue = "true"
            dp.defaultOperation = "equal"
            dp.defaultGroupOperation = "avg"
            collection.druggabilityParams.append(dp)
    collection.save()

    for p in Protein.objects(__raw__={
            "organism": organism_name,
            "features.qualifiers.strain": strain_name
    }).no_cache():
        dbvars = [f for f in p.features if f.type == database]
        if not dbvars:
            continue
        strainvars = [
            f for f in p.features
            if (f.type == "strain_variant") and
            (f._data["qualifiers"]["strain"] == strain_name)
        ]

        for dbvar in dbvars:
            db_qualifiers = dbvar._data["qualifiers"]
            for strainvar in strainvars:
                strain_qualifiers = strainvar._data["qualifiers"]
                strain_qualifiers["ref_pos"] = False
                if dbvar.location.start != strainvar.location.start:
                    continue

                p.search[position_prop] = True
                strain_qualifiers["ref_pos"] = dbvar._id
                try:
                    dref, dalt = parse_change(db_qualifiers["change"])
                    sref, salt = strain_qualifiers["change"].strip().split("/")
                    sref = sref.strip()
                    salt = salt.strip()
                    if (dref == sref) and (dalt == salt):
                        p.search[change_prop] = True
                        strain_qualifiers["ref"] = dbvar._id
                except Exception as ex:
                    _log.warn(ex)

                if ("frameshift" in db_qualifiers["change"].lower()) and (
                        "frameshift" in strain_qualifiers["change"].lower()):
                    p.search[change_prop] = True

        # At least one database variant was examined.
        p.save()
def load_metadata(self, organism_name, datafile, uploader=demo):
    """Load per-gene properties from a tab-separated file.

    Every column other than ``BioMongoDB.GENE_FIELD_IMPORT`` is a property.
    Previous values uploaded by ``uploader`` are removed from all proteins of
    ``organism_name``; each row then pushes its values onto the proteins
    whose alias is the row's gene (as ``properties`` entries and as
    ``search.<column>``). Rows with an empty or unknown gene are recorded as
    upload errors. Finally the collection's druggability params are updated
    or created ("number" for float64/int64 columns, "value" with the
    observed options otherwise) and the upload is stored on the collection.
    """
    import pandas as pd
    from tqdm import tqdm

    seqCollection = list(SeqCollection.objects(name=organism_name))[0]

    errors = []
    upload = DataUpload(uploader=uploader, errors=errors)

    df = pd.read_table(datafile, comment="#", index_col=False)
    headerProperties = [
        column for column in df.columns
        if column != BioMongoDB.GENE_FIELD_IMPORT
    ]

    # Clear whatever this uploader loaded before for the same columns.
    organism_proteins = Protein.objects(organism=organism_name)
    for header in headerProperties:
        organism_proteins.update(
            __raw__={
                "$pull": {
                    "properties": {
                        "property": header,
                        "_type": uploader
                    }
                },
                "$unset": {"search." + header: ""}
            })
    upload.properties = headerProperties

    numericFields = []
    for column, dtype in dict(df.dtypes).items():
        if dtype in [np.float64, np.int64]:
            numericFields.append(column)
        else:
            df[column] = df[column].astype('category')

    assert BioMongoDB.GENE_FIELD_IMPORT in df.columns

    for linenum, fields in tqdm(df.iterrows()):
        gene = fields[BioMongoDB.GENE_FIELD_IMPORT]
        if not gene:
            errors.append(str(linenum) + " gene field is empty")
            continue

        if not Protein.objects(organism=organism_name, alias=gene).count():
            text = str(linenum) + " " + gene + " does not exists in " + organism_name
            print(text)
            errors.append(text)
            continue

        gene_proteins = Protein.objects(organism=organism_name, alias=gene)
        for propertyName in headerProperties:
            prop = {
                "_type": uploader,
                "value": fields[propertyName],
                "property": propertyName
            }
            gene_proteins.update(
                __raw__={
                    "$push": {"properties": prop},
                    "$set": {"search." + propertyName: fields[propertyName]}
                })

    for header in headerProperties:
        if header in numericFields:
            dpType, options = "number", []
        else:
            dpType, options = "value", list(set(df[header]))

        currentDp = seqCollection.druggabilityParam(header, uploader)
        if currentDp:
            currentDp = currentDp[0]
            currentDp.options = options
            currentDp.type = dpType
        else:
            seqCollection.druggabilityParams.append(
                SeqColDruggabilityParam(type=dpType,
                                        name=header,
                                        options=options,
                                        uploader=uploader,
                                        target="protein"))

    seqCollection.uploads.append(upload)
    seqCollection.save()
def protein_fasta(outfile_path, organism):
    """Write every protein of ``organism`` to ``outfile_path`` as FASTA.

    Records are identified by the protein's first gene name and carry an
    empty description. An existing file is overwritten.
    """
    with open(outfile_path, "w") as handle:
        for protein in Protein.objects(organism=organism).no_cache():
            record = SeqRecord(id=protein.gene[0],
                               description="",
                               seq=Seq(protein.seq))
            bpio.write(record, handle, "fasta")