def offtarget(organism, offtarget_databases, offtarget_names, tmp_dir=None):
    """Run the off-target search for an organism and load the hits as features.

    The organism proteome is exported to ``<tmp_dir>proteins.fasta`` (only if
    that file does not exist yet), searched against every database in
    ``offtarget_databases`` and each result is loaded with
    ``load_blast_features`` under the name found at the same position of
    ``offtarget_names``.

    :param organism: organism / sequence collection name.
    :param offtarget_databases: databases handed to ``Offtarget.offtargets``.
    :param offtarget_names: feature names, one per result, matched by index.
    :param tmp_dir: working directory; defaults to
        ``/data/organismos/<organism>/annotation/``.
    """
    if not tmp_dir:
        tmp_dir = "/data/organismos/" + organism + "/annotation/"
    # Paths below are built by plain concatenation, so make sure the directory
    # ends with a separator. Joining with "" appends one only when it is
    # missing, leaving already well-formed values untouched.
    tmp_dir = os.path.join(tmp_dir, "")
    mkdir(tmp_dir)

    proteins = tmp_dir + "proteins.fasta"
    if not os.path.exists(proteins):
        # NOTE(review): protein_fasta is called on the class here; confirm it
        # does not need a BioMongoDB instance.
        BioMongoDB.protein_fasta(proteins, organism)

    results = Offtarget.offtargets(proteins, tmp_dir, offtarget_databases)
    for i, name in enumerate(offtarget_names):
        load_blast_features(organism, results[i], name,
                            min_identity=0.4,
                            min_query_coverage=0.4,
                            min_hit_coverage=0.4)
def main(argv=None):  # IGNORE:C0111
    """Annotate every stored model of an organism with its protein domains.

    Command line entry point: parses the options, connects to the genome and
    structure Mongo databases and, for each model returned by
    ``StructureAnotator.iterator``, looks up the protein whose alias matches
    the query of the model's first template, annotates the model with that
    protein's domains and saves it.

    :param argv: accepted for interface compatibility; options are always
        read from ``sys.argv`` by ``parser.parse_args()``.
    """
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by user_name on %s.
  Copyright 2015 BIA. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    parser.add_argument("-n", "--name", required=True)
    parser.add_argument("-dir", "--structs_dir", required=True)
    parser.add_argument("-db_structure", "--db_structure", help="Mongo structure db", default='pdb')
    parser.add_argument("-db_genome", "--db_genome", help="Mongo proteins db", default='xomeq')
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument('-V', '--version', action='version', version=program_version_message)
    args = parser.parse_args()

    BioMongoDB(args.db_genome)
    db = pymongo.MongoClient(args.db_host)[args.db_structure]

    sa = StructureAnotator(args.structs_dir + "/")
    total = sa.total(db, args.name, {})

    with tqdm(sa.iterator(db, args.name, {}), total=total) as pbar:
        for model in pbar:
            pbar.set_description(model.name)
            template = model.templates[0]
            try:
                protein = Protein.objects(organism=args.name, alias=template.aln_query.name).get()
            except DoesNotExist:
                _log.warning(template.aln_query.name + " does not exists")
                # Without a protein there is nothing to annotate. Falling
                # through would raise NameError on the first miss, or reuse
                # the protein of the previous model on later ones.
                continue
            sa.annotate_model(model, protein.domains())
            model.save()
) / len(reactions_with_gene) ont = self.db.ontologies.find_one({"term": pw.lower()}) if ont: name = ont["name"] else: name = pw pw_obj = PathwaySumary(term=pw, name=name, count=pws_dict[pw]["genes"], properties=pws_dict[pw]) self.pathways.append(pw_obj) if __name__ == "__main__": from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB from SNDG.BioMongo.Process.Importer import _common_annotations, _protein_iter, import_kegg_annotation, \ index_seq_collection, build_statistics, load_pathways mdb = BioMongoDB("tdr", port=27018) # ps = PathwaysAnnotator(mdb.db, "SaureusN315", "/data/organismos/SaureusN315/pathways/") # ps.sbml("Red_Staphylo_Curada_rs.sbml") # ps.species_filter("allfilters_con_c.dat") # ps.extract_genes_from_notes(lambda notes: gene_name_regexp.findall(notes)) # ps.annotate() # index_seq_collection(mdb.db, "SaureusN315", pathways=True, go=True, keywords=True, ec=True, organism_idx=True, # structure=False) build_statistics(mdb.db, "SaureusN315")
from SNDG.BioMongo.Process.BioDocFactory import BioDocFactory from SNDG.BioMongo.Model.Protein import Protein, ChEMBL from SNDG.Network.KEGG import Kegg from SNDG.BioMongo.Process.Importer import _common_annotations, _protein_iter, import_kegg_annotation, \ index_seq_collection, build_statistics, load_pathways from BCBio import GFF from SNDG.BioMongo.Process.Taxon import Tax from SNDG.BioMongo.Model.Structure import ModeledStructure, Molecule, ResidueAln, SimpleAlignment, StructureQuality, \ ExperimentalStructure, Chain,SeqCollection from SNDG.BioMongo.Model.Alignment import AlnLine import os from SNDG.BioMongo.Process.StructureAnotator import StructureAnotator import Bio.SearchIO as bpsio from Bio.SeqUtils import seq1, seq3 tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito")) mdb = BioMongoDB("tdr", port=27017) mysqldb = ProteinAnnotator.connect_to_db(database="unipmap", user="******", password="******") orgs = [ ("Mpylori26695", "Helicobacter pylori 26695 (e-proteobacteria)", "/data/organismos/Mpylori26695/GCF_000008525.1_ASM852v1_genomic.gbff", 85962), ("MpyloriIndia", "Helicobacter pylori India7 (e-proteobacteria)", "/data/organismos/MpyloriIndia/GCF_000185185.1_ASM18518v1_genomic.gbff", 907238), ] for name, org, ann_path, tax in orgs: organism = name
from SNDG.BioMongo.Process.Importer import from_ref_seq, update_proteins, import_prop_blast from SNDG.BioMongo.Process.BioDocFactory import BioDocFactory from SNDG.BioMongo.Model.Protein import Protein from SNDG.Network.KEGG import Kegg from SNDG.BioMongo.Process.Importer import _common_annotations, _protein_iter, import_kegg_annotation, \ index_seq_collection, build_statistics, load_pathways from BCBio import GFF from SNDG.BioMongo.Model.Structure import ModeledStructure, Molecule, ResidueAln, SimpleAlignment, StructureQuality, \ ExperimentalStructure,Chain from SNDG.BioMongo.Model.Alignment import AlnLine import os from SNDG.BioMongo.Process.StructureAnotator import StructureAnotator import Bio.SearchIO as bpsio mdb = BioMongoDB("tdr", port=27017) name = "Ainsu2" organism = name org = "Achromobacter insuavis AXX-A" ann_path = "/data/organismos/Ainsu/GCF_000219745.1_ASM21974v1_genomic.gbff" # from_ref_seq(name, ann_path, cpus=3) mdb.protein_fasta("/data/organismos/" + name + "/annotation/proteins.faa", name) # from SNDG.Annotation.EMapper import EMapper # em = EMapper() # em.read_file("proteins.") #update_proteins("/tmp/" + name + "/", "/data/organismos/" + name + "/annotation/proteins.faa", name, 1003200, db_init=mysqldb) # #
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB from SNDG.BioMongo.Process.Importer import index_seq_collection, build_statistics import pymongo from tqdm import tqdm mdb = BioMongoDB("saureus", 27019) ## Script para aplicar el curado manual de fede data = open( "/data/organismos/ILEX_PARA2/curacion/24082018_auto.txt").read().split("#") import re # ecex = re.compile("^ec") # for l in tqdm(data): # genes, desc, ec = [x.strip() for x in l.strip().split("\n") if x] # genes = genes.split("==") # # try: # gs = [mdb.db.proteins.find_one({"organism": "ILEX_PARA", "alias": x.strip()}, {"gene": 1})["gene"][0] for x in genes if # x.startswith("Ilex")] # # except: # # print(l.strip().split("\n")) # ts = [x.strip() for x in genes if x.startswith("ILEX")] # # for g in gs: # sets = {"description": desc} # if "Caffeine synthase" in desc: # num = "" # if len(desc.split(" ")) == 3: # num = desc.split(" ")[2] # sets["gene"] = [g, "CS" + num] # sets["name"] = "CS" + num
from argparse import RawDescriptionHelpFormatter

# Keep a COMPOUND_TYPES_PATH already present in the environment, otherwise
# fall back to the bundled location. It is set before the SNDG import below
# (presumably because SNDG reads it at import time - TODO confirm).
os.environ["COMPOUND_TYPES_PATH"] = os.getenv(
    'COMPOUND_TYPES_PATH', "/target/data/compound_type.csv")

from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB

if __name__ == "__main__":
    # Command line wrapper: connect to the Mongo database and run the common
    # annotation pipeline for one organism.
    argv = sys.argv

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    # type=int: without it a value given on the command line is a str while
    # the default is an int, so the callee would see two different types.
    parser.add_argument("--port", default=27017, type=int)
    parser.add_argument("-db", "--db_name", default='tdr')
    parser.add_argument("--pdbs_path", required=True)
    parser.add_argument("--organism_name", required=True)
    parser.add_argument("--remove_tmp", action='store_true')
    parser.add_argument("--cpu", default=4, type=int)
    parser.add_argument("--tmp_dir", default="./annotation/")
    args = parser.parse_args()

    mdb = BioMongoDB(args.db_name, port=args.port, host=args.db_host)
    _common_annotations(args.organism_name, args.tmp_dir, args.cpu, args.remove_tmp,
                        True, False, None, args.pdbs_path)
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB from SNDG.BioMongo.Process.Importer import from_ref_seq, update_proteins, create_proteome from SNDG.BioMongo.Process.Taxon import tax_db from SNDG.WebServices.NCBI import ExternalAssembly, mysql_db from peewee import MySQLDatabase from SNDG.Sequence.ProteinAnnotator import ProteinAnnotator, Mapping from SNDG.BioMongo.Process.Index import index_seq_collection, build_statistics Entrez.email = "*****@*****.**" _log = logging.getLogger(__name__) if __name__ == "__main__": logger = logging.getLogger('peewee') logger.setLevel(logging.INFO) init_log() mdb = BioMongoDB("saureus") tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito")) mysql_db.initialize(MySQLDatabase('sndg', user='******', passwd="mito")) assemblies = list(ExternalAssembly.select().where( ExternalAssembly.sample_source.is_null(False))) ProteinAnnotator.connect_to_db(database="unipmap", user="******", password="******") with tqdm(assemblies) as pbar: for x in pbar: if mdb.seq_col_exists(x.assembly_accession): continue pbar.set_description(x.assembly_accession) try: dst_dir = "/data/organismos/" + x.assembly_accession + "/annotation/"
def qa (model_path): ...: if not os.path.exists(model_path + ".json"): ...: assessment = QMean.assesment(model_path) ...: with open(model_path + ".json", "w") as h: ...: json.dump(assessment, h) ...: ...: p = Pool(3) ...: list(tqdm(p.imap_unordered(qa,model_files,100))) """ models_count = len(model_files) seq_col_id = ObjectId("5b2800b1be737e35a6dd9b8a") BioMongoDB("tdr") db = pymongo.MongoClient().pdb # with tqdm(model_files) as pbar: # for model_file in pbar: # model_name = model_file.split("/")[-2] # pbar.set_description("processing %s" % model_name) # # seq_name = model_file.split("/")[-3] # aln = [hit[0] for hit in list(bpsio.read(basepath + "/" + seq_name + "/profile_search.xml", "blast-xml")) if # hit.id == model_name.split(seq_name + "_")[1]][0] # # with open(model_file + ".json") as h: # assessments = json.load(h) # pockets = [] #
with open(track_list_path, "w") as handle: json.dump(data, handle, indent=4, separators=(',', ': ')) if __name__ == "__main__": import argparse import SNDG init_log() parser = argparse.ArgumentParser(description='Profile utils') parser.add_argument('--db', default="tdr", help='database name. default tdr') parser.add_argument('--name', required=True, help='organism name') args = parser.parse_args() SNDG.DEFAULT_SNDG_EXEC_MODE = "raw" mdb = BioMongoDB(args.db) jw = JBrowse(db=mdb.db) jw.create_genome(args.name) print("se crearon los archivos /tmp/jbrowse_g.gff y /tmp/jbrowse_g.fasta") # jw.load_sequences("/data/organismos/Pext14-3B/annotation//GCF_000242115.1_Pext14-3B_1.0_genomic.gbff") # jw.create_genome("Pext14-3B") # for s in [ "15-6324_S3_L001","2003_S4_L001"]: # vcf = "/data/projects/PiuriTB/analysis/variant_call_h37/" + s + "/variants.ann.vcf" # bam = "/data/projects/PiuriTB/analysis/reads_h37rv_aln/" + s + "/final_bwa.bam" # jw.add_strain("H37Rv",s, vcf , bam)
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB from SNDG.BioMongo.Model.SeqCollection import SeqCollection, SeqColDruggabilityParam from SNDG.WebServices.Offtargeting import Offtargeting from SNDG import init_log, mkdir, execute from SNDG.WebServices import PROXIES import os PROXIES["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080" init_log() mdb = BioMongoDB("tdr", port=27018) off_props = { "human_offtarget": SeqColDruggabilityParam( **{ "target": "protein", "defaultGroupOperation": "max", "defaultValue": 0.6, "name": "human_offtarget", "defaultOperation": ">", "_cls": "SeqColDruggabilityParam", "uploader": "demo", "_class":
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter) parser.add_argument("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %(default)s]") parser.add_argument("-host", "--db_host", default='127.0.0.1') parser.add_argument("-db", "--db_name", default='tdr') parser.add_argument( "--pdb_entries", default='/data/databases/pdb/entries.idx') parser.add_argument( "--pdbs", default='/data/databases/pdb/') parser.add_argument( "--pdb_timeout", default=60,type=int) parser.add_argument( "--review_pockets", action="store_true") parser.add_argument("--organism_name", default=None) args = parser.parse_args() mdb = BioMongoDB(args.db_name,host=args.db_host) pdbUtils = PDBs(pdb_dir=args.pdbs) db = MongoClient(args.db_host)["pdb"] col_name = "pdb" if not os.path.exists(args.pdb_entries): sys.stderr.write("%s does not exists" % args.pdb_entries) sys.exit(1) """ collection = SeqCollection.objects(name=col_name) if len(collection): collection = collection.get() else:
parser.add_argument("-l", "--log_path", default=None) args = parser.parse_args() _log = logging.getLogger("protein_annotation") if not args.log_path: args.log_path = "/tmp/annotation.log" init_log(args.log_path, logging.INFO) pa = ProteinAnnotator() pa.connect_to_db(database=args.db_annotation, user=args.user_db, password=args.user_pass) BioMongoDB(args.mongo_db, host=args.mdb_host, port=args.mdb_port) if not os.path.exists(args.blast): _log.info(args.blast + " does not exists, running blast...") if args.fasta: assert os.path.exists(args.fasta), args.fasta + " does not exists" fasta = args.fasta else: _log.info("no fasta input, using proteins from the mongo db") fasta = tempfile.mktemp() with open(fasta, "w") as h: for p in Protein.objects(organism=args.organism): r = SeqRecord(id=p.gene[0], description="", seq=Seq(p.seq)) bpio.write(r, h, "fasta") execute(
from SNDG.BioMongo.Process.Importer import load_pathways, build_statistics
from SNDG.BioMongo.Process.BioCyc2Mongo import BioCyc
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB
from SNDG.BioMongo.Model.SeqCollection import SeqCollection
import pymongo

# One-off loader: import the "cruzi" pathways from SBML, pre-build the BioCyc
# index for that sequence collection and refresh its statistics.
port = 27018
organism = "cruzi"
pathways_dir = "/data/organismos/" + organism + "/pathways/"

mdb = BioMongoDB("tdr", port=port)
db = pymongo.MongoClient(port=port).pdb

load_pathways(organism,
              pathways_dir + "pathways-sm.sbml",
              mdb.db,
              pathways_dir,
              filter_file="allfilters_con_c.dat")

biocyc = BioCyc(mdb.db)
biocyc.user = BioMongoDB.demo
seq_collection = SeqCollection.objects(name=organism).get()
biocyc.pre_build_index(seq_collection)

build_statistics(mdb.db, organism)
print "%s tiene pocas proteinas con ec anotados: %i" % (g.name, ecs) def validate_genome(g): validate_prots(g) for x in ["ec", "go"]: if db.col_ont_idx.count({ "ontology": x, "seq_collection_name": g.name }) == 0: print g.name + " sin indice " + x if __name__ == '__main__': BioMongoDB("saureus") genomes = list(Genome.objects(auth=BioMongoDB.demo_id)) assert 100 < len(genomes), len(genomes) no_stats = db.sequence_collection.count( {"statistics.0": { "$exists": False }}) if no_stats: print "there are %i genomes with no stats!!" % no_stats for g in genomes: validate_genome(g) print "-------------" print to_correct
:param parsed_orthologs: result of Mauve.parse_orthologs :return: """ count = {} for ortho in parsed_orthologs: if ref_num in ortho: count[ortho[ref_num]] = len(ortho) return count if __name__ == '__main__': from SNDG import init_log from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB init_log() mdb = BioMongoDB("tdr", port=27018) datafile = "/data/organismos/SaureusN315/annotation/conservation/target_props.tsv" parsed_orthologs = Mauve.parse_orthologs( "/data/organismos/SaureusN315/annotation/conservation/ortologos_staphylo.csv" ) count = Mauve.count_orthologs(parsed_orthologs, "0") with open(datafile, "w") as h: h.write("id\tconserved_count\tconserved_percent\n") max_count = max(count.values()) for gene, count in count.items(): h.write(gene + "\t" + str(count) + "\t" + ("%0.2f" % (count * 1.0 / max_count)) + "\n") mdb.load_metadata("SaureusN315", datafile)
from SNDG.BioMongo.Process.Importer import from_ref_seq, update_proteins, import_prop_blast,common_annotations from SNDG.BioMongo.Process.BioDocFactory import BioDocFactory from SNDG.BioMongo.Model.Protein import Protein from SNDG.Network.KEGG import Kegg from SNDG.BioMongo.Process.Importer import _common_annotations, _protein_iter, import_kegg_annotation, \ index_seq_collection, build_statistics, load_pathways,common_annotations from BCBio import GFF from SNDG.BioMongo.Model.Structure import ModeledStructure, Molecule, ResidueAln, SimpleAlignment, StructureQuality, \ ExperimentalStructure,Chain from SNDG.BioMongo.Model.Alignment import AlnLine import os from SNDG.BioMongo.Process.StructureAnotator import StructureAnotator import Bio.SearchIO as bpsio mdb = BioMongoDB("tdr", port=27017) name = "tatro" organism = name org = "Trichoderma atroviride" ann_path = "/data/organismos/tatro/annotation/corrected.gb" # mdb.delete_seq_collection(name) # from_ref_seq(name, ann_path, cpus=6) # common_annotations(name, "/data/organismos/tatro/annotation/", cpu=6, remove_tmp=False) # mdb.protein_fasta("/data/organismos/" + name + "/annotation/proteins.faa", name) from SNDG.Annotation.EMapper import EMapper # em = EMapper() # em.read_file("proteins.") #update_proteins("/tmp/" + name + "/", "/data/organismos/" + name + "/annotation/proteins.faa", name, 1003200, db_init=mysqldb) #
assert 13 == len(genomes), len(genomes) for genome in genomes: dps = [ dp[0] for dp in SeqColDruggabilityParam.default_params + StructuromeIndexer.search_params + BioCyc.protein_pathway_search_params + BioCyc.pathways_search_params ] genome.druggabilityParams = [ x for x in genome.druggabilityParams if x.name not in dps ] for name, description, target, _type, options, _, _, _ in ( BioCyc.protein_pathway_search_params + BioCyc.pathways_search_params): dp = SeqColDruggabilityParam(name=name, description=description, target=target, type=_type, uploader="demo") genome.druggabilityParams.append(dp) biocyc = BioCyc(db) biocyc.user = "******" mdb = BioMongoDB("tdr") for g in genomes: validate_genome(g) print "OK"
logger = logging.getLogger('peewee') logger.setLevel(logging.INFO) init_log() parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter) parser.add_argument("-p", "--dbpass", required=True) parser.add_argument("-a", "--assemblyAccession", required=True) parser.add_argument("-mdb", "--mongodbname", required=True) parser.add_argument("-mydbtax", "--mysqldbtaxname", default="bioseqdb") parser.add_argument("--cpus", default=multiprocessing.cpu_count()) parser.add_argument("-mydbunip", "--mysqldbunip", default="unipmap") parser.add_argument("-myu", "--mysqldbuser", default="root") args = parser.parse_args() args.cpus = int(args.cpus) mdb = BioMongoDB(args.mongodbname) tax_db.initialize( MySQLDatabase(args.mysqldbtaxname, user=args.mysqldbuser, passwd=args.dbpass)) ProteinAnnotator.connect_to_db(database=args.mysqldbunip, user=args.mysqldbuser, password=args.dbpass) assert not mdb.seq_col_exists( args.assemblyAccession), "assembly already exists" Entrez.email = "*****@*****.**" assembly_id = Entrez.read( Entrez.esearch(db="assembly", term=args.assemblyAccession, retmax=1))["IdList"][0] resource = Entrez.read(
x.id + "_" + sf.qualifiers["locus_tag"][0].replace(tag, "") ] contigs.append(x) GFF.write(contigs, h, False) if __name__ == '__main__': init_log() logging.getLogger("peewee").setLevel(logging.WARN) from peewee import MySQLDatabase from SNDG.BioMongo.Process.Taxon import tax_db tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito")) mdb = BioMongoDB("saureus", port=27017) # mdb.delete_seq_collection("ILEX_PARA2") def extract_annotation_feature(feature): mrnas = [f for f in feature.sub_features if f.type == "mRNA"] return mrnas[0] if feature.type == "gene" and len(mrnas) else feature def accept_protein_feature(feature): return feature.type == "gene" and feature.sub_features and feature.sub_features[ 0].type == "mRNA" # prot_dict = bpio.to_dict(bpio.parse("/data/organismos/ILEX_PARA/contigs/ncbi_IP4.faa","fasta")) def extract_sequence(c, f): return prot_dict[f.id].seq
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter) parser.add_argument("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %(default)s]") parser.add_argument("-host", "--db_host", default='127.0.0.1') parser.add_argument("-db", "--db_name", default='tdr') parser.add_argument("--pdb_entries", default='/data/databases/pdb/entries.idx') parser.add_argument("--pdbs", default='/data/databases/pdb/') args = parser.parse_args() BioMongoDB(args.db_name) pdbUtils = PDBs(pdb_dir=args.pdbs) db = MongoClient(args.db_host)["pdb"] col_name = "pdb" if not os.path.exists(args.pdb_entries): sys.stderr.write("%s does not exists" % args.pdb_entries) sys.exit(1) """ collection = SeqCollection.objects(name=col_name) if len(collection): collection = collection.get() else: collection = SeqCollection(name=col_name, description="Protein Data Bank", organism="?")
def main(argv=None):  # IGNORE:C0111
    '''Entry point: refresh the derived properties of the stored PDB structures.

    Parses the command line, checks that the required input files exist
    (exiting with status 1 on the first one that is missing) and then runs
    the update steps in order: quaternary, CSA, CYS/TYR, Pfam and binding
    residues.
    '''
    if argv is not None:
        sys.argv.extend(argv)
    else:
        argv = sys.argv

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    parser.add_argument("-db", "--database_name", default='pdb')
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument("--csa", default='/data/databases/csa/csa.txt')
    parser.add_argument("--hmm", default='/data/databases/pdb/pdb_seq_res.hmm')
    parser.add_argument("--pdbs", default='/data/databases/pdb/')
    parser.add_argument("--distances", default='/data/databases/pdb/processed/distances.tbl')
    args = parser.parse_args()

    # Only the database name is used for the connection; --db_host is parsed
    # but not passed on.
    BioMongoDB(args.database_name)

    # Required inputs, checked in this order; each entry is (path, message).
    csa_url = "http://www.ebi.ac.uk/thornton-srv/databases/CSA/downloads/CSA_2_0_121113.txt"
    required_inputs = (
        (args.csa,
         "%s not found. Download it from %s" % (args.csa, csa_url)),
        (args.pdbs,
         "%s not found. Specify where is pdbs/divided directory" % args.pdbs),
        (args.distances,
         "%s not found. Run extended_domain.py script to create it." % args.distances),
    )
    for required_path, missing_message in required_inputs:
        if not os.path.exists(required_path):
            sys.stderr.write(missing_message)
            sys.exit(1)

    pdbUtils = PDBs(pdb_dir=args.pdbs)

    # Each step announces itself on stdout right before it runs.
    steps = (
        ("Update Quaternary", lambda: update_quaternary(pdbUtils)),
        ("Update CSA", lambda: update_csa(args.csa)),
        ("Update CYS/TYR", lambda: free_cys_tyr(pdbUtils)),
        ("Update Importan Pfam", lambda: important_pfam(args.hmm)),
        ("Update Binding residues", lambda: update_binding_residues(args.distances)),
    )
    for label, run_step in steps:
        print(label)
        run_step()

    _log.info("update pdb properties finished!!")
def main(argv=None):  # IGNORE:C0111
    """Write taxonomy keywords into the search index of genomes and their data.

    Command line entry point. Loads every ``Taxonomy`` document into an
    in-memory cache, then walks ``sequence_collection`` of the genome
    database: for each genome carrying a ``tax`` entry the taxon is resolved,
    the genome itself is updated through ``update_element`` and, when the
    taxon was found, its keywords are written to the proteins and contigs of
    that organism.

    :param argv: accepted for interface compatibility; options are always
        read from ``sys.argv`` by ``parser.parse_args()``.
    """
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by user_name on %s.
  Copyright 2015 BIA. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")
    parser.add_argument("-db_structure", "--db_structure", help="Mongo structure db", default='pdb')
    parser.add_argument("-db_genome", "--db_genome", help="Mongo proteins db", default='saureus')
    # NOTE: store_true combined with default=True means this option is always
    # True, so the "only documents without an index" query below is never used.
    parser.add_argument('-o', '--overwrite', default=True, action='store_true')
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument('-V', '--version', action='version', version=program_version_message)
    args = parser.parse_args()

    # Structure database handle; currently only the disabled block below uses it.
    db = pymongo.MongoClient(args.db_host)[args.db_structure]
    BioMongoDB(args.db_genome)

    logging.getLogger("peewee").setLevel(logging.WARN)
    from peewee import MySQLDatabase
    from SNDG.BioMongo.Process.Taxon import tax_db
    # NOTE(review): database credentials are hard-coded here.
    tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))

    # Cache every taxon under each of its lower-cased names and its NCBI id.
    tax_cache = {}
    for t in Taxonomy.objects().no_cache():
        for n in t.names:
            tax_cache[n.lower()] = t
        tax_cache[t.ncbi_taxon_id] = t

    query = {}
    idx_name = "sndg_index"
    if not args.overwrite:
        query = {idx_name: {"$exists": 0}}

    # Disabled: taxonomy / ligand indexing of the structures collection.
    # total = db.structures.count(query)
    # with tqdm(db.structures.find(query, {"organism": 1}), total=total) as pbar:
    #     for struct in pbar:
    #         if "organism" in struct:
    #             for org in [x for x in set(struct["organism"].lower().split(";") + struct["organism"].lower().split(",") +
    #                                        [struct["organism"].lower().split("(")[0]]) if ";" not in x and "," not in x and "(" not in x]:
    #                 org = org.strip()
    #                 val = get_or_load_by_name(org, tax_cache)
    #                 if val:
    #                     db.structures.update({"_id": struct["_id"]}, {"$set": {idx_name + ".tax": list(val.keywords)}})
    #                 else:
    #                     tax_cache[org.lower()] = None
    #                     _log.warn(org + " not found")
    # db.structures.update({"ligands.0":{"$exists",1}}, {"$set": {idx_name + ".ligand": 1}},multi=True);

    # From here on "db" is the genome database.
    db = pymongo.MongoClient(args.db_host)[args.db_genome]

    # Disabled: taxonomy indexing of barcodes.
    # total = db.barcodes.count(query)
    # with tqdm(db.barcodes.find(query, {"tax": 1}), total=total) as pbar:
    #     for barcode in pbar:
    #         val = get_or_load_by_id(barcode["tax"], tax_cache)
    #         update_element(val, db.barcodes, barcode, idx_name, tax_cache,barcode["tax"])

    total = db.sequence_collection.count(query)
    cursor = db.sequence_collection.find(
        query, {"name": 1, "tax": 1, "assemblyStatus": 1}, no_cursor_timeout=True)
    try:
        with tqdm(cursor, total=total) as pbar:
            for genome in pbar:
                if "tax" not in genome:
                    continue
                val = get_or_load_by_id(int(genome["tax"]["tid"]), tax_cache)
                update_element(val, db.sequence_collection, genome, idx_name, tax_cache,
                               genome["tax"]["tid"])
                if not val:
                    continue

                select = {"organism": genome["name"]}
                kws = list(val.keywords)
                add_keywords = {"$addToSet": {"keywords": {"$each": kws}}}

                db.proteins.update(select, {"$set": {idx_name + ".tax": kws}}, multi=True)
                db.proteins.update(select, add_keywords, multi=True)

                # NOTE(review): raises KeyError for a genome document that has
                # no "assemblyStatus" field - confirm every document has one.
                db.contig_collection.update(select, {
                    "$set": {
                        idx_name + ".tax": kws,
                        idx_name + ".assemblyStatus": genome["assemblyStatus"]
                    }
                }, multi=True)
                db.contig_collection.update(select, add_keywords, multi=True)
    finally:
        # The server never times out a no_cursor_timeout cursor, so it has to
        # be closed explicitly, also when the loop is interrupted by an error.
        cursor.close()

    print("Ok")