Beispiel #1
0
 def offtarget(organism, offtarget_databases, offtarget_names, tmp_dir=None):
     """Run the off-target search for ``organism`` and load each result set.

     :param organism: organism name; also used to build the default work dir.
     :param offtarget_databases: databases handed to ``Offtarget.offtargets``.
     :param offtarget_names: one label per result; the label at index ``i``
         is loaded together with ``results[i]``.
     :param tmp_dir: working directory; when falsy it defaults to
         ``/data/organismos/<organism>/annotation/``.
     """
     work_dir = tmp_dir or "/data/organismos/" + organism + "/annotation/"
     mkdir(work_dir)

     # The proteome fasta is only exported when it is not already on disk.
     fasta_path = work_dir + "proteins.fasta"
     if not os.path.exists(fasta_path):
         BioMongoDB.protein_fasta(fasta_path, organism)

     blast_results = Offtarget.offtargets(fasta_path, work_dir,
                                          offtarget_databases)

     # Same identity/coverage cut-offs are applied to every result set.
     thresholds = dict(min_identity=0.4,
                       min_query_coverage=0.4,
                       min_hit_coverage=0.4)
     for position, feature_name in enumerate(offtarget_names):
         load_blast_features(organism, blast_results[position], feature_name,
                             **thresholds)
def main(argv=None):  # IGNORE:C0111
    """Annotate every model of an organism with the domains of its protein.

    Parses the command line, opens the genome and structure Mongo databases,
    then iterates the models yielded by ``StructureAnotator.iterator``. For
    each model the protein matching the first template's query name is looked
    up and its domains are passed to ``annotate_model`` before saving.

    :param argv: accepted for interface compatibility but not read here;
        ``parse_args()`` is called without arguments, so ``sys.argv`` is used.
    """

    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by user_name on %s.
  Copyright 2015 BIA. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")

    parser.add_argument("-n", "--name", required=True)
    parser.add_argument("-dir", "--structs_dir", required=True)
    parser.add_argument("-db_structure", "--db_structure",help="Mongo structure db", default='pdb')
    parser.add_argument("-db_genome", "--db_genome",help="Mongo proteins db", default='xomeq')
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument('-V', '--version', action='version', version=program_version_message)

    args = parser.parse_args()

    BioMongoDB(args.db_genome)
    db = pymongo.MongoClient(args.db_host)[args.db_structure]

    sa = StructureAnotator(args.structs_dir + "/")
    total = sa.total(db, args.name, {})

    with tqdm(sa.iterator(db, args.name, {}), total=total) as pbar:
        for model in pbar:
            pbar.set_description(model.name)

            template = model.templates[0]
            query_name = template.aln_query.name

            try:
                protein = Protein.objects(organism=args.name, alias=query_name).get()
            except DoesNotExist:
                # Without a matching protein there are no domains to annotate.
                # Skip the model instead of falling through, which would use
                # an unbound ``protein`` (first miss) or the protein left over
                # from the previous model (later misses).
                _log.warning(query_name + " does not exists")
                continue
            sa.annotate_model(model, protein.domains())
            model.save()
Beispiel #3
0
                ) / len(reactions_with_gene)

            ont = self.db.ontologies.find_one({"term": pw.lower()})
            if ont:
                name = ont["name"]
            else:
                name = pw

            pw_obj = PathwaySumary(term=pw,
                                   name=name,
                                   count=pws_dict[pw]["genes"],
                                   properties=pws_dict[pw])

            self.pathways.append(pw_obj)


# Ad-hoc entry point: rebuilds the statistics of the "SaureusN315" collection.
if __name__ == "__main__":
    from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB
    from SNDG.BioMongo.Process.Importer import _common_annotations, _protein_iter, import_kegg_annotation, \
        index_seq_collection, build_statistics, load_pathways

    # Hard-coded target; presumably the "tdr" Mongo database on port 27018
    # (confirm against BioMongoDB's constructor).
    mdb = BioMongoDB("tdr", port=27018)
    # The pathway-annotation and indexing steps below are disabled; only the
    # statistics rebuild at the end is executed.
    # ps = PathwaysAnnotator(mdb.db, "SaureusN315", "/data/organismos/SaureusN315/pathways/")
    # ps.sbml("Red_Staphylo_Curada_rs.sbml")
    # ps.species_filter("allfilters_con_c.dat")
    # ps.extract_genes_from_notes(lambda notes: gene_name_regexp.findall(notes))
    # ps.annotate()
    # index_seq_collection(mdb.db, "SaureusN315", pathways=True, go=True, keywords=True, ec=True, organism_idx=True,
    #                      structure=False)
    build_statistics(mdb.db, "SaureusN315")
Beispiel #4
0
from SNDG.BioMongo.Process.BioDocFactory import BioDocFactory
from SNDG.BioMongo.Model.Protein import Protein, ChEMBL
from SNDG.Network.KEGG import Kegg
from SNDG.BioMongo.Process.Importer import _common_annotations, _protein_iter, import_kegg_annotation, \
    index_seq_collection, build_statistics, load_pathways
from BCBio import GFF
from SNDG.BioMongo.Process.Taxon import Tax
from SNDG.BioMongo.Model.Structure import ModeledStructure, Molecule, ResidueAln, SimpleAlignment, StructureQuality, \
    ExperimentalStructure, Chain,SeqCollection
from SNDG.BioMongo.Model.Alignment import AlnLine
import os
from SNDG.BioMongo.Process.StructureAnotator import StructureAnotator
import Bio.SearchIO as bpsio
from Bio.SeqUtils import seq1, seq3
tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))
mdb = BioMongoDB("tdr", port=27017)
mysqldb = ProteinAnnotator.connect_to_db(database="unipmap",
                                         user="******",
                                         password="******")

orgs = [
    ("Mpylori26695", "Helicobacter pylori 26695 (e-proteobacteria)",
     "/data/organismos/Mpylori26695/GCF_000008525.1_ASM852v1_genomic.gbff",
     85962),
    ("MpyloriIndia", "Helicobacter pylori India7 (e-proteobacteria)",
     "/data/organismos/MpyloriIndia/GCF_000185185.1_ASM18518v1_genomic.gbff",
     907238),
]

for name, org, ann_path, tax in orgs:
    organism = name
Beispiel #5
0
from SNDG.BioMongo.Process.Importer import from_ref_seq, update_proteins, import_prop_blast
from SNDG.BioMongo.Process.BioDocFactory import BioDocFactory
from SNDG.BioMongo.Model.Protein import Protein
from SNDG.Network.KEGG import Kegg
from SNDG.BioMongo.Process.Importer import _common_annotations, _protein_iter, import_kegg_annotation, \
    index_seq_collection, build_statistics, load_pathways
from BCBio import GFF

from SNDG.BioMongo.Model.Structure import ModeledStructure, Molecule, ResidueAln, SimpleAlignment, StructureQuality, \
    ExperimentalStructure,Chain
from SNDG.BioMongo.Model.Alignment import AlnLine
import os
from SNDG.BioMongo.Process.StructureAnotator import StructureAnotator
import Bio.SearchIO as bpsio

# One-off import script: loads the annotation file at ``ann_path`` as "Ainsu2"
# and then exports its proteins to a fasta file.
# NOTE(review): BioMongoDB is not imported in the visible lines; confirm it is
# brought into scope elsewhere.
mdb = BioMongoDB("tdr", port=27017)

name = "Ainsu2"
# ``organism`` and ``org`` are not read by any of the visible statements below.
organism = name
org = "Achromobacter insuavis AXX-A"
# The path uses the directory "Ainsu" while the collection name is "Ainsu2".
ann_path = "/data/organismos/Ainsu/GCF_000219745.1_ASM21974v1_genomic.gbff"
#
from_ref_seq(name, ann_path, cpus=3)
# Writes the proteins of ``name`` under /data/organismos/<name>/annotation/.
mdb.protein_fasta("/data/organismos/" + name + "/annotation/proteins.faa",
                  name)
# from SNDG.Annotation.EMapper import EMapper
# em = EMapper()
# em.read_file("proteins.")
#update_proteins("/tmp/" + name + "/", "/data/organismos/" + name + "/annotation/proteins.faa", name, 1003200, db_init=mysqldb)
#
#
Beispiel #6
0
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB
from SNDG.BioMongo.Process.Importer import index_seq_collection, build_statistics
import pymongo
from tqdm import tqdm

mdb = BioMongoDB("saureus", 27019)

## Script to apply Fede's manual curation.
# Read the whole curation file and split it into "#"-separated records. The
# context manager closes the handle, which a bare open(...).read() left open.
with open(
        "/data/organismos/ILEX_PARA2/curacion/24082018_auto.txt") as curation_handle:
    data = curation_handle.read().split("#")
import re

# ecex = re.compile("^ec")
# for l in tqdm(data):
#     genes, desc, ec = [x.strip() for x in l.strip().split("\n") if x]
#     genes = genes.split("==")
#     #    try:
#     gs = [mdb.db.proteins.find_one({"organism": "ILEX_PARA", "alias": x.strip()}, {"gene": 1})["gene"][0] for x in genes if
#           x.startswith("Ilex")]
#     #    except:
#     #        print(l.strip().split("\n"))
#     ts = [x.strip() for x in genes if x.startswith("ILEX")]
#
#     for g in gs:
#         sets = {"description": desc}
#         if "Caffeine synthase" in desc:
#             num = ""
#             if len(desc.split(" ")) == 3:
#                 num = desc.split(" ")[2]
#             sets["gene"] = [g, "CS" + num]
#             sets["name"] = "CS" + num
Beispiel #7
0
from argparse import RawDescriptionHelpFormatter

os.environ["COMPOUND_TYPES_PATH"] = os.getenv(
    'COMPOUND_TYPES_PATH', "/target/data/compound_type.csv")
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB

# Command-line entry point: connects to the Mongo database and runs
# ``_common_annotations`` for one organism.
if __name__ == "__main__":
    argv = sys.argv

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v",
                        "--verbose",
                        dest="verbose",
                        action="count",
                        help="set verbosity level [default: %(default)s]")

    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    # type=int: without it a value given on the command line arrives as a
    # string while the default is an int, so the two paths disagree on type.
    parser.add_argument("--port", default=27017, type=int)
    parser.add_argument("-db", "--db_name", default='tdr')
    parser.add_argument("--pdbs_path", required=True)
    parser.add_argument("--organism_name", required=True)
    parser.add_argument("--remove_tmp", action='store_true')
    # Same reasoning as --port: keep the CPU count an int on both paths.
    parser.add_argument("--cpu", default=4, type=int)
    parser.add_argument("--tmp_dir", default="./annotation/")

    args = parser.parse_args()

    mdb = BioMongoDB(args.db_name, port=args.port, host=args.db_host)
    # NOTE(review): the meaning of the positional True, False, None flags is
    # defined by _common_annotations, which is not visible here.
    _common_annotations(args.organism_name, args.tmp_dir, args.cpu,
                        args.remove_tmp, True, False, None, args.pdbs_path)
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB
from SNDG.BioMongo.Process.Importer import from_ref_seq, update_proteins, create_proteome
from SNDG.BioMongo.Process.Taxon import tax_db
from SNDG.WebServices.NCBI import ExternalAssembly, mysql_db
from peewee import MySQLDatabase
from SNDG.Sequence.ProteinAnnotator import ProteinAnnotator, Mapping
from SNDG.BioMongo.Process.Index import index_seq_collection, build_statistics

Entrez.email = "*****@*****.**"
_log = logging.getLogger(__name__)
if __name__ == "__main__":
    logger = logging.getLogger('peewee')
    logger.setLevel(logging.INFO)
    init_log()

    mdb = BioMongoDB("saureus")
    tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))
    mysql_db.initialize(MySQLDatabase('sndg', user='******', passwd="mito"))
    assemblies = list(ExternalAssembly.select().where(
        ExternalAssembly.sample_source.is_null(False)))

    ProteinAnnotator.connect_to_db(database="unipmap",
                                   user="******",
                                   password="******")
    with tqdm(assemblies) as pbar:
        for x in pbar:
            if mdb.seq_col_exists(x.assembly_accession):
                continue
            pbar.set_description(x.assembly_accession)
            try:
                dst_dir = "/data/organismos/" + x.assembly_accession + "/annotation/"
Beispiel #9
0
def qa (model_path):
    ...:     if not os.path.exists(model_path + ".json"):
    ...:         assessment = QMean.assesment(model_path)
    ...:         with open(model_path + ".json", "w") as h:
    ...:             json.dump(assessment, h)
    ...:             
    ...: p = Pool(3)
    ...: list(tqdm(p.imap_unordered(qa,model_files,100)))

"""

models_count = len(model_files)

seq_col_id = ObjectId("5b2800b1be737e35a6dd9b8a")

BioMongoDB("tdr")
db = pymongo.MongoClient().pdb

# with tqdm(model_files) as pbar:
#     for model_file in pbar:
#         model_name = model_file.split("/")[-2]
#         pbar.set_description("processing %s" % model_name)
#
#         seq_name = model_file.split("/")[-3]
#         aln = [hit[0] for hit in list(bpsio.read(basepath + "/" + seq_name + "/profile_search.xml", "blast-xml")) if
#                hit.id == model_name.split(seq_name + "_")[1]][0]
#
#         with open(model_file + ".json") as h:
#             assessments = json.load(h)
#         pockets = []
#
Beispiel #10
0
        with open(track_list_path, "w") as handle:
            json.dump(data, handle, indent=4, separators=(',', ': '))


# Command-line entry point: builds the JBrowse genome files for one organism.
if __name__ == "__main__":
    import argparse
    import SNDG

    init_log()
    # NOTE(review): the description says 'Profile utils' although the script
    # drives JBrowse; possibly copied from another tool.
    parser = argparse.ArgumentParser(description='Profile utils')
    parser.add_argument('--db',
                        default="tdr",
                        help='database name. default tdr')
    parser.add_argument('--name', required=True, help='organism name')
    args = parser.parse_args()

    # Set the package-level execution mode to "raw" before any tool is used.
    SNDG.DEFAULT_SNDG_EXEC_MODE = "raw"
    mdb = BioMongoDB(args.db)
    jw = JBrowse(db=mdb.db)

    jw.create_genome(args.name)
    # Spanish message: "the files /tmp/jbrowse_g.gff and /tmp/jbrowse_g.fasta
    # were created".
    print("se crearon los archivos /tmp/jbrowse_g.gff y /tmp/jbrowse_g.fasta")

    # jw.load_sequences("/data/organismos/Pext14-3B/annotation//GCF_000242115.1_Pext14-3B_1.0_genomic.gbff")
    # jw.create_genome("Pext14-3B")

#     for s in [ "15-6324_S3_L001","2003_S4_L001"]:
#         vcf = "/data/projects/PiuriTB/analysis/variant_call_h37/" + s + "/variants.ann.vcf"
#         bam = "/data/projects/PiuriTB/analysis/reads_h37rv_aln/" + s + "/final_bwa.bam"
#         jw.add_strain("H37Rv",s, vcf , bam)
Beispiel #11
0
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB
from SNDG.BioMongo.Model.SeqCollection import SeqCollection, SeqColDruggabilityParam
from SNDG.WebServices.Offtargeting import Offtargeting
from SNDG import init_log, mkdir, execute
from SNDG.WebServices import PROXIES
import os

PROXIES["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"
init_log()

mdb = BioMongoDB("tdr", port=27018)

off_props = {
    "human_offtarget":
    SeqColDruggabilityParam(
        **{
            "target":
            "protein",
            "defaultGroupOperation":
            "max",
            "defaultValue":
            0.6,
            "name":
            "human_offtarget",
            "defaultOperation":
            ">",
            "_cls":
            "SeqColDruggabilityParam",
            "uploader":
            "demo",
            "_class":
Beispiel #12
0
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")

    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument("-db", "--db_name", default='tdr')
    parser.add_argument( "--pdb_entries", default='/data/databases/pdb/entries.idx')
    parser.add_argument( "--pdbs", default='/data/databases/pdb/')
    parser.add_argument( "--pdb_timeout", default=60,type=int)
    parser.add_argument( "--review_pockets", action="store_true")
    parser.add_argument("--organism_name", default=None)


    args = parser.parse_args()

    mdb = BioMongoDB(args.db_name,host=args.db_host)

    pdbUtils = PDBs(pdb_dir=args.pdbs)

    db = MongoClient(args.db_host)["pdb"]
    col_name = "pdb"

    if not os.path.exists(args.pdb_entries):
        sys.stderr.write("%s does not exists" % args.pdb_entries)
        sys.exit(1)

    """
    collection = SeqCollection.objects(name=col_name)
    if len(collection):
        collection = collection.get()
    else:
    parser.add_argument("-l", "--log_path", default=None)

    args = parser.parse_args()
    _log = logging.getLogger("protein_annotation")

    if not args.log_path:
        args.log_path = "/tmp/annotation.log"
    init_log(args.log_path, logging.INFO)

    pa = ProteinAnnotator()
    pa.connect_to_db(database=args.db_annotation,
                     user=args.user_db,
                     password=args.user_pass)

    BioMongoDB(args.mongo_db, host=args.mdb_host, port=args.mdb_port)

    if not os.path.exists(args.blast):
        _log.info(args.blast + " does not exists, running blast...")
        if args.fasta:
            assert os.path.exists(args.fasta), args.fasta + " does not exists"
            fasta = args.fasta
        else:
            _log.info("no fasta input, using proteins from the mongo db")
            fasta = tempfile.mktemp()
            with open(fasta, "w") as h:
                for p in Protein.objects(organism=args.organism):
                    r = SeqRecord(id=p.gene[0], description="", seq=Seq(p.seq))
                    bpio.write(r, h, "fasta")

        execute(
Beispiel #14
0
from SNDG.BioMongo.Process.Importer import load_pathways, build_statistics
from SNDG.BioMongo.Process.BioCyc2Mongo import BioCyc
from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB
from SNDG.BioMongo.Model.SeqCollection import SeqCollection
import pymongo

# One-off script: loads the "cruzi" pathways, pre-builds the BioCyc index for
# that collection and rebuilds its statistics.
port = 27018
mdb = BioMongoDB("tdr", port=port)
# ``db`` is not used by the visible statements below.
db = pymongo.MongoClient(port=port).pdb

load_pathways("cruzi",
              "/data/organismos/cruzi/pathways/pathways-sm.sbml",
              mdb.db,
              "/data/organismos/cruzi/pathways/",
              filter_file="allfilters_con_c.dat")
biocyc = BioCyc(mdb.db)
biocyc.user = BioMongoDB.demo
# .get() fetches the single SeqCollection named "cruzi".
biocyc.pre_build_index(SeqCollection.objects(name="cruzi").get())
build_statistics(mdb.db, "cruzi")
        print "%s tiene pocas proteinas con ec anotados: %i" % (g.name, ecs)


def validate_genome(g):
    """Validate the proteins of ``g`` and report missing ontology indexes.

    Calls ``validate_prots(g)`` and then, for the "ec" and "go" ontologies,
    prints a message when ``col_ont_idx`` holds no entry for ``g.name``.

    :param g: genome object exposing a ``name`` attribute.
    """
    validate_prots(g)
    for x in ["ec", "go"]:
        if db.col_ont_idx.count({
                "ontology": x,
                "seq_collection_name": g.name
        }) == 0:
            # Single-argument call form: prints the same text on Python 2 and
            # is also valid Python 3. Spanish message: "<name> without index".
            print(g.name + " sin indice " + x)


# Sanity-check entry point: validates every demo genome of the "saureus"
# database and reports genomes without statistics.
if __name__ == '__main__':
    BioMongoDB("saureus")
    genomes = list(Genome.objects(auth=BioMongoDB.demo_id))

    # Guard against running on a database with too few genomes loaded.
    assert 100 < len(genomes), len(genomes)

    # Collections whose "statistics" array has no first element.
    no_stats = db.sequence_collection.count(
        {"statistics.0": {
            "$exists": False
        }})
    if no_stats:
        # Single-argument print calls behave the same on Python 2 and are
        # also valid Python 3, unlike the print statement.
        print("there are %i genomes with no stats!!" % no_stats)
    for g in genomes:
        validate_genome(g)

    print("-------------")
    print(to_correct)
Beispiel #16
0
        :param parsed_orthologs: result of Mauve.parse_orthologs
        :return:
        """
        count = {}
        for ortho in parsed_orthologs:
            if ref_num in ortho:
                count[ortho[ref_num]] = len(ortho)
        return count


# Entry point: writes a per-gene conservation table for SaureusN315 and loads
# it as metadata.
if __name__ == '__main__':
    from SNDG import init_log
    from SNDG.BioMongo.Process.BioMongoDB import BioMongoDB

    init_log()
    mdb = BioMongoDB("tdr", port=27018)
    datafile = "/data/organismos/SaureusN315/annotation/conservation/target_props.tsv"
    parsed_orthologs = Mauve.parse_orthologs(
        "/data/organismos/SaureusN315/annotation/conservation/ortologos_staphylo.csv"
    )
    count = Mauve.count_orthologs(parsed_orthologs, "0")
    with open(datafile, "w") as h:
        h.write("id\tconserved_count\tconserved_percent\n")
        # max() raises ValueError when no ortholog group contains the
        # reference, i.e. when ``count`` is empty.
        max_count = max(count.values())
        # A separate loop name keeps the ``count`` dict intact; the original
        # reused ``count`` and rebound it to an int while iterating.
        for gene, gene_count in count.items():
            h.write(gene + "\t" + str(gene_count) + "\t" +
                    ("%0.2f" % (gene_count * 1.0 / max_count)) + "\n")

    mdb.load_metadata("SaureusN315", datafile)
Beispiel #17
0
from SNDG.BioMongo.Process.Importer import from_ref_seq, update_proteins, import_prop_blast,common_annotations
from SNDG.BioMongo.Process.BioDocFactory import BioDocFactory
from SNDG.BioMongo.Model.Protein import Protein
from SNDG.Network.KEGG import Kegg
from SNDG.BioMongo.Process.Importer import _common_annotations, _protein_iter, import_kegg_annotation, \
    index_seq_collection, build_statistics, load_pathways,common_annotations
from BCBio import GFF

from SNDG.BioMongo.Model.Structure import ModeledStructure, Molecule, ResidueAln, SimpleAlignment, StructureQuality, \
    ExperimentalStructure,Chain
from SNDG.BioMongo.Model.Alignment import AlnLine
import os
from SNDG.BioMongo.Process.StructureAnotator import StructureAnotator
import Bio.SearchIO as bpsio

# Script setup for the "tatro" collection. Every processing step that follows
# in the visible lines is commented out, so only these assignments run.
# NOTE(review): BioMongoDB is not imported in the visible lines; confirm it is
# brought into scope elsewhere.
mdb = BioMongoDB("tdr", port=27017)


name = "tatro"
organism = name
org = "Trichoderma atroviride"
ann_path = "/data/organismos/tatro/annotation/corrected.gb"
# mdb.delete_seq_collection(name)
# from_ref_seq(name, ann_path,  cpus=6)
# common_annotations(name, "/data/organismos/tatro/annotation/", cpu=6, remove_tmp=False)
# mdb.protein_fasta("/data/organismos/" + name + "/annotation/proteins.faa", name)
from SNDG.Annotation.EMapper import EMapper
# em = EMapper()
# em.read_file("proteins.")
#update_proteins("/tmp/" + name + "/", "/data/organismos/" + name + "/annotation/proteins.faa", name, 1003200, db_init=mysqldb)
#
    assert 13 == len(genomes), len(genomes)

    for genome in genomes:
        dps = [
            dp[0] for dp in SeqColDruggabilityParam.default_params +
            StructuromeIndexer.search_params +
            BioCyc.protein_pathway_search_params +
            BioCyc.pathways_search_params
        ]
        genome.druggabilityParams = [
            x for x in genome.druggabilityParams if x.name not in dps
        ]
        for name, description, target, _type, options, _, _, _ in (
                BioCyc.protein_pathway_search_params +
                BioCyc.pathways_search_params):
            dp = SeqColDruggabilityParam(name=name,
                                         description=description,
                                         target=target,
                                         type=_type,
                                         uploader="demo")
            genome.druggabilityParams.append(dp)

    biocyc = BioCyc(db)
    biocyc.user = "******"

    mdb = BioMongoDB("tdr")

    for g in genomes:
        validate_genome(g)
    print "OK"
    logger = logging.getLogger('peewee')
    logger.setLevel(logging.INFO)
    init_log()

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-p", "--dbpass", required=True)
    parser.add_argument("-a", "--assemblyAccession", required=True)
    parser.add_argument("-mdb", "--mongodbname", required=True)
    parser.add_argument("-mydbtax", "--mysqldbtaxname", default="bioseqdb")
    parser.add_argument("--cpus", default=multiprocessing.cpu_count())
    parser.add_argument("-mydbunip", "--mysqldbunip", default="unipmap")
    parser.add_argument("-myu", "--mysqldbuser", default="root")

    args = parser.parse_args()
    args.cpus = int(args.cpus)
    mdb = BioMongoDB(args.mongodbname)
    tax_db.initialize(
        MySQLDatabase(args.mysqldbtaxname,
                      user=args.mysqldbuser,
                      passwd=args.dbpass))
    ProteinAnnotator.connect_to_db(database=args.mysqldbunip,
                                   user=args.mysqldbuser,
                                   password=args.dbpass)

    assert not mdb.seq_col_exists(
        args.assemblyAccession), "assembly already exists"
    Entrez.email = "*****@*****.**"
    assembly_id = Entrez.read(
        Entrez.esearch(db="assembly", term=args.assemblyAccession,
                       retmax=1))["IdList"][0]
    resource = Entrez.read(
Beispiel #20
0
                            x.id + "_" +
                            sf.qualifiers["locus_tag"][0].replace(tag, "")
                        ]
            contigs.append(x)
        GFF.write(contigs, h, False)


# Entry point: initialises logging and the taxonomy / Mongo connections, then
# defines the feature helpers below.
if __name__ == '__main__':
    init_log()

    # Silence peewee messages below WARN level.
    logging.getLogger("peewee").setLevel(logging.WARN)
    from peewee import MySQLDatabase
    from SNDG.BioMongo.Process.Taxon import tax_db

    # NOTE(review): database credentials are hard-coded here.
    tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))
    mdb = BioMongoDB("saureus", port=27017)

    # mdb.delete_seq_collection("ILEX_PARA2")


    def extract_annotation_feature(feature):
        """Return the first mRNA sub-feature of a gene, else the feature itself."""
        mrnas = [f for f in feature.sub_features if f.type == "mRNA"]
        return mrnas[0] if feature.type == "gene" and len(mrnas) else feature

    def accept_protein_feature(feature):
        """Tell whether ``feature`` is a gene whose first sub-feature is an mRNA.

        The result of the ``and`` chain is returned as-is, so a falsy
        ``sub_features`` value is returned unchanged rather than ``False``.
        """
        return feature.type == "gene" and feature.sub_features and feature.sub_features[
            0].type == "mRNA"

    # prot_dict = bpio.to_dict(bpio.parse("/data/organismos/ILEX_PARA/contigs/ncbi_IP4.faa","fasta"))
    def extract_sequence(c, f):
        """Return the sequence stored in ``prot_dict`` under ``f.id``.

        ``c`` is not used in the body.
        NOTE(review): within the visible code ``prot_dict`` is only assigned
        in the commented-out line above; confirm it is defined elsewhere.
        """
        return prot_dict[f.id].seq
Beispiel #21
0
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v",
                        "--verbose",
                        dest="verbose",
                        action="count",
                        help="set verbosity level [default: %(default)s]")

    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument("-db", "--db_name", default='tdr')
    parser.add_argument("--pdb_entries",
                        default='/data/databases/pdb/entries.idx')
    parser.add_argument("--pdbs", default='/data/databases/pdb/')

    args = parser.parse_args()

    BioMongoDB(args.db_name)

    pdbUtils = PDBs(pdb_dir=args.pdbs)

    db = MongoClient(args.db_host)["pdb"]
    col_name = "pdb"

    if not os.path.exists(args.pdb_entries):
        sys.stderr.write("%s does not exists" % args.pdb_entries)
        sys.exit(1)
    """
    collection = SeqCollection.objects(name=col_name)
    if len(collection):
        collection = collection.get()
    else:
        collection = SeqCollection(name=col_name, description="Protein Data Bank", organism="?")
def main(argv=None):  # IGNORE:C0111
    '''Parse the command line and refresh the derived PDB properties.

    Exits with status 1 when the CSA file, the pdbs directory or the
    distances table is missing; otherwise runs the update steps in order.

    :param argv: extra arguments; when given they are appended to sys.argv.
    '''

    if argv is not None:
        sys.argv.extend(argv)
    else:
        argv = sys.argv

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")

    # parser.add_argument("-dir", "--structs_dir", required = True )
    # Remaining options only differ in their flags and default value.
    simple_options = (
        (("-db", "--database_name"), 'pdb'),
        (("-host", "--db_host"), '127.0.0.1'),
        (("--csa",), '/data/databases/csa/csa.txt'),
        (("--hmm",), '/data/databases/pdb/pdb_seq_res.hmm'),
        (("--pdbs",), '/data/databases/pdb/'),
        (("--distances",), '/data/databases/pdb/processed/distances.tbl'),
    )
    for flags, fallback in simple_options:
        parser.add_argument(*flags, default=fallback)

    args = parser.parse_args()

    # Disabled steps kept from the original script:
    #         pdbs = PDBs()
    #         pdbs.update('/data/pdb/divided/')

    BioMongoDB(args.database_name)  # args.db_host

    # update_quaternary()
    #         # clusters cd hit
    #         update_clusters()
    #
    # residues near ligands --> metal drug/cofactor

    # Each required input is paired with the message shown when it is missing;
    # the first missing one aborts the run.
    prerequisites = (
        (args.csa,
         "%s not found. Download it from %s" % (
             args.csa,
             "http://www.ebi.ac.uk/thornton-srv/databases/CSA/downloads/CSA_2_0_121113.txt")),
        (args.pdbs,
         "%s not found. Specify where is pdbs/divided directory" % (args.pdbs)),
        (args.distances,
         "%s not found. Run extended_domain.py script to create it." % (args.distances)),
    )
    for required_path, complaint in prerequisites:
        if not os.path.exists(required_path):
            sys.stderr.write(complaint)
            sys.exit(1)

    pdb_utils = PDBs(pdb_dir=args.pdbs)

    # (banner, step, argument) triples executed in this exact order.
    pipeline = (
        ("Update Quaternary", update_quaternary, pdb_utils),
        ("Update CSA", update_csa, args.csa),
        ("Update CYS/TYR", free_cys_tyr, pdb_utils),
        ("Update Importan Pfam", important_pfam, args.hmm),
        ("Update Binding residues", update_binding_residues, args.distances),
    )
    for banner, step, step_input in pipeline:
        print(banner)
        step(step_input)

    _log.info("update pdb properties finished!!")
Beispiel #23
0
def main(argv=None):  # IGNORE:C0111
    """Propagate taxonomy keywords from genomes to their proteins and contigs.

    For every ``sequence_collection`` document that has a ``tax`` entry, the
    taxonomy is resolved through ``get_or_load_by_id`` and handed to
    ``update_element``. When a taxonomy is found, its keywords are written to
    the ``sndg_index.tax`` field and added to the ``keywords`` set of all
    proteins and contigs whose ``organism`` equals the genome name.

    :param argv: accepted for interface compatibility but not read here;
        ``parse_args()`` is called without arguments, so ``sys.argv`` is used.
    """

    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version,
                                                     program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by user_name on %s.
  Copyright 2015 BIA. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    parser = ArgumentParser(description=program_license,
                            formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v",
                        "--verbose",
                        dest="verbose",
                        action="count",
                        help="set verbosity level [default: %(default)s]")

    parser.add_argument("-db_structure",
                        "--db_structure",
                        help="Mongo structure db",
                        default='pdb')
    parser.add_argument("-db_genome",
                        "--db_genome",
                        help="Mongo proteins db",
                        default='saureus')
    # Overwriting stays the default, exactly as before. Because the flag is
    # store_true with default=True it could never be turned off, which made
    # the incremental branch below unreachable; --no-overwrite enables it.
    parser.add_argument('-o', '--overwrite', default=True, action='store_true')
    parser.add_argument('--no-overwrite',
                        dest='overwrite',
                        action='store_false',
                        help="only process documents that have no index yet")
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version=program_version_message)

    args = parser.parse_args()

    # This handle is only used by the disabled structure-indexing code below.
    db = pymongo.MongoClient(args.db_host)[args.db_structure]
    BioMongoDB(args.db_genome)
    logging.getLogger("peewee").setLevel(logging.WARN)
    from peewee import MySQLDatabase
    from SNDG.BioMongo.Process.Taxon import tax_db
    # NOTE(review): database credentials are hard-coded here.
    tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))

    # Cache taxonomies under both their NCBI id and each lower-cased name.
    tax_cache = {}
    for t in Taxonomy.objects().no_cache():
        # Hoisted out of the name loop: the id entry does not depend on the
        # name, and taxa without names are now cached by id as well.
        tax_cache[t.ncbi_taxon_id] = t
        for n in t.names:
            tax_cache[n.lower()] = t
    query = {}
    idx_name = "sndg_index"
    if not args.overwrite:
        # Incremental mode: restrict to documents without the index field.
        query = {idx_name: {"$exists": 0}}
    # total = db.structures.count(query)
    # with tqdm(db.structures.find(query, {"organism": 1}), total=total) as pbar:
    #     for struct in pbar:
    #         if "organism" in struct:
    #             for org in [x for x in set(struct["organism"].lower().split(";") + struct["organism"].lower().split(",") +
    #                   [struct["organism"].lower().split("(")[0]]) if ";" not in x and "," not in x and "(" not in x]:
    #                 org = org.strip()
    #                 val = get_or_load_by_name(org, tax_cache)
    #                 if val:
    #                     db.structures.update({"_id": struct["_id"]}, {"$set": {idx_name + ".tax": list(val.keywords)}})
    #                 else:
    #                     tax_cache[org.lower()] = None
    #                     _log.warn(org + " not found")

    # db.structures.update({"ligands.0":{"$exists",1}},  {"$set": {idx_name + ".ligand": 1}},multi=True);

    # From here on ``db`` refers to the genome database.
    db = pymongo.MongoClient(args.db_host)[args.db_genome]

    # total = db.barcodes.count(query)
    # with tqdm(db.barcodes.find(query, {"tax": 1}), total=total) as pbar:
    #     for barcode in pbar:
    #         val = get_or_load_by_id(barcode["tax"], tax_cache)
    #         update_element(val, db.barcodes, barcode, idx_name, tax_cache,barcode["tax"])

    total = db.sequence_collection.count(query)
    with tqdm(db.sequence_collection.find(query, {
            "name": 1,
            "tax": 1,
            "assemblyStatus": 1
    },
                                          no_cursor_timeout=True),
              total=total) as pbar:
        for genome in pbar:
            if "tax" in genome:
                val = get_or_load_by_id(int(genome["tax"]["tid"]), tax_cache)
                update_element(val, db.sequence_collection, genome, idx_name,
                               tax_cache, genome["tax"]["tid"])
                if val:
                    select = {"organism": genome["name"]}
                    kws = list(val.keywords)
                    db.proteins.update(select,
                                       {"$set": {
                                           idx_name + ".tax": kws
                                       }},
                                       multi=True)
                    db.proteins.update(
                        select, {"$addToSet": {
                            "keywords": {
                                "$each": kws
                            }
                        }},
                        multi=True)
                    # NOTE(review): genome["assemblyStatus"] raises KeyError
                    # for a collection stored without that field.
                    db.contig_collection.update(select, {
                        "$set": {
                            idx_name + ".tax": kws,
                            idx_name + ".assemblyStatus":
                            genome["assemblyStatus"]
                        }
                    },
                                                multi=True)
                    db.contig_collection.update(
                        select, {"$addToSet": {
                            "keywords": {
                                "$each": kws
                            }
                        }},
                        multi=True)

    print("Ok")