Example #1
 def save(self, file_path):
     # ensure the target directory exists before writing
     mkdir(os.path.dirname(os.path.abspath(file_path)))
     with open(file_path, "w") as handle:
         # serialize only the pockets above the druggability threshold
         json.dump(
             [{"number": p.pocket_num, "residues": p.residues, "as_lines": p.alpha_spheres, "atoms": p.atoms,
               "properties": p.properties}
              for p in self.pockets if p.properties["Druggability Score"] > 0.2], handle)
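
Examples #9 and #11 below show how this save() is reached; a minimal usage sketch (the PDB path is hypothetical):

    r = FPocket("/tmp/pdb4oke.ent").hunt_pockets()
    r.save("/tmp/4oke.pockets.json")  # keeps only pockets with Druggability Score > 0.2
    r.delete_dir()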
Example #2
    def trim(strains,
             source_dir,
             dst_dir,
             clip="",
             headcrop=13,
             quality=20,
             windowsize=4,
             minlen=36):
        """

        :param strains:
        :param source_dir:
        :param dst_dir:
        :param clip: ILLUMINACLIP:../data/external/NexteraPE-PE.fa:2:30:10
        :param headcrop:
        :param quality:
        :param windowsize:
        :param minlen:
        :return:
        """
        mkdir(dst_dir)
        with tqdm(strains) as pbar:
            for strain in pbar:
                filenames = [
                    os.path.basename(x)
                    for x in glob(source_dir + "/" + strain + "*.gz")
                ]
                FastQ.trim_pairs(filenames[0], filenames[1], source_dir,
                                 dst_dir, clip, headcrop, quality, windowsize,
                                 minlen)
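
A hedged usage sketch (assuming trim is, like trim_pairs, a static method on the FastQ class; directory names are hypothetical):

    FastQ.trim(["strainA", "strainB"], "./raw", "./trimmed",
               clip="ILLUMINACLIP:../data/external/NexteraPE-PE.fa:2:30:10")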
Example #3
def update_proteins(annotation_dir,
                    proteome,
                    seq_col_name,
                    tax_id,
                    identity=0.9,
                    cpus=multiprocessing.cpu_count(),
                    db_init=None):
    print(seq_col_name)
    if db_init:
        from SNDG.Sequence.ProteinAnnotator import PABase
        PABase.sqldb.initialize(db_init)
    mkdir(annotation_dir)
    out = annotation_dir + "/species_blast.tbl"

    tax = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    species_tax = None
    # walk up the lineage to the genus rank (stored in species_tax despite the name)
    for tax in Tax.parents(tax):
        if tax.node_rank == "genus":
            species_tax = tax
            break
    tax_data = "/data/xomeq/tax/"
    species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    if not os.path.exists(out):

        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(str(species_tax.ncbi_taxon_id),
                                               tax_data)

        cmd = "blastp -query %s  -db %s -evalue 0.00001 -outfmt 6  -max_hsps 1 -qcov_hsp_perc 0.9 -num_threads %i -out %s"
        execute(cmd % (proteome, species_fasta, cpus, out))
    species_desc = {
        x.id.split("|")[1]: " ".join(x.description.split()[1:])
        for x in bpio.parse(species_fasta, "fasta")
    }

    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            if query[0][0].ident_pct > identity:

                unip = query[0].id.split(
                    "|")[1] if "|" in query[0].id else query[0].id
                dbxrefs = [
                    x.db + "||" + x.value
                    for x in Mapping.select().where(Mapping.uniprot == unip)
                ]
                p = Protein.objects(gene=query.id,
                                    organism=seq_col_name).no_cache().get()

                if not p.description and unip in species_desc:
                    p.description = species_desc[unip].split(
                        "OS=")[0] + " | homology with: " + unip
                    p.save()

                if dbxrefs:
                    p = SearchLoader.update_protein_with_dbxref(
                        query.id, dbxrefs, seq_col_name)
                    p.save()
Example #4
    def alignment(wd,
                  ref,
                  trimmed_1="trimmed_1.fastq",
                  trimmed_2="trimmed_2.fastq",
                  cpus=multiprocessing.cpu_count(),
                  strain="sample1",
                  species=None,
                  force=False,
                  read_group="group1"):
        if not species:
            species = strain

        mkdir(wd)

        wd = os.path.abspath(wd) + "/"
        ref = os.path.abspath(ref)

        assert os.path.exists(wd), f'{wd} could not be created'
        assert os.path.exists(ref), f'{ref} does not exist'
        assert os.path.exists(trimmed_1), f'{trimmed_1} does not exist'
        assert os.path.exists(trimmed_2), f'{trimmed_2} does not exist'

        # Generate a SAM file containing aligned reads
        if force or not os.path.exists(f"{wd}mapped_reads_raw.bam"):
            tab = "\\t"
            e(f"bwa mem -t {cpus} -M -R \'@RG{tab}ID:{read_group}{tab}SM:{strain}{tab}PL:illumina{tab}LB:{species}\' {ref} {trimmed_1} {trimmed_2} > {wd}aligned_reads.sam"
              )
        assert os.path.getsize(f"{wd}aligned_reads.sam") > 10, \
            f"{wd}aligned_reads.sam can't be empty"
        # Filter mapped reads and convert to BAM
        if force or (not os.path.exists(f"{wd}dedup.bam")
                     and not os.path.exists(f"{wd}mapped_reads_raw.bam")):
            # keep mapped reads (-F 4) and sort them
            e(f"samtools view -@ {cpus} -F 4 -S -b -h {wd}aligned_reads.sam | samtools sort - > {wd}mapped_reads_raw.bam")
            # keep unmapped reads (-f 4) and export them back to FASTQ
            e(f"samtools view -@ {cpus} -f 4 -S -b -h {wd}aligned_reads.sam > {wd}unmapped_reads.bam")
            e(f"bedtools bamtofastq -i {wd}unmapped_reads.bam -fq {wd}unmapped_1.fastq -fq2 {wd}unmapped_2.fastq")
        if os.path.exists(f"{wd}unmapped_reads.bam"):
            os.remove(f"{wd}unmapped_reads.bam")

        if os.path.exists(f"{wd}aligned_reads.sam"):
            os.remove(f"{wd}aligned_reads.sam")

        # Sort and mark duplicates
        e(f"gatk MarkDuplicates -INPUT {wd}mapped_reads_raw.bam -OUTPUT {wd}dedup.bam -METRICS_FILE {wd}metrics.txt"
          )
        assert os.path.getsize(
            f"{wd}dedup.bam") > 10, f"{wd}dedup.bam can't be empty"

        os.remove(f"{wd}mapped_reads_raw.bam")
        e(f'samtools sort {wd}dedup.bam > {wd}mapped_reads.bam')
        os.remove(f"{wd}dedup.bam")
        e(f'samtools index {wd}mapped_reads.bam')
        e(f"gatk CollectInsertSizeMetrics --I {wd}mapped_reads.bam --O {wd}insert_size_metrics.txt --H {wd}insert_size_histogram.pdf --M 0.5"
          )

        return f'{wd}mapped_reads.bam'
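
Example #19 below calls this method as Mapping.alignment; a minimal sketch (paths hypothetical):

    bam = Mapping.alignment("./wd", "ref.fasta", trimmed_1="trimmed_1.fastq",
                            trimmed_2="trimmed_2.fastq", strain="sample1")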
Example #5
    def assemble_pe(r1: str,
                    r2: str,
                    out: str,
                    name: str,
                    ss: str = None,
                    trusted_contigs: str = None,
                    untrusted_contigs: str = None,
                    cov_cutoff: int = 5,
                    tmp_dir: str = "/tmp/",
                    cpus=multiprocessing.cpu_count()):
        """

        :param out: output dir
        :param trusted_contigs: trusted contigs path
        :param untrusted_contigs: untrusted contigs path
        :param cov_cutoff:
        :return:
        """
        if tmp_dir == "/tmp/":
            tmp_dir = tmp_dir + name

        workdir1 = tmp_dir
        workdir2 = os.path.dirname(r1)
        assert workdir2 == os.path.dirname(
            r2), "r1 and r2 must be in the same directory"
        mkdir(out)

        mappings = f" -v {workdir1}:/out "
        in_dir = "/out/"
        if workdir1 != workdir2:
            mappings = mappings + f" -v {workdir2}:/in "
            in_dir = "/in/"

        template = """docker run  -u $(id -u):$(id -g) --rm -w /out {mappings} {image} spades.py \
                        {libs} {tcont} {utcont} -t {cpus} --isolate --cov-cutoff {cov_cutoff} -o /out """
        libs = ""

        i = 1
        r1_img = in_dir + r1.split(workdir2)[1]
        r2_img = in_dir + r2.split(workdir2)[1]
        libs += f' --pe{i}-1 "{r1_img}" --pe{i}-2 "{r2_img}" '
        if ss:
            ss_img = in_dir + ss.split(workdir2)[1]
            libs += f' --pe{i}-s "{ss_img}" '

        tcont = " --trusted-contigs " + trusted_contigs if trusted_contigs else ""
        utcont = " --untrusted-contigs " + untrusted_contigs if untrusted_contigs else ""
        cmd = template.format(libs=libs,
                              tcont=tcont,
                              utcont=utcont,
                              cov_cutoff=cov_cutoff,
                              out=out,
                              mappings=mappings,
                              image=Assembly.SPADES_DOCKER_IMAGE,
                              cpus=cpus)
        print(cmd)  # the assembled docker command is only printed here; execution is left to the caller
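
A hedged invocation sketch (assuming assemble_pe is a static method on the Assembly class that defines SPADES_DOCKER_IMAGE; paths hypothetical):

    Assembly.assemble_pe("/data/reads/s1_R1.fastq.gz", "/data/reads/s1_R2.fastq.gz",
                         out="/data/assemblies/s1", name="s1")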
Example #6
 def fastqc(source_dir, dst_dir):
     mkdir(dst_dir)
     for filename in tqdm(
             sorted(
                 glob(source_dir + "/*.fastq") +
                 glob(source_dir + "/*.fastq.gz") +
                 glob(source_dir + "/*.fq") +
                 glob(source_dir + "/*.fq.gz"))):
         execute("fastqc  {src} -q --extract -o {dst}",
                 src=filename,
                 dst=dst_dir)
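
A minimal usage sketch (directories hypothetical):

    fastqc("./raw_reads", "./qc_reports")  # one FastQC report per fastq/fq(.gz) file found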
Example #7
 def update_pdb(self, pdb):
     pdb = pdb.lower()
     mkdir(self.pdbs_dir + pdb[1:3])
     if os.path.exists(self.pdb_path_gzipped(pdb)):
         execute("gunzip " + self.pdb_path_gzipped(pdb))
     elif not os.path.exists(self.pdb_path(pdb)):
         download_file(
             self.url_pdb_files + pdb[1:3] + "/pdb" + pdb +
             self.pdb_download_extention, self.pdbs_dir + pdb[1:3] +
             "/pdb" + pdb + self.pdb_download_extention)
         execute("gunzip " + self.pdb_path_gzipped(pdb))
Example #8
 def prepare_dir(directory,
                 df,
                 mfilter=lambda df: df[
                     (df.zqmean >= -2) & (df.zqmean <= 2)],
                 csv_name="models.csv"):
     df = mfilter(df)
     mkdir(directory)
     df.to_csv(directory + "/" + csv_name,
               index=False,
               columns=Modelome.columns)
     for _, r in df.iterrows():
         shutil.copy(r.path, directory + "/" + r.model + ".pdb")
Example #9
def complete_pockets(pdb, strdoc, structure, pdbUtils):
    pdb_file = pdbUtils.pdb_path(pdb)
    pockets_json = pdbUtils.pdb_pockets_path(pdb)
    mkdir(os.path.dirname(pockets_json))

    if not os.path.exists(pockets_json) or os.path.getsize(pockets_json) < 10:
        r = FPocket(pdb_file).hunt_pockets()
        r.save(pockets_json)
        r.delete_dir()

    if os.path.exists(pockets_json):
        strdoc.pockets = StructureAnotator.pocket_residue_set(
            pockets_json, structure.get_atoms())
Example #10
 def update_pdb(self, pdb):
     pdb = pdb.lower()
     mkdir(self.pdbs_dir + pdb[1:3])
     if not os.path.exists(self.pdb_path(pdb)) or (os.path.getsize(self.pdb_path(pdb)) < 100):
         if os.path.exists(self.pdb_path_gzipped(pdb)) and (os.path.getsize(self.pdb_path_gzipped(pdb)) > 100):
             execute("gunzip " + self.pdb_path_gzipped(pdb))
             if os.path.exists(self.pdb_path_gzipped(pdb)) and not os.path.exists(self.pdb_path(pdb)):
                 # gunzip failed: remove the corrupt gzip so the entry can be re-downloaded later
                 os.remove(self.pdb_path_gzipped(pdb))
         elif not os.path.exists(self.pdb_path(pdb)):
             download_file(self.url_pdb_files + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention,
                           self.pdbs_dir + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention, ovewrite=True)
             execute("gunzip " + self.pdb_path_gzipped(pdb))
     return self.pdb_path(pdb)
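
Examples #11 and #18 show how the PDBs helper is constructed; a minimal sketch (entry code hypothetical):

    pdbs = PDBs("/data/pdb/")
    pdb_file = pdbs.update_pdb("4oke")  # downloads and gunzips the entry if missing, returns its path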
Example #11
    def load_pdb_pocket(self, pdb, pdb_dir="/data/databases/pdb/"):
        utils = PDBs(pdb_dir)
        if not os.path.exists(utils.pdb_pockets_path(pdb)):
            utils.update_pdb(pdb)
            fpocket = FPocket(utils.pdb_path(pdb))
            result = fpocket.hunt_pockets()
            mkdir(os.path.dirname(utils.pdb_pockets_path(pdb)))
            result.save(utils.pdb_pockets_path(pdb))
        with open(utils.pdb_pockets_path(pdb)) as h:
            result = json.load(h)

        self.pdb_data[pdb]["pockets"] = result
        return self.pdb_data[pdb]["pockets"]
Example #12
    def handle(self, *args, **options):
        input_file = options['input']
        accession = options['accession']

        self.stderr.write(f"trying to import  {options['accession']} imported!")

        if not os.path.exists(input_file):
            raise CommandError(f'{input_file} does not exist')

        extra_attrs = {}
        taxon = self.detect_tax(input_file, extra_attrs)
        taxon = options['taxon'] if options['taxon'] else taxon
        description = options["description"] if options["description"] else " ".join(
            [f'{k}:{v}' for k, v in extra_attrs.items()])

        io = BioIO(accession, taxon, stderr=self.stderr)

        if options["force"]:
            if io.exists():
                res = io.delete()
                self.stderr.write(str(res))
            elif io.exists():
                raise CommandError(f'{accession} already exists, use --force to overwrite ')

        grep_cmd = 'grep -c "FEATURES *Location/Qualifiers" "%s"' % input_file
        if input_file.endswith(".gz"):
            grep_cmd = 'z' + grep_cmd
        total = int(sp.check_output(grep_cmd, shell=True))
        io.create_db(description)

        seqstore = SeqStore.instance()

        if options['seqs']:
            if not os.path.exists(options['seqs']):
                raise CommandError(f'{options["seqs"]} does not exist')

            it = smart_parse(input_file, smart_parse(options["seqs"]))
        else:
            it = smart_parse(input_file)

        mkdir(seqstore.db_path(accession))
        s1 = seqstore.stream(seqstore.genome_db_path(accession), force=True,
                             stderr=sys.stderr, stdout=sys.stderr)
        s2 = seqstore.stream(seqstore.proteome_db_path(accession), force=True,
                             stderr=sys.stderr, stdout=sys.stderr)
        with s1 as genome_stream, s2 as proteome_stream:
            io.process_record_list(it, total, genome_stream, proteome_stream)

        self.stderr.write(f"genome {options['accession']} imported!")
Example #13
    def create_human_microbiome(dst="/data/databases/human/", update=False):

        dst_accs = dst + "gut_microbiota_assemblies/"
        mkdir(dst_accs)
        final_file = dst + Offtarget.DEFAULT_GUT_FILENAME

        utils = GenebankUtils()
        with gzip.open(final_file, "wt") as h:
            for accession in tqdm(gut_microbiote_assemblies, file=sys.stderr):
                genome_path = dst_accs + accession + ".genomic.gbff.gz"
                if update or not os.path.exists(genome_path):
                    genome_path = NCBI.download_assembly(accession, dst_accs)
                utils.proteins(genome_path, h)

        return final_file
Example #14
    def genotype_call(reference, vcf, output_file="./combined.vcf", ploidy=2):
        wd = os.path.dirname(os.path.abspath(output_file)) + "/"
        reference = os.path.abspath(reference)
        vcf = os.path.abspath(vcf)

        mkdir(wd)

        assert os.path.exists(wd), f'{wd} could not be created'
        assert os.path.exists(reference), f'{reference} does not exist'
        assert os.path.exists(vcf), f'{vcf} does not exist'

        e(f"""gatk GenotypeGVCFs \
        -R "{reference}" -ploidy {ploidy} \
        -V "{vcf}" \
        -O "{output_file}" 
        """)
        return output_file
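
A hedged sketch of a call (assuming genotype_call sits on the same Mapping class as Examples #4 and #17; paths hypothetical):

    Mapping.genotype_call("ref.fasta", "combined.g.vcf.gz", output_file="./combined.vcf", ploidy=1)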
Example #15
 def offtarget(organism,
               offtarget_databases,
               offtarget_names,
               tmp_dir=None):
     if not tmp_dir:
         tmp_dir = "/data/organismos/" + organism + "/annotation/"
     mkdir(tmp_dir)
     proteins = tmp_dir + "proteins.fasta"
     if not os.path.exists(proteins):
         BioMongoDB.protein_fasta(proteins, organism)
     results = Offtarget.offtargets(proteins, tmp_dir, offtarget_databases)
     for i, name in enumerate(offtarget_names):
         load_blast_features(organism,
                             results[i],
                             name,
                             min_identity=0.4,
                             min_query_coverage=0.4,
                             min_hit_coverage=0.4)
Example #16
def process_domain(domains_dir, chain, dn_start, dn_end, pdb_model):
    mkdir(domains_dir)

    # cs, pdb_path and code are free variables here: they are set in the
    # calling script (see the ChainSplitter loop near the end of this section)
    cs.filter = SelectResidues(chain.id, {
        y: 1
        for y in [x.id[1] for x in chain.get_residues()][dn_start:dn_end]
    })
    domain_pdb_path = cs.make_pdb(pdb_path, code, chain.id, overwrite=True)
    res = FPocket(domain_pdb_path, domains_dir).hunt_pockets()
    for pocket in res.pockets:
        rs = ResidueSet(name="DomainPocket%i" % pocket.pocket_num,
                        pdb=pdb_model)
        rs.save()
        for k, v in pocket.properties.items():
            ResidueSetProperty(residue_set=rs, name=k, value=v).save()
    res.delete_dir()

    qm = QMean.assesment(domain_pdb_path)
    residues_qm = qm["residues"]
    del qm["residues"]
    for k, v in qm.items():
        ChainProperty(pdb=pdb_model, chain=chain.id, name=k, value=v).save()
Example #17
    def variant_call(wd, reference, alignment, ploidy=2):
        wd = os.path.abspath(wd) + "/"
        reference = os.path.abspath(reference)
        alignment = os.path.abspath(alignment)

        mkdir(wd)

        assert os.path.exists(wd), f'{wd} could not be created'
        assert os.path.exists(reference), f'{reference} does not exist'
        assert os.path.exists(alignment), f'{alignment} does not exist'

        e(f"""gatk HaplotypeCaller -ERC GVCF \
         -R "{reference}" -ploidy {ploidy} \
         -I "{alignment}" --output-mode EMIT_ALL_CONFIDENT_SITES \
         -O "{wd}raw.g.vcf.gz"
        """)

        e(f"""gatk GenotypeGVCFs \
        -R "{reference}" -ploidy {ploidy} \
        -V "{wd}raw.g.vcf.gz" \
        -O "{wd}output.vcf.gz" 
        """)
Example #18
from SNDG import mkdir, execute, execute_from, init_log
from SNDG.WebServices import download_file
from SNDG.Structure.PDBs import PDBs

init_log("/tmp/createdb.log")


def old_or_inexistent(filepath, period=30):
    return not os.path.exists(filepath) or ((
        (time.time() - os.path.getatime(filepath)) / 60 / 60 / 24) > period)


#os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
#os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"

mkdir("/data/pdb/")
download_file("ftp://ftp.wwpdb.org/pub/pdb/derived_data/index/entries.idx",
              "/data/pdb/entries.idx",
              ovewrite=True)

pdbs = PDBs("/data/pdb/")
pdbs.download_pdb_seq_ses()
pdbs.update_pdb_dir()
mkdir("/data/pdb/processed/")
pdbs.pdbs_seq_for_modelling()
execute("makeblastdb -dbtype prot -in /data/pdb/processed/seqs_from_pdb.fasta")

if old_or_inexistent("/data/uniprot/uniref/uniref90/uniref90.fasta"):
    mkdir("/data/uniprot/uniref/uniref90")
    download_file(
        "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz",
Example #19
                          '--annotation',
                          action='store',
                          dest='annotation',
                          required=True)
    required.add_argument('-S',
                          '--strain',
                          action='store',
                          dest='strain',
                          default="sample")
    required.add_argument('-R1',
                          '--read1',
                          action='store',
                          dest='read1',
                          required=True)
    required.add_argument('-R2',
                          '--read2',
                          action='store',
                          dest='read2',
                          required=True)
    # parser.add_argument('--useSingletons', action = 'store_true', dest = 'singletons')

    args = parser.parse_args()

    mkdir(args.work_dir)
    Mapping.clean_reads(args.work_dir, args.read1, args.read2)
    alignment_path = Mapping.alignment(args.work_dir,
                                       args.reference,
                                       strain=args.strain)
    Mapping.variant_call(args.work_dir, args.reference, alignment_path,
                         args.strain)
Example #20
def from_TriTrypDB(name, gff, fasta, tax, tmp_dir=None):
    genome = {x.id: x for x in sp(fasta)}
    from BCBio import GFF
    import re
    annotation = list(GFF.parse(gff, base_dict=genome))
    contig = annotation[0]

    seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
    seqCol.save()

    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)
    gene_ids = {}
    with tqdm(annotation) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig,
                seqCol,
                type_map={
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    NCBI.f_mRNA: "gene",
                    "exon": "exon",
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA",
                    "snoRNA": "snoRNA",
                    "three_prime_UTR": "three_prime_UTR",
                    "five_prime_UTR": "five_prime_UTR"
                })
            gene_ids.update(gene_ids2)
            contigDoc.save()
    prots = []
    with tqdm(tritryp_protein_iter(annotation)) as pbar:
        for (protein, cds_f) in pbar:

            protDoc = Protein(seq=str(protein.seq), name=protein.id)

            if "description" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['description'][0]
            elif "Note" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['Note'][0]
            elif "product" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['product'][0]
            else:
                protein_description = ""

            protDoc.description = protein_description

            gos = []
            if "Ontology_term" in cds_f.qualifiers:
                gos = [
                    x.lower() for x in cds_f.qualifiers["Ontology_term"]
                    if "GO:" in x and (
                        x not in ["GO:0008150", "GO:0003674", "GO:0005575"])
                ]

            note = cds_f.qualifiers["Note"][0].split(
                " ")[0] if "Note" in cds_f.qualifiers else ""
            ecs = ["ec:" + note] if re.match(
                '^[0-9]+\.[0-9\-]+\.[0-9\-]+\.[0-9\-]$', note) else []
            ontologies = list(set(ecs + gos))

            protDoc.gene = [protein.id]
            protDoc.ontologies = ontologies
            protDoc.alias = [protein.id]

            if len(protDoc.seq) > 30000:
                raise Exception("No protein should be this long...")
            protDoc.gene_id = gene_ids[protein.id]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            prots.append(protDoc)
            if pbar.n and ((pbar.n % 1000) == 0):
                Protein.objects.insert(prots)
                prots = []
    if prots:
        Protein.objects.insert(prots)

    _common_annotations(name, tmp_dir)
Example #21
def from_ref_seq(
        name,
        ann_path,
        seqs=None,
        tax=None,
        tmp_dir=None,
        extract_annotation_feature=lambda feature: feature.sub_features[0]
    if feature.type == "gene" and hasattr(feature, "sub_features") and len(
        feature.sub_features) else feature,
        accept_protein_feature=lambda f: (
            (f.type == "CDS") and ("translation" in f.qualifiers)),
        extract_sequence=lambda c, f: f.qualifiers["translation"][0]
    if "translation" in f.qualifiers else f.extract(c).seq.translate(),
        cpus=1):
    if seqs:
        seqs = {r.id: r.seq for r in bpio.parse(seqs, "fasta")}

    iter_seqs = list(sp(ann_path, seqs=seqs) if seqs else sp(ann_path))
    for contig in iter_seqs:
        if tax:
            seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
        else:
            seqCol = BioDocFactory.create_genome(name, contig)
        seqCol.save()
        break
    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)
    gene_ids = {}
    with tqdm(iter_seqs) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig,
                seqCol,
                type_map={
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    NCBI.f_mRNA: NCBI.f_mRNA,
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "rRNA": "rRNA",
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA"
                },
                extract_annotation_feature=extract_annotation_feature,
            )
            gene_ids.update(gene_ids2)
            contigDoc.save()

    prots = []

    with tqdm(
            _protein_iter(
                iter_seqs,
                accept_feature=accept_protein_feature,
                extract_annotation_feature=extract_annotation_feature,
                extract_sequence=extract_sequence)) as pbar:
        for (protein, cds_f) in pbar:
            if "locus_tag" in cds_f.qualifiers:
                protDoc = BioDocFactory.create_protein(protein, cds_f)
                if len(protDoc.seq) > 30000:
                    raise Exception("No protein should be this long...")
                if protDoc.seq.count("*") > 1:
                    print(
                        f"{cds_f.qualifiers['locus_tag'][0]}: Too many stop codons!"
                    )
                    continue
                if protDoc.seq.count("+") > 1:
                    print(
                        f"{cds_f.qualifiers['locus_tag'][0]}: + signs found...!"
                    )
                    continue
                protDoc.gene_id = gene_ids[cds_f.qualifiers["locus_tag"][0]]
                protDoc.organism = name
                protDoc.auth = str(BioMongoDB.demo_id)
                protDoc.seq_collection_id = seqCol
                for f in protein.features:
                    protDoc.features.append(
                        Feature(identifier=f.qualifiers["Ontology_term"][0],
                                type=f.type,
                                location=Location(start=int(f.location.start),
                                                  end=int(f.location.end))))

                prots.append(protDoc)
                if pbar.n and ((pbar.n % 1000) == 0):
                    Protein.objects.insert(prots)
                    prots = []
    if prots:
        Protein.objects.insert(prots)

    # _common_annotations(name, tmp_dir, cpu=cpus)
    return seqCol
Example #22
    search.add_argument('--alns_dir',
                        default=None,
                        help='save blast aligments in this folder')
    search.add_argument("-d",
                        '--database',
                        required=True,
                        help='db to be searched with the pssm/s')
    search.add_argument('--cpu', default=4, type=int, help='cpus to use')
    search.add_argument('--format',
                        choices=["table", "fasta"],
                        default="table")

    args = parser.parse_args()

    if args.command == "pssm":
        mkdir(args.output)
        assert os.path.exists(
            args.output), f"{args.output} could not be created"
        for record in bpio.parse(args.seqs, "fasta"):

            pssm_file = f'{args.output}/{record.id}.pssm'
            query_file = mktemp()
            bpio.write(record, query_file, "fasta")
            if (os.path.exists(pssm_file)
                    and (os.path.getsize(pssm_file) > 100)):
                print(pssm_file)
            else:
                PsiProfile.build_profile(query_file, args.database,
                                         args.iterations, pssm_file, args.cpu)
                if (not os.path.exists(pssm_file)) or (
                        os.path.getsize(pssm_file) < 100):
Example #23
        for x in assessment[0].all_scores:
            result[x.name + "_norm"] = x.norm
            result[x.name + "_zscore"] = x.z_score
        result["residues"] = {}
        for row in assessment[1].score_table.rows:
            r = {f: row[i] for i, f in enumerate(assessment[1].score_table.col_names[4:], 4)}
            result["residues"][row[0] + "_" + str(row[2]) + "_" + str(row[3])] = r
        return result


if __name__ == '__main__':
    from SNDG import init_log, arg_file_iter

    init_log()

    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-acc", "--accpro", default="/opt/sspro4/bin/predict_acc.sh")
    parser.add_argument("-psi", "--psipred", default="/opt/psipred/runpsipred")
    parser.add_argument("-i", "--inputpdb", default="-",)
    parser.add_argument("-o", "--outdir", default="./")
    parser.add_argument( "--cpus", default=multiprocessing.cpu_count())

    args = parser.parse_args()

    # "/data/databases/pdb/divided/ok/pdb4oke.ent"
    mkdir(args.outdir)
    assessment = QMean.assesment(args.inputpdb, output_dir=args.outdir,
                                 accpro_path=args.accpro, psipred_path=args.psipred, cpus=args.cpus)

    print(assessment)
Example #24
    tax_db.initialize(MySQLDatabase('bioseqdb', user='******', passwd="mito"))
    mysql_db.initialize(MySQLDatabase('sndg', user='******', passwd="mito"))
    assemblies = list(ExternalAssembly.select().where(
        ExternalAssembly.sample_source.is_null(False)))

    ProteinAnnotator.connect_to_db(database="unipmap",
                                   user="******",
                                   password="******")
    with tqdm(assemblies) as pbar:
        for x in pbar:
            if mdb.seq_col_exists(x.assembly_accession):
                continue
            pbar.set_description(x.assembly_accession)
            try:
                dst_dir = "/data/organismos/" + x.assembly_accession + "/annotation/"
                mkdir(dst_dir)
                gbpath = x.download_gbk(dst_dir)
                from_ref_seq(x.assembly_accession,
                             gbpath,
                             tax=x.ncbi_tax,
                             tmp_dir=dst_dir)

                tid = int(
                    mdb.db.sequence_collection.find_one(
                        {"name": x.assembly_accession})["tax"]["tid"])
                tmp_dir = "/data/organismos/" + x.assembly_accession + "/annotation/"
                proteome_dir = "/data/organismos/" + x.assembly_accession + "/contigs/"
                mkdir(tmp_dir)
                mkdir(proteome_dir)
                protein_fasta = create_proteome(proteome_dir,
                                                x.assembly_accession)
Example #25
            "uploader": "demo",
            "_class": "ar.com.bia.entity.SeqCollectionDoc",
            "type": "value",
            "options": ["No", "Yes"],
            "description": "Has a hit in Database of Essential Genes"
        })
}
from SNDG.Sequence import read_blast_table
from tqdm import tqdm

# cols = list(SeqCollection.objects(name__nin=["cruzi","pdb"]))
cols = list(SeqCollection.objects(name__nin=["cruzi", "pdb"]))
cpus = 4
db = mdb.db
for seqCol in tqdm(cols):
    mkdir("/data/organismos/" + seqCol.name + "/contigs")
    proteome = "/data/organismos/" + seqCol.name + "/contigs/genoma.fasta"
    if not os.path.exists(proteome):
        mdb.protein_fasta(proteome, seqCol.name)

    out = "/data/organismos/" + seqCol.name + "/annotation/offtarget/"
    mkdir(out)

    if not seqCol.has_druggability_param("human_offtarget"):

        seqCol.druggabilityParams.append(off_props["human_offtarget"])
        db = "/data/databases/human/gencode.v17.pc_translations.fa"

        execute(
            "blastp -evalue 1e-5 -max_hsps 1 -outfmt 6 -max_target_seqs 1 -db {db} -query {query} -out {out} -num_threads {cpus}",
            db=db,
Example #26
    required.add_argument('-t', '--tmp_dir', default="/tmp")
    required.add_argument("--cpus", default=1)

    args = parser.parse_args()

    if not os.path.exists(args.ref_dirs):
        raise FileNotFoundError(f"{args.ref_dirs} does not exist")
    if args.in_fasta_s and not os.path.exists(args.in_fasta_s):
        raise FileNotFoundError(f"{args.in_fasta_s} does not exist")
    if not os.path.exists(args.in_fasta_r2):
        raise FileNotFoundError(f"{args.in_fasta_r2} does not exist")
    if not os.path.exists(args.in_fasta_r1):
        raise FileNotFoundError(f"{args.in_fasta_r1} does not exist")

    refs = glob(f'{args.ref_dirs}/*.fasta') + glob(f'{args.ref_dirs}/*.fna')

    if not refs:
        raise FileNotFoundError(f"no references detected in {args.ref_dirs}")

    mkdir(args.output)
    if not os.path.exists(args.output):
        raise FileNotFoundError(f"could not create {args.output}")
    
    if not args.output.endswith("/"):
        args.output = args.output + "/"
    
    mrca = MultiRefCoreAln(base_refs=refs, cpus=args.cpus)
    mrca.core_reads(args.output, args.sample_name,
                    args.in_fasta_r1, args.in_fasta_r2, args.in_fasta_s, args.tmp_dir)

Example #27
mysqldb = ProteinAnnotator.connect_to_db(database="unipmap",
                                         user="******",
                                         password="******")

orgs = [
    ("Mpylori26695", "Helicobacter pylori 26695 (e-proteobacteria)",
     "/data/organismos/Mpylori26695/GCF_000008525.1_ASM852v1_genomic.gbff",
     85962),
    ("MpyloriIndia", "Helicobacter pylori India7 (e-proteobacteria)",
     "/data/organismos/MpyloriIndia/GCF_000185185.1_ASM18518v1_genomic.gbff",
     907238),
]

for name, org, ann_path, tax in orgs:
    organism = name
    mkdir("/data/organismos/" + name + "/annotation/offtarget")
    mkdir("/data/organismos/" + name + "/annotation/pwtools")
    mkdir("/data/organismos/" + name + "/annotation/pathways")
    mkdir("/data/organismos/" + name + "/estructura/raw")
    mkdir("/data/organismos/" + name + "/estructura/sndg/modelos")
    mkdir("/data/organismos/" + name + "/estructura/sndg/pockets")

    from_ref_seq(name, ann_path, tax=tax, cpus=3)
    mdb.protein_fasta("/data/organismos/" + name + "/annotation/proteins.faa",
                      name)
    update_proteins("/tmp/" + name + "/",
                    "/data/organismos/" + name + "/annotation/proteins.faa",
                    name,
                    1003200,
                    db_init=mysqldb)
    parser.add_argument("-u", "--dbuser", default="root")

    args = parser.parse_args()
    from peewee import MySQLDatabase

    mysql_db = MySQLDatabase(args.dbname,
                             user=args.dbuser,
                             password=args.dbpass)

    sqldb.initialize(mysql_db)

    pdb_utils = PDBs(pdb_dir=args.pdb_dir)
    props = {x.name: x for x in Property.select()}
    pdbs = list(pdb_utils)
    with tqdm(pdbs) as pbar:
        for (code, pdb_path) in pbar:

            pdb_model = PDB.select().where(PDB.code == code).first()

            p = PDBParser(PERMISSIVE=True, QUIET=True)
            try:
                for chain in p.get_structure(code, pdb_path).get_chains():
                    chains_dir = args.pdb_dir + "/chains/" + code[1:3] + "/"
                    mkdir(chains_dir)
                    cs = ChainSplitter(chains_dir)
                    process_chain(pdb_path, code, chain.id, pdb_model, props)

            except Exception as ex:
                traceback.print_stack()
                _log.error(code + ": " + str(ex))
Example #29
from SNDG.WebServices import download_file

init_log("/tmp/createdb.log")


def old_or_inexistent(filepath, period=30):
    return not os.path.exists(filepath) or (((time.time() - os.path.getatime(filepath)) / 60 / 60 / 24) > period)


os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"



if not os.path.exists("/data/cog/whog"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/whog",
                  "/data/cog/whog")

if not os.path.exists("/data/cog/myva"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/myva",
                  "/data/cog/myva")
    execute("formatdb -i /data/cog/myva -o T")

if not os.path.exists("/data/ec/PRIAM_MAR15/priam"):
    mkdir("/data/ec/")
    download_file("http://priam.prabi.fr/REL_MAR15/Distribution.zip",
                  "/data/ec/PRIAM_MAR15.zip")
    execute_from("unzip /data/ec/PRIAM_MAR15.zip; exit 0;", "/data/ec/",retcodes=[0,1])
Example #30
            )
        if args.databases in ["all", "human"]:
            path = f'{args.output}/human/'
            if args.force or not os.path.exists(
                    path + Offtarget.DEFAULT_HUMAN_FILENAME):
                path = Offtarget.download_human_prots(dst=path)
            else:
                sys.stderr.write(
                    f'{path} already exists, use --force to overwrite')

            filename = os.path.basename(path)
            execute(
                f"zcat {path}{Offtarget.DEFAULT_HUMAN_FILENAME} | makeblastdb -title human -out {path}{Offtarget.DEFAULT_HUMAN_FILENAME} -dbtype prot -in -"
            )
        if args.databases in ["all", "deg"]:
            mkdir(f'{args.output}/deg/')
            Offtarget.download_deg(f'{args.output}/deg/')
    elif args.command == "gut_microbiote_blast":
        blast_gut_path = f'{args.output}/gut_microbiome.blast.tbl'
        gut_result_path = f'{args.output}/gut_microbiome.tbl'
        # if not os.path.exists(args.database + ".phr"):
        #     raise FileNotFoundError(f"{args.database} index files could not be found. Run makeblastdb")
        if args.force or not os.path.exists(blast_gut_path):
            Offtarget.offtargets(args.input_faa,
                                 blast_gut_path,
                                 offtarget_db=args.database,
                                 cpus=args.cpus)
        else:
            sys.stderr.write(
                f'{blast_gut_path} already exists, use --force to overwrite')