Example #1
    def trim_pairs(file1,
                   file2,
                   src_dir,
                   dst_dir,
                   clip="",
                   headcrop=13,
                   quality=20,
                   windowsize=4,
                   minlen=36,
                   crop=None):
        assert "TRIMOMMATIC" in os.environ, "environment variable not defined: TRIMOMMATIC"

        cmdcrop = ("CROP:" + str(crop)) if crop else ""

        cmd = """java -jar $TRIMOMMATIC PE \
    {src}/{r1} {src}/{r2}\
    {dst}/{r1} {dst}/{r1u} \
    {dst}/{r2} {dst}/{r2u} \
     {clip} {crop} HEADCROP:{headcrop} \
     LEADING:{quality} TRAILING:{quality} SLIDINGWINDOW:{windowsize}:20 MINLEN:{minlen}"""
        cmd = cmd.format(clip=clip,
                         quality=quality,
                         windowsize=windowsize,
                         minlen=minlen,
                         headcrop=headcrop,
                         dst=dst_dir,
                         src=src_dir,
                         r1=file1,
                         r2=file2,
                         r1u=file1.replace(".fastq.gz", "_unpaired.fastq.gz"),
                         r2u=file2.replace(".fastq.gz", "_unpaired.fastq.gz"),
                         crop=cmdcrop)

        execute(cmd)
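
A minimal usage sketch, assuming trim_pairs is importable from this module and that the TRIMOMMATIC environment variable points to the Trimmomatic jar; the read files, directories and adapter file below are hypothetical:

    import os

    # assumed jar location; the real path depends on the installation
    os.environ.setdefault("TRIMOMMATIC", "/opt/trimmomatic/trimmomatic-0.39.jar")

    # trim one paired-end sample, clipping Illumina adapters and keeping reads >= 36 bp
    trim_pairs("sample_R1.fastq.gz", "sample_R2.fastq.gz",
               src_dir="raw_reads", dst_dir="trimmed_reads",
               clip="ILLUMINACLIP:adapters.fa:2:30:10")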
Example #2
    def psipred(fasta, path="./", cmd="/opt/psipred/runpsipred", cpus=multiprocessing.cpu_count()):
        """
        #runpsipred example.fasta --> example.horiz
        # PSIPRED HFORMAT (PSIPRED V3.5)

Conf: 928999937998289999999999961696258972566893341566778999832667
Pred: CEEEEEEECCCCCHHHHHHHHHHHHHHCCCEEEEEECCCCCCCCCCCCCCEEEEEECCCC
  AA: PKALIVYGSTTGNTEYTAETIARQLANAGYEVDSRDAASVEAGGLFEGFDLVLLGCSTWG
              10        20        30        40        50        60

        :param fasta: path to the input fasta file
        :param path: working directory where runpsipred is executed
        :param cmd: path to the runpsipred script
        :param cpus: number of CPUs passed to runpsipred
        :return: (pred, conf) tuple with the concatenated Pred: and Conf: strings
        """
        execute(cmd + " " + fasta + " " + str(cpus), wd=path)
        horiz = fasta.replace(".fasta", ".horiz")
        pred = ""
        conf = ""
        with open(horiz) as h:
            for x in h.readlines():
                if x.startswith("Pred:"):
                    pred += x.split(" ")[1].strip()
                if x.startswith("Conf:"):
                    conf += x.split(" ")[1].strip()

        return (pred, conf)
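
A usage sketch, assuming psipred is available from this module and /opt/psipred/runpsipred is installed; the fasta path is hypothetical. The function returns the concatenated secondary-structure string and its per-residue confidence digits:

    pred, conf = psipred("example.fasta", path="./")
    print(pred[:60])  # e.g. CEEEEEEECCCCC... (C=coil, H=helix, E=strand)
    print(conf[:60])  # confidence digits 0-9, one per residue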
Example #3
    def haplotype_call(bam_path,
                       output_gvcf,
                       ref_path,
                       ploidy=2,
                       only_cmd=False):
        bam_folder = os.path.dirname(bam_path)
        bam_file = os.path.basename(bam_path)

        ref_folder = os.path.dirname(ref_path)
        ref_file = os.path.basename(ref_path)
        out_folder = os.path.dirname(output_gvcf)
        out_file = os.path.basename(output_gvcf)

        docker_bam_folder = "/out/bam/"
        if bam_folder == out_folder:
            mount2 = ""
            docker_out_path = docker_bam_folder
        else:
            docker_out_path = "/out/out/"
            mount2 = f" -v {out_folder}:{docker_out_path} "

        cmd = f"""docker run --rm -w /out {mount2} -v {bam_folder}:/out/bam/ -v {ref_folder}:/out/ref/ broadinstitute/gatk:4.1.0.0 \
        java -jar /gatk/gatk-package-4.1.0.0-local.jar  HaplotypeCaller -ERC GVCF \
        -R /out/ref/{ref_file} -ploidy {ploidy} \
        -I /out/bam/{bam_file} --output-mode EMIT_ALL_SITES \
        -O {docker_out_path}/{out_file}"""
        if only_cmd:
            return cmd
        else:
            execute(cmd)
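
A hedged example of calling haplotype_call with hypothetical paths; it assumes docker can run broadinstitute/gatk:4.1.0.0 and that the reference fasta is indexed (.fai and .dict), as GATK requires. With only_cmd=True the command line is returned instead of executed:

    cmd = haplotype_call(bam_path="/data/run1/sample1.bam",
                         output_gvcf="/data/run1/sample1.g.vcf",
                         ref_path="/data/ref/genome.fasta",
                         ploidy=1,
                         only_cmd=True)
    print(cmd)  # inspect the docker/GATK invocation before running it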
Example #4
    def profile_search(database,
                       pssm_file,
                       search_result,
                       cpu=1,
                       evalue=0.00001):
        cmd = f"psiblast -db {database} -in_pssm {pssm_file} -num_threads {cpu} -evalue {evalue}  -outfmt 5 -out {search_result} 1>&2"
        execute(cmd)
        try:
            search_result = list(bpsio.parse(search_result, "blast-xml"))
        except ParseError:
            sys.stderr.write(
                f'PSIProfile: error parsing results from {search_result}')
            return None

        for query in search_result:
            for hit in list(query):
                for hsp in hit:
                    identity = 1.0 * hsp.ident_num / hsp.aln_span
                    data = [
                        hsp.query.id, hsp.query_start, hsp.query_end,
                        hsp.hit.id, hsp.hit_start, hsp.hit_end, hsp.evalue,
                        identity,
                        str(hsp.aln[0].seq),
                        str(hsp.aln[1].seq)
                    ]
                    yield {
                        f: data[i]
                        for i, f in enumerate(PsiProfile.search_result_fields)
                    }
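
Because profile_search is a generator, the psiblast command only runs once iteration starts. A usage sketch with hypothetical paths, assuming PsiProfile.search_result_fields names the keys of each yielded dict:

    hits = list(profile_search(database="/data/uniref/uniref90.fasta",
                               pssm_file="query.pssm",
                               search_result="query_psiblast.xml",
                               cpu=4))
    for hit in hits:
        # one dict per HSP, keyed by PsiProfile.search_result_fields
        print(hit)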
Example #5
def update_proteins(annotation_dir,
                    proteome,
                    seq_col_name,
                    tax_id,
                    identity=0.9,
                    cpus=multiprocessing.cpu_count(),
                    db_init=None):

    if db_init:
        from SNDG.Sequence.ProteinAnnotator import PABase
        PABase.sqldb.initialize(db_init)
    mkdir(annotation_dir)
    out = annotation_dir + "/species_blast.tbl"

    tax = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    species_tax = None
    for tax in Tax.parents(tax):
        if tax.node_rank == "genus":
            species_tax = tax
            break
    tax_data = "/data/xomeq/tax/"
    species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    if not os.path.exists(out):

        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(str(species_tax.ncbi_taxon_id),
                                               tax_data)

        cmd = "blastp -query %s  -db %s -evalue 0.00001 -outfmt 6  -max_hsps 1 -qcov_hsp_perc 0.9 -num_threads %i -out %s"
        execute(cmd % (proteome, species_fasta, cpus, out))
    species_desc = {
        x.id.split("|")[1]: " ".join(x.description.split()[1:])
        for x in bpio.parse(species_fasta, "fasta")
    }

    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            if query[0][0].ident_pct > identity:

                unip = query[0].id.split(
                    "|")[1] if "|" in query[0].id else query[0].id
                dbxrefs = [
                    x.db + "||" + x.value
                    for x in Mapping.select().where(Mapping.uniprot == unip)
                ]
                p = Protein.objects(gene=query.id,
                                    organism=seq_col_name).no_cache().get()

                if not p.description and unip in species_desc:
                    p.description = species_desc[unip].split(
                        "OS=")[0] + " | homology with: " + unip
                    p.save()

                if dbxrefs:
                    p = SearchLoader.update_protein_with_dbxref(
                        query.id, dbxrefs, seq_col_name)
                    p.save()
Example #6
    def run_dssp(self):
        out = tempfile.mkstemp(suffix=".dssp")[1]
        execute("dssp -i {pdb_path} -o {out}", pdb_path=self.pdb_path, out=out)

        with open(out) as h:
            start = False
            for l in h:
                if start:
                    res = int(l[5:10])
                    aa = l[10:14].strip()
                    ss = l[14:17].strip()
                    bbl1 = l[23:24]
                    bbl2 = l[24:25]
                    bp1 = int(l[25:29])
                    bp2 = int(l[29:33])
                    bslabel = l[33:34]
                    self.dssp.append(
                        Struct(res=res,
                               aa=aa,
                               ss=ss,
                               bp1=bp1,
                               bp2=bp2,
                               bbl1=bbl1,
                               bbl2=bbl2,
                               bslabel=bslabel))
                else:
                    if l.startswith("  #  RESIDUE AA"):
                        start = True
Example #7
 def offtargets(proteome,
                dst_resutls,
                offtarget_db,
                cpus=multiprocessing.cpu_count()):
     cmd = f"blastp -evalue 1e-5 -max_hsps 1 -outfmt 6  -db {offtarget_db} -query {proteome} -out {dst_resutls} -num_threads {cpus}|awk '$3>50'"
     execute(cmd)
     return dst_resutls
Example #8
 def profile_search(seq_id, database, pssm_file, search_result, cpu):
     execute(
         "psiblast -db {database} -in_pssm {input} -num_threads {cpu}  -evalue 0.001  -outfmt 5 -out {output} > {cmd_out} ",
         output=search_result,
         database=database,
         input=pssm_file,
         cpu=cpu,
         cmd_out=search_result + ".out")
Example #9
 def build_profile(seq_fasta,
                   database,
                   iterations,
                   pssm_file,
                   cpu,
                   evalue=0.0001):
     cmd = f"psiblast -query {seq_fasta} -db {database} -num_threads {cpu} -out_pssm {pssm_file} -evalue {evalue} -num_iterations {iterations} 1>&2 2>/dev/null"
     execute(cmd)
Example #10
 def offtargets(proteome,
                dst_resutls,
                offtarget_db,
                cpus=multiprocessing.cpu_count(),
                min_identity=50):
     cmd = f"diamond blastp --evalue 1e-5 --max-hsps 1 --outfmt 6 --max-target-seqs 10000  --db {offtarget_db} --query {proteome} --threads {cpus}|awk '$3>{min_identity}' > {dst_resutls}"
     execute(cmd)
     return dst_resutls
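
A usage sketch with hypothetical paths; it assumes the off-target database was built beforehand with diamond makedb and that awk is on the PATH:

    # keep only off-target hits with more than 80% identity
    offtargets("proteome.faa", "offtargets.tbl",
               offtarget_db="/data/offtarget/human.dmnd",
               min_identity=80)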
Example #11
    def blast_para_anotar(self, data_dir, fasta_query, fasta_db):

        db_path = data_dir + fasta_db
        execute("makeblastdb -in %s -dbtype prot" % db_path)

        blast_result = db_path.replace(".fasta", "_blast.xml")

        cmd = "blastp -query %s  -db %s -evalue 0.00001 -outfmt 5  -max_hsps 1 -qcov_hsp_perc 0.8 -num_threads 3 -out %s"
        execute(cmd % (fasta_query, db_path, blast_result))
        return blast_result
Example #12
 def build_profile(seq_fasta, database, iterations, pssm_file, cpu):
     execute(
         "psiblast -query {input} -db {database} -num_threads {cpu} -out_pssm {output} -evalue 0.0001 -num_iterations {iterations} > {cmd_out}",
         output=pssm_file,
         iterations=iterations,
         database=database,
         input=seq_fasta,
         cpu=cpu,
         cmd_out=pssm_file + ".out")
Example #13
def download_file(complete_url, target, ovewrite=False, retries=3, timeout=20):
    if not target.strip():
        target = "./"
    if not os.path.exists(os.path.dirname(os.path.abspath(target))):
        raise FileNotFoundError("%s does not exist" % os.path.dirname(target))
    if os.path.exists(target) and not ovewrite:
        raise OvewriteFileException("%s already exists" % target)

    execute(
        f'wget  --timeout={timeout} --tries={retries} -O "{target}" "{complete_url}"'
    )
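
A short example, assuming download_file and OvewriteFileException come from this module and that wget is installed; the URL and target path are only illustrative:

    try:
        download_file("https://example.org/some/file.txt", "/tmp/file.txt",
                      retries=3, timeout=20)
    except OvewriteFileException:
        pass  # /tmp/file.txt already exists and ovewrite was not requested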
Example #14
 def fastqc(source_dir, dst_dir):
     mkdir(dst_dir)
     for filename in tqdm(
             sorted(
                 glob(source_dir + "/*.fastq") +
                 glob(source_dir + "/*.fastq.gz") +
                 glob(source_dir + "/*.fq") +
                 glob(source_dir + "/*.fq.gz"))):
         execute("fastqc  {src} -q --extract -o {dst}",
                 src=filename,
                 dst=dst_dir)
Example #15
 def update_pdb(self, pdb):
     pdb = pdb.lower()
     mkdir(self.pdbs_dir + pdb[1:3])
     if os.path.exists(self.pdb_path_gzipped(pdb)):
         execute("gunzip " + self.pdb_path_gzipped(pdb))
     elif not os.path.exists(self.pdb_path(pdb)):
         download_file(
             self.url_pdb_files + pdb[1:3] + "/pdb" + pdb +
             self.pdb_download_extention, self.pdbs_dir + pdb[1:3] +
             "/pdb" + pdb + self.pdb_download_extention)
         execute("gunzip " + self.pdb_path_gzipped(pdb))
Example #16
    def download_deg(dst="/data/databases/deg/"):
        for x in ["p", "e", "a"]:
            filename = "deg-" + x + "-15.2"

            download_file("http://tubic.tju.edu.cn/deg/download/" + filename +
                          ".zip",
                          dst + filename + ".zip",
                          ovewrite=True)
            execute("unzip -o  " + dst + filename + ".zip" + " -d " + dst)
            os.remove(dst + filename + ".zip")
            execute("makeblastdb -dbtype prot -in " + dst + "degaa-" + x +
                    ".dat")
Example #17
    def combineGVCFs(vcfs_folder, output_gvcf, ref_path):
        """

        :param vcfs_folder: folder containing the .vcf/.vcf.gz files to combine
        :param output_gvcf: path of the combined gvcf to be created
        :param ref_path: fasta of the reference genome
        :return:
        """
        cmd_template = """
        docker run --rm -w /out {mount2} -v {vcfs_folder}:/out/vcfs/ -v {ref_folder}:/out/ref/ broadinstitute/gatk:4.1.0.0 \
            java -jar /gatk/gatk-package-4.1.0.0-local.jar  CombineGVCFs    -R /out/ref/{ref_file} {vcfs} \
            -O {out_path}/{out_file} 
        """

        ref_folder = os.path.dirname(ref_path)
        ref_file = os.path.basename(ref_path)
        out_folder = os.path.dirname(output_gvcf)
        out_file = os.path.basename(output_gvcf) + ".bk"

        vcfs_path = "/out/vcfs/"
        if vcfs_folder == out_folder:
            mount2 = ""
            out_path = "/out/vcfs/"
        else:
            out_path = "/out/out/"
            mount2 = " -v {out_folder}:/out/out/ ".format(
                out_folder=out_folder)

        vcfs = " ".join([
            "--variant {vcfs_path}".format(vcfs_path=vcfs_path) + x
            for x in os.listdir(vcfs_folder)
            if x.endswith(".vcf") or x.endswith(".vcf.gz")
        ])
        cmd = cmd_template.format(vcfs=vcfs,
                                  out_folder=out_folder,
                                  out_file=out_file,
                                  mount2=mount2,
                                  out_path=out_path,
                                  ref_folder=ref_folder,
                                  ref_file=ref_file,
                                  vcfs_folder=vcfs_folder)
        print(cmd)
        execute(cmd)
        with open(out_folder + "/" + out_file) as h, open(output_gvcf,
                                                          "w") as hw:
            for l in h:
                if l.startswith("#CHROM"):
                    vec = l.split("\t")
                    l = "\t".join(vec[:9] +
                                  [x.split(".variant")[0]
                                   for x in vec[9:]]) + "\n"

                hw.write(l)
Example #18
    def combineGVCFs(vcfs_folder,
                     output_gvcf,
                     ref_path,
                     tmp="/tmp/combineGVCFs.vcf"):
        """

        :param vcfs_folder: folder containing the vcf/gvcf files to combine
        :param output_gvcf: path (or open file handle) of the gvcf to be created
        :param ref_path: fasta of the reference genome
        :param tmp: temporary path for the raw CombineGVCFs output
        :return:
        """

        assert os.path.exists(ref_path), f'{ref_path} does not exist'
        assert os.path.exists(vcfs_folder), f'{vcfs_folder} does not exist'
        vcfs_folder = os.path.abspath(vcfs_folder)
        if not hasattr(output_gvcf, "write"):
            assert os.path.exists(os.path.dirname(
                output_gvcf)), f'{os.path.dirname(output_gvcf)} does not exist'

        vcf_files = []
        for x in glob(vcfs_folder + "/*vcf*"):
            if x.endswith(".vcf") or x.endswith(".vcf.gz") or x.endswith(
                    ".gvcf") or x.endswith(".gvcf.gz"):
                vcf_files.append(x)

        if not vcf_files:
            raise FileNotFoundError(
                f'no .vcf, .vcf.gz, .gvcf or .gvcf.gz files were found at {vcfs_folder}')

        vcfs = " ".join([f"--variant {x}" for x in vcf_files])

        cmd = f"""
        gatk CombineGVCFs -R {ref_path} {vcfs} -O {tmp}
        """

        execute(cmd)

        with open(tmp) as h:
            if hasattr(output_gvcf, "write"):
                hw = output_gvcf
                close_hw = False
            else:
                hw = open(output_gvcf, "w")
                close_hw = True
            try:
                for l in h:
                    if l.startswith("#CHROM"):
                        vec = l.split("\t")
                        # sample columns come out as "<name>.variant<N>"; strip the
                        # suffix and restore the newline lost by the split
                        l = "\t".join(
                            vec[:9] +
                            [x.split(".variant")[0] for x in vec[9:]]) + "\n"
                    hw.write(l)
            finally:
                # only close the handle if it was opened here
                if close_hw:
                    hw.close()
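
A sketch of calling this version of combineGVCFs, assuming gatk is on the PATH; the paths are hypothetical, and output_gvcf may also be an already-open file handle:

    combineGVCFs(vcfs_folder="/data/run1/gvcfs",
                 output_gvcf="/data/run1/combined.g.vcf",
                 ref_path="/data/ref/genome.fasta")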
Example #19
 def update_pdb(self, pdb):
     pdb = pdb.lower()
     mkdir(self.pdbs_dir + pdb[1:3])
     if not os.path.exists(self.pdb_path(pdb)) or (os.path.getsize(self.pdb_path(pdb)) < 100):
         if os.path.exists(self.pdb_path_gzipped(pdb)) and (os.path.getsize(self.pdb_path_gzipped(pdb)) > 100):
             execute("gunzip " + self.pdb_path_gzipped(pdb))
             if os.path.exists(self.pdb_path_gzipped(pdb)) and not os.path.exists(self.pdb_path(pdb)):
                 os.remove(self.pdb_path_gzipped(pdb))
         elif not os.path.exists(self.pdb_path(pdb)):
             download_file(self.url_pdb_files + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention,
                           self.pdbs_dir + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention, ovewrite=True)
             execute("gunzip " + self.pdb_path_gzipped(pdb))
     return self.pdb_path(pdb)
Example #20
    def test_residues_mapping(self):
        stdout = tempfile.NamedTemporaryFile()
        execute(
            "python3 -m SNDG.Structure.StructureVariant residues -i ./test/prot2.fasta --pdb_data /tmp/data -s 2VYI",
            stdout=stdout)

        with open(stdout.name) as output:
            contents = output.read()
        stdout.close()
        expected = ' '.join(
            """pdb     chain   resid   alt     ref     pos     pdb_pos
2vyi    A       81      P       G       23      1
2vyi    A       160     K       A       160     77
2vyi    B       160     K       A       160     74""".split())
        self.assertEqual(expected, ' '.join(contents.strip().split()))
Example #21
    def load_msa(self, input_sequence, pdb_code, pdb_chain=None):
        pdb_code = pdb_code.lower()
        self.utils.update_pdb(pdb_code)
        self.ref_seq = bpio.read(input_sequence, "fasta")
        self.pdbfile = PDBFile(pdb_code, self.utils.pdb_path(pdb_code))
        with open(self.seqs_path, "w") as h:
            bpio.write(self.ref_seq, h, "fasta")
            bpio.write(self.pdbfile.seq(selected_chain=pdb_chain), h, "fasta")

        cmd = docker_wrap_command(
            f'mafft --quiet --localpair --maxiterate 1000 {self.seqs_path} > {self.aln_path} '
        )
        execute(cmd)

        self.msa = MSAMap.from_msa(self.aln_path)
        self.res_map = self.pdbfile.residues_map(pdb_chain)
Example #22
    def hunt_pockets(self):
        abs_path = os.path.abspath(self.pdb_file_path)
        pdb_file = os.path.basename(abs_path)
        pdb_dir = os.path.dirname(abs_path)

        cmd = "docker run -u $(id -u):$(id -g) -w /out -v '{pdb_dir}':/out --rm ezequieljsosa/fpocket {fpocket} -f '{pdb_file}'".format(
            fpocket=self.fpocket_binary, pdb_file=pdb_file, pdb_dir=pdb_dir)
        self._execute(cmd)
        if os.path.abspath(self._pdb_file_directory) != os.path.abspath(
                self.work_directory):
            if os.path.exists(self.dest_path()):
                shutil.rmtree(self.dest_path(), True)
            work_dir = self._pdb_file_directory + "/" + self._out_directory()
            if os.path.exists(work_dir):
                execute(f'mv "{work_dir}" "{self.dest_path()}"')
        result = FpocketOutput(self.dest_path())
        result.parse()
        return result
Example #23
 def phylo(vcf, output):
     cmd = f"""bcftools filter -i 'alt=\"*\"' {vcf}  | bcftools norm -m -any | \
      bcftools filter -e 'alt=\"*\"'  | bcftools filter -i 'FORMAT/AD[*:1]>15' | \
      sed  's|0/1:|1/1:|'  | sed  's|0\|1:|1/1:|'  > /tmp/spaning_del.vcf"""
     execute(cmd)
     cmd = f"bcftools filter -e 'alt=\"*\"' {vcf}  > /tmp/no_spanning.vcf"
     execute(cmd)
     cmd = f"bcftools view /tmp/spaning_del.vcf | grep -v '^#' >> /tmp/no_spanning.vcf"
     execute(cmd)
     cmd = f"bcftools sort /tmp/no_spanning.vcf  > {output}"
     execute(cmd)
Example #24
    def annotate(self,
                 fasta_path,
                 output,
                 training=None,
                 locustag="PROKKA",
                 gram=None,
                 genus="",
                 species="",
                 strain="",
                 kingdom="Bacteria",
                 gcode=0,
                 rfam=False,
                 increment=5,
                 prefix="ann",
                 cpus=1,
                 centre=""):

        if not os.path.exists(os.path.abspath(output + "/../")):
            raise FileNotFoundError(
                f'{os.path.abspath(output + "/../")} not found, cannot create output dir'
            )
        if not os.path.exists(fasta_path):
            raise FileNotFoundError(f'{os.path.abspath(fasta_path)} not found')

        if training and not os.path.exists(training):
            raise FileNotFoundError(f'{os.path.abspath(training)} not found')

        db = f"--proteins {training}" if training else ""
        rfam = "--rfam" if rfam else ""
        species = f"--species '{species}'" if species else ""
        strain = f"--strain '{strain}'" if strain else f"--strain '{locustag}'"
        kingdom = f"--kingdom '{kingdom}'" if kingdom else ""
        gram = f"--gram {gram}" if gram else ""
        centre = f"--centre '{centre}'" if centre else ""
        genus = f"--genus '{genus}'" if genus else ""

        cmd = f'''prokka --compliant --cpus {cpus} {gram} --addgenes {rfam} --locustag {locustag} --outdir {output} \
                  --prefix {prefix} --force {db} {fasta_path} {kingdom} {strain}  \
                  --increment {increment} --gcode {gcode}  {centre} {genus} {species}'''

        cmd2 = docker_wrap_command(cmd)
        execute(cmd2)
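
A hedged call of annotate, assuming annotator is an instance of the class this method belongs to (a hypothetical name) and that prokka runs through the docker wrapper; the assembly path and organism fields are illustrative:

    annotator.annotate(fasta_path="assembly.fasta",
                       output="annotation_out",
                       locustag="SAMPLE1",
                       genus="Mycobacterium",
                       species="tuberculosis",
                       strain="H37Rv",
                       cpus=4)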
Example #25
    def accpro(fasta, path, cmd="/opt/sspro4/bin/predict_acc.sh"):
        """
        #../bin/predict_ssa.sh 1aqta.fasta 1aqta.test
        cat /opt/sspro4/test/1aqta.acc6

        1aqta_fastaalg
STYHLDVVSAEQQMFSGLVEKIQVTGSEGELGIYPGHAPLLTAIKPGMIRIVKQHGHEEFIYLSGGILEVQPGNVTVLADTAIRGQDLDEARAMEAKRKAEEHISSSHGDVDYAQASAELAKAIAQLRVIELTKK
eebebbbbbbeeebbeeebeebbbebeebbbbbbbebbbbbbbbebbbbbbebeeeeebbbbbbbbbbbbeeeebbbbbbbbeeeeebeeeebeebbeebbeebeeeeeeeebeebeebbeebbebbebbeeeee


        :param fasta: path to the input fasta file
        :param path: working directory where the prediction script is executed
        :param cmd: path to the predict_acc.sh script
        :return: predicted accessibility string (third line of the .acc6 file)
        """
        acc6 = fasta.replace(".fasta", ".acc6")
        if not os.path.exists(acc6):
            execute(cmd + " " + fasta + " " + acc6, wd=path)

        with open(acc6) as h:
            return h.readlines()[2].strip()
Example #26
    def test_residues_ann(self):
        stdout = tempfile.NamedTemporaryFile()
        execute(
            "cat test/test.tbl | python -m SNDG.Structure.StructureVariant ann --pdb_data /tmp/data ",
            stdout=stdout)
        with open(stdout.name) as output:
            contents = output.read()
        stdout.close()
        anns = [json.loads(x) for x in contents.split("###") if x.strip()]
        self.assertEqual(4, len(anns))
        r2vyi_B_160_K = [
            x["ann"] for x in anns if x["residue"] == "2vyi_B_160_K"
        ][0]

        self.assertEqual(0.705, r2vyi_B_160_K["pockets"][0]["druggabilitty"])

        r1azm_A_91_P = [
            x["ann"] for x in anns if x["residue"] == "1azm_A_91_P"
        ][0]
        self.assertTrue("BINDING SITE FOR RESIDUE AZM A 262" in
                        [x["details"] for x in r1azm_A_91_P["binding"]])
Example #27
    def download_proteome_from_tax(tax_id, dst_dir, format="fasta"):

        durl = 'http://www.uniprot.org/uniprot/?sort=&desc=&compress=yes&query=taxonomy:{tax}&fil=&format={format}&force=yes'
        download_file(durl.format(tax=tax_id, format=format),
                      dst_dir + "/" + tax_id + "_all.fasta.gz",
                      ovewrite=True)
        execute("gunzip " + dst_dir + "/" + tax_id + "_all.fasta.gz")
        execute("cd-hit -M 0 -c 0.9 -T 0 -i %s -o %s" %
                (dst_dir + "/" + tax_id + "_all.fasta",
                 dst_dir + "/" + tax_id + ".fasta"))
        execute("makeblastdb -dbtype prot -in " + dst_dir + "/" + tax_id +
                ".fasta")
Example #28
def old_or_inexistent(filepath, period=30):
    # period is in days; the default value is assumed, since the top of this snippet is truncated
    return not os.path.exists(filepath) or (
        ((time.time() - os.path.getatime(filepath)) / 60 / 60 / 24) > period)


#os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
#os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"

mkdir("/data/pdb/")
download_file("ftp://ftp.wwpdb.org/pub/pdb/derived_data/index/entries.idx",
              "/data/pdb/entries.idx",
              ovewrite=True)

pdbs = PDBs("/data/pdb/")
pdbs.download_pdb_seq_ses()
pdbs.update_pdb_dir()
mkdir("/data/pdb/processed/")
pdbs.pdbs_seq_for_modelling()
execute("makeblastdb -dbtype prot -in /data/pdb/processed/seqs_from_pdb.fasta")

if old_or_inexistent("/data/uniprot/uniref/uniref90/uniref90.fasta"):
    mkdir("/data/uniprot/uniref/uniref90")
    download_file(
        "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz",
        "/data/uniprot/uniref/uniref90/uniref90.fasta.gz",
        ovewrite=True)
    execute("gunzip /data/uniprot/uniref/uniref90/uniref90.fasta.gz")

if old_or_inexistent("/data/uniprot/uniref/uniref90/uniref90.fasta.pal"):
    execute(
        "makeblastdb -dbtype prot -in /data/uniprot/uniref/uniref90/uniref90.fasta"
    )
Example #29
 def quast(glob_exp, out, ref=None):
     execute("quast " + glob_exp + " -o " + out +
             ((" -R  " + ref) if ref else ""))
Example #30
os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"



if not os.path.exists("/data/cog/whog"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/whog",
                  "/data/cog/whog")

if not os.path.exists("/data/cog/myva"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/myva",
                  "/data/cog/myva")
    execute("formatdb -i /data/cog/myva -o T")

if not os.path.exists("/data/ec/PRIAM_MAR15/priam"):
    mkdir("/data/ec/")
    download_file("http://priam.prabi.fr/REL_MAR15/Distribution.zip",
                  "/data/ec/PRIAM_MAR15.zip")
    execute_from("unzip /data/ec/PRIAM_MAR15.zip; exit 0;", "/data/ec/",retcodes=[0,1])

    execute_from("ls /data/ec/PRIAM_MAR15/PROFILES/*.chk > priam", "/data/ec/PRIAM_MAR15/")
    execute_from("formatrpsdb -i /data/ec/PRIAM_MAR15/priam -o T", "/data/ec/PRIAM_MAR15/")

if not os.path.exists("/data/pfamtigrfam/tirgfam.hmm"):
    mkdir("/data/pfamtigrfam/INFO")
    download_file("ftp://ftp.jcvi.org/pub/data/TIGRFAMs/TIGRFAMs_15.0_HMM.LIB.gz",
                  "/data/pfamtigrfam/TIGRFAMs_15.0_HMM.LIB.gz")
    execute("gunzip /data/pfamtigrfam/TIGRFAMs_15.0_HMM.LIB.gz",retcodes=[0,2])