Esempio n. 1
0
    def download(self, datadir):
        """Download the BOLD Systems barcode dump (Argentina, JSON) into <datadir>bold/.

        Side effects: creates the bold/ directory if needed and chdirs into it.
        """
        bold_url = ("http://www.boldsystems.org/index.php/API_Public/combined"
                    "?geo=Argentina&format=json")
        self.bolddir = datadir + "bold/"
        if not os.path.exists(self.bolddir):
            os.makedirs(self.bolddir)
        os.chdir(self.bolddir)
        download_file(bold_url, "barcodes.json")
Esempio n. 2
0
 def update_pdb(self, pdb):
     """Ensure the local uncompressed PDB file for ``pdb`` exists.

     Gunzips a cached archive when present, otherwise downloads the gzipped
     entry from the PDB FTP mirror and unpacks it.
     """
     pdb = pdb.lower()
     mkdir(self.pdbs_dir + pdb[1:3])
     gz_path = self.pdb_path_gzipped(pdb)
     if os.path.exists(gz_path):
         execute("gunzip " + gz_path)
     elif not os.path.exists(self.pdb_path(pdb)):
         remote = (self.url_pdb_files + pdb[1:3] + "/pdb" + pdb +
                   self.pdb_download_extention)
         local = (self.pdbs_dir + pdb[1:3] + "/pdb" + pdb +
                  self.pdb_download_extention)
         download_file(remote, local)
         execute("gunzip " + self.pdb_path_gzipped(pdb))
Esempio n. 3
0
    def download_deg(dst="/data/databases/deg/"):
        """Fetch the DEG 15.2 archives (prokaryotes, eukaryotes, archaea),
        unpack them into ``dst`` and build a BLAST protein db per .dat file."""
        for kind in ("p", "e", "a"):
            base = "deg-" + kind + "-15.2"
            zip_path = dst + base + ".zip"
            download_file("http://tubic.tju.edu.cn/deg/download/" + base + ".zip",
                          zip_path,
                          ovewrite=True)
            execute("unzip -o  " + zip_path + " -d " + dst)
            os.remove(zip_path)
            execute("makeblastdb -dbtype prot -in " + dst + "degaa-" + kind +
                    ".dat")
Esempio n. 4
0
    def download_proteome_from_tax(tax_id, dst_dir, format="fasta"):
        """Download the full proteome of taxon ``tax_id`` from UniProt,
        gunzip it, cluster at 90% identity with cd-hit, and build a BLAST db.

        NOTE(review): the local filename is always *.fasta.gz regardless of
        ``format`` — presumably only "fasta" is ever requested; verify callers.
        """
        query_url = ('http://www.uniprot.org/uniprot/?sort=&desc=&compress=yes'
                     '&query=taxonomy:{tax}&fil=&format={format}&force=yes')
        raw = dst_dir + "/" + tax_id + "_all.fasta"
        clustered = dst_dir + "/" + tax_id + ".fasta"
        download_file(query_url.format(tax=tax_id, format=format),
                      raw + ".gz",
                      ovewrite=True)
        execute("gunzip " + raw + ".gz")
        execute("cd-hit -M 0 -c 0.9 -T 0 -i %s -o %s" % (raw, clustered))
        execute("makeblastdb -dbtype prot -in " + clustered)
Esempio n. 5
0
 def update_pdb(self, pdb):
     """Make sure a usable local PDB file exists for ``pdb``; return its path.

     Files under 100 bytes are treated as truncated/failed downloads.
     """
     pdb = pdb.lower()
     mkdir(self.pdbs_dir + pdb[1:3])
     pdb_file = self.pdb_path(pdb)
     gz_file = self.pdb_path_gzipped(pdb)
     # Short-circuit keeps getsize from running on a missing file.
     needs_update = not os.path.exists(pdb_file) or os.path.getsize(pdb_file) < 100
     if needs_update:
         if os.path.exists(gz_file) and os.path.getsize(gz_file) > 100:
             execute("gunzip " + gz_file)
             # If gunzip failed silently, discard the leftover archive.
             if os.path.exists(gz_file) and not os.path.exists(pdb_file):
                 os.remove(gz_file)
         elif not os.path.exists(pdb_file):
             download_file(self.url_pdb_files + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention,
                           self.pdbs_dir + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention, ovewrite=True)
             execute("gunzip " + gz_file)
     return self.pdb_path(pdb)
Esempio n. 6
0
    def download_assembly(assembly_accession,
                          dst_dir,
                          dtype="genomic.gbff.gz",
                          force=False):
        """Download one file of an NCBI assembly into ``dst_dir``.

        The accession's version suffix is stripped to locate the FTP
        directory, then the newest available version is selected.

        :param assembly_accession: e.g. "GCF_000158435.1" (version optional)
        :param dst_dir: destination directory
        :param dtype: file suffix to fetch, e.g. "genomic.gbff.gz"
        :param force: re-download even if the target file already exists
        :raises FileNotFoundError: if the accession is not found at the site
        :return: path of the downloaded (or pre-existing) file
        """
        # BUGFIX: strip the whole ".N" version suffix; the previous
        # `[-2] != "."` test only handled single-digit versions.
        assembly_accession_no_ver = assembly_accession.split(".")[0]

        # e.g. https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/158/435/
        url = "/".join([
            ftp_url, assembly_accession_no_ver[0:3],
            assembly_accession_no_ver[4:7], assembly_accession_no_ver[7:10],
            assembly_accession_no_ver[10:13]
        ]) + "/"
        r = requests.get(url)
        download_url = ""
        acc = ""
        if r.status_code == 200:
            accessions = [
                x.split("</")[0].replace("/", "") for x in r.text.split(">")
                if x.startswith(assembly_accession_no_ver)
            ]
            # Entries look like "GCF_000158435.1_ASM15843v1".
            # BUGFIX: sort by the full numeric version, not just its last
            # digit (versions >= 10 sorted incorrectly before).
            accessions = sorted(
                accessions,
                key=lambda x: int(x.split("_")[1].split(".")[-1]))
            if accessions:
                acc = accessions[-1]
                download_url = f'{url}{acc}/{acc}_{dtype}'

        if not download_url:
            err = f"{assembly_accession} not found at {url}"
            _log.error(err)
            raise FileNotFoundError(err)

        assert acc

        out_file = f'{dst_dir}/{"_".join(acc.split("_")[:2])}.{dtype}'
        if force or not os.path.exists(out_file):
            download_file(download_url, out_file, ovewrite=force)
        else:
            _log.debug(f'{out_file} exists')
        return out_file
Esempio n. 7
0
 def cross_reference_dbs(self):
     """Populate DBx records from the UniProt cross-referenced databases RDF dump."""
     rdf_path = "data/tmp/database-all.rdf"
     download_file(Command.DEFAULT_CROSS_REF_DBS, rdf_path)
     with open(rdf_path) as h:
         data = xmltodict.parse(h.read(), "utf-8")
         for db in data["rdf:RDF"]["rdf:Description"]:
             # Fall back to the dcterms identifier when no abbreviation given.
             db_name = db['abbreviation'] if "abbreviation" in db else db['dcterms:identifier']
             DBx.objects.get_or_create(
                 url=db['@rdf:about'],
                 name=db_name,
                 category=db.get('category', ""),
                 description=db.get('rdfs:label', ""),
                 url_template=db.get('urlTemplate', db['dcterms:identifier']))
     # UniProt itself is not part of the dump: register it explicitly.
     DBx.objects.get_or_create(
         url="www.uniprot.org",
         name="UnipAcc",
         category='Protein annotation databases',
         description="UNIPROT",
         url_template="https://www.uniprot.org/uniprot/%s",
     )
Esempio n. 8
0
    def handle(self, *args, **options):
        """Command entry point: load base ontologies, then optionally build GO."""
        Ontology.load_ann_terms()
        Ontology.load_go_base()

        if options["go"]:
            # Fetch both OBO files only when missing locally.
            for url_key, path_key in (("go_url", "obo_path"),
                                      ("go_basic_url", "relationships_obo_path")):
                if not os.path.exists(options[path_key]):
                    download_file(options[url_key], options[path_key])

            self.ontology = Ontology.objects.get(name=Ontology.GO)
            self.is_a = Ontology.relmap["is_a"]

            self.create_terms(options["obo_path"], "go")
            self.create_relationships(options["relationships_obo_path"], "go")

        if options["tax"]:
            # Taxonomy import not implemented yet.
            pass

        self.stderr.write("Finished!")
Esempio n. 9
0
 def download_fasta(url_unip2reactions=DEFAULT_UNIP2REACTIONS,
                    outdir="/data/databases/reactome/",
                    ovewrite=False):
     """Build a gzipped fasta of Reactome reaction sequences in ``outdir``.

     Downloads the UniProt->Reactome mapping (unless cached), then fetches
     each UniProt record and writes it out with the Reactome id as record id.

     :param url_unip2reactions: URL of the UniProt2ReactomeReactions mapping
     :param outdir: existing output directory
     :param ovewrite: re-download the mapping file even when present
     """
     unip_utils = Uniprot()
     assert os.path.exists(outdir), f'{outdir} does not exists'
     reactome_map_file = outdir + "/UniProt2ReactomeReactions.txt"
     if ovewrite or not os.path.exists(reactome_map_file):
         download_file(url_unip2reactions,
                       reactome_map_file,
                       ovewrite=ovewrite)
     else:
         # BUGFIX: newline so the message does not glue to later output.
         sys.stderr.write(f'{reactome_map_file} already exists\n')
     # BUGFIX: write the fasta into outdir instead of the current working dir.
     out_fasta = outdir + "/seqs.fasta.gz"
     with open(reactome_map_file) as hr, gzip.open(out_fasta, "wt") as hw:
         for line in tqdm(hr):
             if line.startswith("#"):
                 continue
             unip, reactome, url_path, description = line.split("\t")[:4]
             record = unip_utils.download_and_load_seqrecord(
                 unip, format=".fasta")
             record.name = ""
             record.description = description + "||" + unip
             record.id = reactome
             bpio.write(record, hw, "fasta")
Esempio n. 10
0
    def download_ena_project(project_id, dst_dir):
        """Download all paired-end fastq files of an ENA project.

        Files are saved as <dst_dir>/<sample>_<experiment>_<run>_{1,2}.fastq.gz
        and their md5 checksums are verified; corrupt files are removed so a
        re-run can fetch them again.

        :param project_id: ENA project accession
        :param dst_dir: destination directory
        :raises Exception: when the ENA file report cannot be retrieved
        """
        dst_dir = os.path.abspath(dst_dir)
        url_template = "https://www.ebi.ac.uk/ena/data/warehouse/filereport?accession=" + project_id + "&result=read_run&fields=sample_accession,experiment_accession,run_accession,fastq_ftp,fastq_md5&download=txt"
        r = requests.get(url_template)
        if r.status_code != 200:
            raise Exception("request error %i" % r.status_code)

        def _fetch_and_check(url, expected_md5, fastq_gz):
            # Download one read file unless the .gz (or its unpacked .fastq)
            # is already present, then verify its md5 sum.
            if (not os.path.exists(fastq_gz)) and (
                    not os.path.exists(fastq_gz[:-3])):
                pbar.set_description(url)
                try:
                    download_file(url, fastq_gz)
                except Exception:
                    _log.warn("error downloading: " + fastq_gz)
                    # BUGFIX: os.remove -- os.rmdir only removes directories,
                    # so partial downloads were never cleaned up before.
                    try:
                        os.remove(fastq_gz)
                    except OSError:
                        pass
            # BUGFIX: only run md5sum when the file exists (it previously
            # crashed via check_output on a missing file).
            if os.path.exists(fastq_gz):
                # BUGFIX: decode the bytes from check_output before comparing
                # with the reported md5 string; the old bytes-vs-str compare
                # always reported a mismatch.
                md5 = sp.check_output("md5sum %s" % fastq_gz,
                                      shell=True).split()[0].strip().decode()
                if expected_md5 != md5:
                    print("%s error md5 sum" % fastq_gz)
                    try:
                        os.remove(fastq_gz)
                    except OSError:
                        pass

        lines = r.text.split("\n")
        with tqdm(lines) as pbar:
            for l in pbar:
                if len(l.strip().split("\t")) <= 3:
                    continue
                (sample_accession, experiment_accession, run_accession,
                 fastq_ftp, fastq_md5) = l.split("\t")
                urls = fastq_ftp.split(";")
                md5s = fastq_md5.split(";")
                # Only paired-end runs (exactly two fastq URLs) are handled.
                if len(urls) != 2:
                    continue
                basefilename = dst_dir + "/" + "_".join(
                    [sample_accession, experiment_accession, run_accession])
                _fetch_and_check(urls[0], md5s[0], basefilename + "_1.fastq.gz")
                _fetch_and_check(urls[1], md5s[1], basefilename + "_2.fastq.gz")
Esempio n. 11
0
# Log to a file so this long-running bootstrap script can be inspected later.
init_log("/tmp/createdb.log")


def old_or_inexistent(filepath, period=30):
    """Return True when ``filepath`` is missing or was last accessed more
    than ``period`` days ago (based on atime)."""
    if not os.path.exists(filepath):
        return True
    days_since_access = (time.time() - os.path.getatime(filepath)) / 60 / 60 / 24
    return days_since_access > period


# Route HTTP/FTP downloads through the institutional proxy.
os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"



# COG orthologous-group definitions (whog): download once if missing.
if not os.path.exists("/data/cog/whog"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/whog",
                  "/data/cog/whog")

# COG protein sequences (myva): download and index for legacy BLAST.
if not os.path.exists("/data/cog/myva"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/myva",
                  "/data/cog/myva")
    execute("formatdb -i /data/cog/myva -o T")

# PRIAM enzyme profiles: fetch the distribution, unzip (exit code 1 is
# tolerated -- unzip warns on some archives), list the profile .chk files
# and build the RPS-BLAST database from them.
if not os.path.exists("/data/ec/PRIAM_MAR15/priam"):
    mkdir("/data/ec/")
    download_file("http://priam.prabi.fr/REL_MAR15/Distribution.zip",
                  "/data/ec/PRIAM_MAR15.zip")
    execute_from("unzip /data/ec/PRIAM_MAR15.zip; exit 0;", "/data/ec/",retcodes=[0,1])

    execute_from("ls /data/ec/PRIAM_MAR15/PROFILES/*.chk > priam", "/data/ec/PRIAM_MAR15/")
    execute_from("formatrpsdb -i /data/ec/PRIAM_MAR15/priam -o T", "/data/ec/PRIAM_MAR15/")
Esempio n. 12
0
 def download_human_prots(dst="/data/databases/human/"):
     """Fetch the UniRef100 human proteome fasta into ``dst``; return its path."""
     unip_url = ("https://www.uniprot.org/uniref/?query=uniprot:(taxonomy:%22Homo"
                 "%20sapiens%20(Human)%20[9606]%22)%20identity:1.0&format=fasta"
                 "&force=true&compress=yes")
     target = dst + Offtarget.DEFAULT_HUMAN_FILENAME
     download_file(unip_url, target, ovewrite=True, timeout=120)
     return target
Esempio n. 13
0
 def download_deg(dst="/data/databases/deg/"):
     """Download and unpack the DEG fasta file of each organism class."""
     for organism_class in ("p", "e", "a"):
         gz_target = f"{dst}/{Offtarget.DEG_FAA_NAMES[organism_class]}.gz"
         download_file(Offtarget.DEG_PROT_URL[organism_class],
                       gz_target,
                       ovewrite=True)
         execute(f"gunzip -f {gz_target}")
Esempio n. 14
0
 def download_pdb_entries(self):
     """Refresh the local copy of the PDB entries index."""
     src, dst = self.url_pdb_entries, self.entries_path
     download_file(src, dst, ovewrite=True)
Esempio n. 15
0
 def download_pdb_seq_ses(self):
     """Refresh the local pdb_seqres sequence dump."""
     src, dst = self.url_pdb_seq_res, self.pdb_seq_res_path
     download_file(src, dst, ovewrite=True)
Esempio n. 16
0
 def download_fasta(uniprot_id, outdir="./", overwrite=False):
     """Download the fasta record of ``uniprot_id`` into ``outdir``."""
     src_url = Uniprot.DEFAULT_UNIPROT_URL + uniprot_id + ".fasta"
     dst_path = f'{outdir}/{uniprot_id}.fasta'
     download_file(src_url, dst_path, overwrite)
Esempio n. 17
0
    def process_file(params):
        """Model a protein from a blast-xml alignment file.

        ``params`` keys: aln_file (blast-xml path), templates2use (max HSPs),
        tmp_dir, output_dir. Returns a list of model results, a list with an
        "errors" dict, or None when a template PDB could not be fetched.
        """
        aln_file = params["aln_file"].strip()
        templates2use = params["templates2use"]
        tmp_dir = params["tmp_dir"]
        output_dir = params["output_dir"]

        try:
            if os.path.getsize(aln_file) < 100:
                return [{"errors": f'\n{aln_file} empty file'}]
            hsps = []
            try:
                hsps = [
                    hsp for query_result in bpsio.parse(aln_file, "blast-xml")
                    for hit in query_result for hsp in hit
                ]
            except ValueError:
                sys.stderr.write(f"error reading alignments in {aln_file}")

            hsps = hsps[:templates2use]
            if not hsps:
                return [{"errors": f'\nno aligments for {aln_file}\n'}]

            seq_id = hsps[0].query.id
            # hit.id encodes "pdbXXXX...C": positions 3:7 are the PDB code,
            # the last char the chain.
            pdb_chains = [[hsp.hit.id[3:7], hsp.hit.id[-1]] for hsp in hsps]
            for pdb, _ in pdb_chains:
                if not os.path.exists(pdb_utils.pdb_path(pdb)):
                    mkdir(pdb_utils.pdb_path_base(pdb))
                    download_file(
                        f"https://files.rcsb.org/download/{pdb.upper()}.pdb.gz",
                        pdb_utils.pdb_path_gzipped(pdb),
                        ovewrite=True)
                    pdb_utils.update_pdb(pdb)
                    # BUGFIX: bail out as soon as ONE template cannot be
                    # fetched. Previously the flag was overwritten on every
                    # iteration, so only the last pdb's status was checked.
                    if not os.path.exists(pdb_utils.pdb_path(pdb)):
                        sys.stderr.write(f'{pdb} could not be updated...\n')
                        return

            pdb_utils.extract_chains(pdb_chains, tmp_dir)
            models_results = []
            for hsp in hsps:
                try:
                    models_result = Modelome.model_hsps(
                        seq_id,
                        os.path.abspath(output_dir), [hsp],
                        refinement=REFINEMENT,
                        models_to_generate=MODELS_TO_GENERATE,
                        assessments=ASSESMENTS,
                        entries={},
                        tmp_dir=tmp_dir,
                        max_models=1)
                except ModellerOverflowError as e:
                    sys.stderr.write(f"error processing {seq_id}: {str(e)}")
                    continue
                models_results.append(models_result)
            return models_results
        except Exception:
            # Narrowed from a bare except; still logs and re-raises.
            sys.stderr.write(f'error processing {aln_file}')
            raise
Esempio n. 18
0
from SNDG.Structure.PDBs import PDBs

# Log to a file so this long-running bootstrap script can be inspected later.
init_log("/tmp/createdb.log")


def old_or_inexistent(filepath, period=30):
    """Return True when ``filepath`` is missing or was last accessed more
    than ``period`` days ago (based on atime)."""
    if not os.path.exists(filepath):
        return True
    days_since_access = (time.time() - os.path.getatime(filepath)) / 60 / 60 / 24
    return days_since_access > period


#os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
#os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"

# Always refresh the PDB entries index.
mkdir("/data/pdb/")
download_file("ftp://ftp.wwpdb.org/pub/pdb/derived_data/index/entries.idx",
              "/data/pdb/entries.idx",
              ovewrite=True)

# Update the local PDB mirror, derive sequences for modelling and build a
# BLAST database from them.
pdbs = PDBs("/data/pdb/")
pdbs.download_pdb_seq_ses()
pdbs.update_pdb_dir()
mkdir("/data/pdb/processed/")
pdbs.pdbs_seq_for_modelling()
execute("makeblastdb -dbtype prot -in /data/pdb/processed/seqs_from_pdb.fasta")

# UniRef90: re-download when missing or last accessed > 30 days ago.
if old_or_inexistent("/data/uniprot/uniref/uniref90/uniref90.fasta"):
    mkdir("/data/uniprot/uniref/uniref90")
    download_file(
        "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz",
        "/data/uniprot/uniref/uniref90/uniref90.fasta.gz",
        ovewrite=True)
Esempio n. 19
0
 def load_structure(self):
     """Fetch the 2PZI structure from RCSB into /tmp."""
     # Renamed local: the target is a file path, not a directory.
     pdb_file = "/tmp/2PZI.pdb"
     download_file("https://files.rcsb.org/view/2PZI.pdb", target=pdb_file, ovewrite=True)