Esempio n. 1
0
def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
    """Queue a wget download of an FTP file if it is newer than the local copy.

    The remote change time (from ``get_remote_file_changed_time``) is compared
    with the local file's mtime. A missing local file counts as mtime 0.
    Nothing is downloaded here: a job is only added to ``job_runner``.

    Args:
        job_runner: Object exposing ``add_parallel(job)``; receives the
            download job when one is needed.
        local_path: Destination path of the local copy.
        ftp_host: FTP server host name.
        ftp_path: Path of the file on the FTP server.
    """
    remote_changed_time = get_remote_file_changed_time(ftp_host, ftp_path)
    # A local file that does not exist yet is treated as changed at time 0.
    local_changed_time = os.path.getmtime(local_path) if os.path.isfile(local_path) else 0
    ftp_address = "ftp://%s/%s" % (ftp_host, ftp_path)
    if remote_changed_time > local_changed_time:
        job_runner.add_parallel(pypez.Job("wget %s -O OUT:%s" % (ftp_address, local_path)))
    else:
        # This line was previously indented with a tab while the rest of the
        # function uses spaces, which Python 3 rejects with a TabError.
        print("Local copy of %s is up to date. The remote version hasn't changed since %s" % (ftp_address, datetime.fromtimestamp(remote_changed_time)))
Esempio n. 2
0
    download_if_changed(jr, clinvar_xml,  "ftp.ncbi.nlm.nih.gov", "/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz")

# Decide which gzipped variant summary table to use: with no user-supplied
# path, check NCBI for a newer release; otherwise validate the given path.
if not clinvar_variant_summary_table:
    print("Checking for new clinvar release")
    variant_summary_table = "variant_summary.txt.gz"
    download_if_changed(
        jr,
        variant_summary_table,
        "ftp.ncbi.nlm.nih.gov",
        "/pub/clinvar/tab_delimited/variant_summary.txt.gz",
    )
else:
    # NOTE(review): p is presumably the argparse parser, whose error() exits
    # the program - confirm against the part of the script not shown here.
    if not os.path.isfile(clinvar_variant_summary_table):
        p.error(
            "ClinVar variant summary table specified but not found: %s"
            % clinvar_variant_summary_table
        )
    if not clinvar_variant_summary_table.endswith('.gz'):
        p.error(
            "ClinVar variant summary table expected to be gzipped: %s"
            % clinvar_variant_summary_table
        )
    variant_summary_table = clinvar_variant_summary_table

# Run whatever jobs have been added to the runner so far.
jr.run()

# Fresh pypez job; the commands below are added to it with job.add().
# NOTE(review): presumably the commands are only queued here and executed when
# the job is run later - confirm against pypez.
job = pypez.Job()

# Normalize variants (convert to minimal representation and left-align).
# The normalization code lives in a different repo (it is useful for more than
# just ClinVar), so it is simply fetched with wget here. The -N flag turns on
# wget's timestamping, so the script is re-downloaded only when the remote
# copy is newer than the local one.
job.add("wget -N https://raw.githubusercontent.com/ericminikel/minimal_representation/master/normalize.py")

# Add the parsing step once per genome build ('b37' and 'b38').
for genome_build in ('b37', 'b38'):
    # Extract this build's coordinates, mutant allele, MeasureSet ID and
    # PubMed IDs from the ClinVar XML. This currently takes about 20 minutes.
    # 'b37' -> 'GRCh37', 'b38' -> 'GRCh38'; the result is passed as -g below.
    genome_build_id = genome_build.replace('b', 'GRCh')
    # reference_genomes is defined outside this section.
    # NOTE(review): presumably maps build name -> reference genome path - confirm.
    reference_genome = reference_genomes[genome_build]

    # The %(name)s placeholders are filled from locals(), so clinvar_xml,
    # tmp_dir, genome_build_id and genome_build must all be defined here.
    # NOTE(review): the IN:/OUT: prefixes look like pypez markers for a
    # command's input and output files - confirm against pypez.
    job.add(("python -u IN:parse_clinvar_xml.py "
            "-x IN:%(clinvar_xml)s "
            "-g %(genome_build_id)s "
            "-o OUT:%(tmp_dir)s/clinvar_table_raw.single.%(genome_build)s.tsv "
            "-m OUT:%(tmp_dir)s/clinvar_table_raw.multi.%(genome_build)s.tsv") % locals())