def download_if_changed(job_runner, local_path, ftp_host, ftp_path):
    """Queue a wget download of an FTP file when the remote copy is newer.

    The remote timestamp comes from get_remote_file_changed_time(ftp_host,
    ftp_path) and is compared with the modification time of local_path
    (treated as 0 when local_path is not an existing file).

    If the remote timestamp is greater, a pypez.Job running wget is added to
    job_runner via add_parallel(); nothing is downloaded here directly.
    Otherwise a message saying the local copy is up to date is printed.

    Args:
        job_runner: object exposing add_parallel(job); the queued job is only
            registered here, not executed.
        local_path: destination path of the local copy.
        ftp_host: FTP host name, without the "ftp://" scheme.
        ftp_path: path of the file on the FTP host.
    """
    remote_mtime = get_remote_file_changed_time(ftp_host, ftp_path)

    # a missing local file counts as "infinitely old" so it always gets fetched
    if os.path.isfile(local_path):
        local_mtime = os.path.getmtime(local_path)
    else:
        local_mtime = 0

    url = "ftp://%s/%s" % (ftp_host, ftp_path)

    if remote_mtime > local_mtime:
        # NOTE(review): the "OUT:" prefix is presumably pypez's marker for an
        # output file of the job - confirm against pypez.
        wget_command = "wget %s -O OUT:%s" % (url, local_path)
        job_runner.add_parallel(pypez.Job(wget_command))
        return

    remote_changed_at = datetime.fromtimestamp(remote_mtime)
    message = "Local copy of %s is up to date. The remote version hasn't changed since %s" % (url, remote_changed_at)
    print(message)
# queue a download of the full ClinVar XML release if the copy on the NCBI FTP server is newer than clinvar_xml
download_if_changed(jr, clinvar_xml, "ftp.ncbi.nlm.nih.gov", "/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz")

if clinvar_variant_summary_table:
    # a variant summary table was supplied: it must exist and be gzipped, and it is used as-is (no download)
    # NOTE(review): p is presumably the argparse parser, in which case p.error() prints the message and exits - confirm
    if not os.path.isfile(clinvar_variant_summary_table):
        p.error("ClinVar variant summary table specified but not found: %s" % clinvar_variant_summary_table)
    if not clinvar_variant_summary_table.endswith('.gz'):
        p.error("ClinVar variant summary table expected to be gzipped: %s" % clinvar_variant_summary_table)
    variant_summary_table = clinvar_variant_summary_table
else:
    # no table supplied: use variant_summary.txt.gz in the working directory and refresh it from the FTP server if needed
    print("Checking for new clinvar release")
    variant_summary_table = "variant_summary.txt.gz"
    download_if_changed(jr, variant_summary_table, "ftp.ncbi.nlm.nih.gov", "/pub/clinvar/tab_delimited/variant_summary.txt.gz")

# run whatever download jobs were queued above before building the processing job
jr.run()

job = pypez.Job()

# normalize (convert to minimal representation and left-align)
# the normalization code is in a different repo (useful for more than just clinvar) so here I just wget it:
# NOTE(review): this fetches a script from the master branch of another repo at run time; the version is not pinned
job.add("wget -N https://raw.githubusercontent.com/ericminikel/minimal_representation/master/normalize.py")

for genome_build in ('b37', 'b38'):
    # extract the coordinates for the current genome build, mutant allele, MeasureSet ID and PubMed IDs from the ClinVar XML. This currently takes about 20 minutes.
    # 'b37' -> 'GRCh37', 'b38' -> 'GRCh38'
    genome_build_id = genome_build.replace('b', 'GRCh')
    # not used in the visible part of the loop; presumably consumed by later steps - confirm
    reference_genome = reference_genomes[genome_build]

    # the %(name)s placeholders are filled from locals(), so clinvar_xml, tmp_dir, genome_build and
    # genome_build_id must all be defined in the enclosing scope under exactly these names
    job.add(("python -u IN:parse_clinvar_xml.py "
        "-x IN:%(clinvar_xml)s "
        "-g %(genome_build_id)s "
        "-o OUT:%(tmp_dir)s/clinvar_table_raw.single.%(genome_build)s.tsv "
        "-m OUT:%(tmp_dir)s/clinvar_table_raw.multi.%(genome_build)s.tsv") % locals())