def get_ontology_hpo(self, conf, output, riot):
    hpo_filename = self.download_converted_file(conf.etl.hpo, output, riot)
    hpo = HPO(hpo_filename)
    hpo.generate()
    create_output_dir(output.prod_dir + "/" + conf.etl.hpo.path)
    hpo.save_hpo(output.prod_dir + "/" + conf.etl.hpo.path + "/" +
                 conf.etl.hpo.output_filename)

def get_ontology_mondo(self, conf, output, riot):
    mondo_filename = self.download_converted_file(conf.etl.mondo, output, riot)
    mondo = MONDO(mondo_filename)
    mondo.generate()
    create_output_dir(output.prod_dir + "/" + conf.etl.mondo.path)
    mondo.save_mondo(output.prod_dir + "/" + conf.etl.mondo.path + "/" +
                     conf.etl.mondo.output_filename)

def get_hpo_phenotypes(self, conf, output):
    hpo_pheno_filename = Downloads.dowload_staging_http(
        output.staging_dir, conf.etl.hpo_phenotypes)
    hpo_phenotypes = HPOPhenotypes(hpo_pheno_filename)
    create_output_dir(output.prod_dir + "/" + conf.etl.hpo_phenotypes.path)
    hpo_phenotypes.run(output.prod_dir + "/" + conf.etl.hpo_phenotypes.path +
                       "/" + conf.etl.hpo_phenotypes.output_filename)

def create_output_structure(self, output_dir):
    """Create the default output structure: a prod and a staging directory."""
    # A plain if/else reads better than the original ternary-as-statement.
    if self.args.force_clean:
        remove_output_dir(output_dir)
    else:
        logger.info("Warning: Output not deleted.")
    self.yaml.outputs.prod_dir = create_output_dir(output_dir + '/prod')
    self.yaml.outputs.staging_dir = create_output_dir(output_dir + '/staging')

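# The directory helpers used throughout this file (create_output_dir,
# remove_output_dir) are defined elsewhere in the repo. This is a minimal sketch
# of the semantics the callers rely on, assuming thin wrappers over os.makedirs
# and shutil.rmtree; note that create_output_dir must return the created path,
# since several callers feed its result straight into os.path.join.
import os
import shutil

def create_output_dir(path):
    # Create the directory (and parents) if missing, then hand back the path.
    os.makedirs(path, exist_ok=True)
    return path

def remove_output_dir(path):
    # Delete the whole tree; ignore the error if it does not exist yet.
    shutil.rmtree(path, ignore_errors=True)
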
def process(self, conf, output, cmd_conf):
    riot = Riot(cmd_conf)
    filename_input = Downloads.dowload_staging_http(output.staging_dir, conf.etl)
    file_output_path = os.path.join(output.prod_dir, conf.etl.path)
    create_output_dir(file_output_path)
    riot.convert_owl_to_jsonld(filename_input, file_output_path, conf.etl.owl_jq)

def get_project_scores(self, project_score_entry, output):
    logger.info("Downloading project scores target files")
    # We only want one file from the zipped archive.
    file_of_interest = 'EssentialityMatrices/04_binaryDepScores.tsv'
    file_input = Downloads.dowload_staging_http(output.staging_dir,
                                                project_score_entry)
    output_dir = os.path.join(output.prod_dir, project_score_entry.path)
    create_output_dir(output_dir)
    extract_file_from_zip(file_of_interest, file_input, output_dir)

def get_ontology_EFO(self, conf, output, riot):
    efo_filename = self.download_converted_file(conf.etl.efo, output, riot)
    efo = EFO(efo_filename)
    efo.generate()
    create_output_dir(output.prod_dir + "/" + conf.etl.efo.path)
    efo.save_static_disease_file(output.prod_dir + "/" + conf.etl.efo.path +
                                 "/" + conf.etl.efo.diseases_static_file)
    efo.save_diseases(output.prod_dir + "/" + conf.etl.efo.path + "/" +
                      conf.etl.efo.output_filename)

def process(self, conf, output, cmd_conf):
    download = DownloadResource(output.staging_dir)
    uri_release = conf.uri.replace("{release}", str(conf.release))
    create_output_dir(os.path.join(output.prod_dir, conf.path, str(conf.release)))
    jq_cmd = Utils.check_path_command("jq", cmd_conf.jq)
    for species in conf.resources:
        logger.debug(f'Downloading files for {species}')
        filename_json = self.download_species(uri_release, conf.release,
                                              output.staging_dir, download,
                                              species)
        self.extract_fields_from_json(filename_json, conf, output, jq_cmd)

def _download_selected_event_files(self, repo_metadata, output):
    downloaded_files = []
    if repo_metadata:
        logger.info("OpenFDA FAERs metadata received")
        fda_output = create_output_dir(
            os.path.join(output.prod_dir, "fda-inputs"))
        fda = OpenfdaHelper(fda_output)
        partitions = repo_metadata['results']['drug']['event']['partitions']
        # Gather the event partitions in parallel, one worker per CPU.
        logger.info("Prepare download pool of {} processes".format(
            mp.cpu_count()))
        try:
            with mp.Pool(mp.cpu_count()) as download_pool:
                # imap_unordered yields each result as it completes, so tqdm
                # can show live progress; map() would block until all finish.
                for result in tqdm.tqdm(
                        download_pool.imap_unordered(
                            fda._do_download_openfda_event_file, partitions),
                        total=len(partitions)):
                    downloaded_files.append(result)
        except Exception as e:
            logger.error("Something went wrong: " + str(e))
    return downloaded_files

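# A self-contained sketch of the parallel-download pattern used above; the names
# here (fetch_one, urls) are hypothetical and not part of the repo. The point is
# the imap_unordered + tqdm combination: results arrive as workers finish, so
# the progress bar advances continuously.
import multiprocessing as mp

import tqdm

def fetch_one(url):
    # Hypothetical worker: download `url` and return the local file path.
    return url

if __name__ == "__main__":
    urls = ["https://example.org/a.json", "https://example.org/b.json"]
    with mp.Pool(processes=min(mp.cpu_count(), len(urls))) as pool:
        paths = list(tqdm.tqdm(pool.imap_unordered(fetch_one, urls),
                               total=len(urls)))
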
def get_normal_tissues(self, output, resource):
    filename = Downloads.dowload_staging_http(output.staging_dir, resource)
    filename_unzip = make_unzip_single_file(filename)
    gzip_filename = os.path.join(
        create_output_dir(os.path.join(output.prod_dir, resource.path)),
        resource.output_filename.replace('{suffix}', self.suffix))
    make_gzip(filename_unzip, gzip_filename)

def get_gnomad(self, gnomad, output):
    filename = Downloads.dowload_staging_http(output.staging_dir, gnomad)
    filename_unzip = make_ungzip(filename)
    gzip_filename = os.path.join(
        create_output_dir(os.path.join(output.prod_dir, gnomad.path)),
        gnomad.output_filename)
    make_gzip(filename_unzip, gzip_filename)

def save_tissue_translation_map(self, output_path, resource, filename):
    # The with block already closes the file; the original explicit close()
    # inside it was redundant.
    with URLZSource(filename).open(mode='rb') as r_file:
        tissues = json.load(r_file)['tissues']
    create_output_dir(os.path.join(output_path, resource.path))
    filename_tissue = os.path.join(
        output_path, resource.path,
        resource.output_filename.replace('{suffix}', self.suffix))
    with jsonlines.open(filename_tissue, mode='w') as writer:
        for tissue_id, tissue in tissues.items():
            # Copy the per-tissue object and fold its key in as 'tissue_id'.
            entry = dict(tissue)
            entry['tissue_id'] = tissue_id
            writer.write(entry)

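# Illustrative shape only (the real keys depend on the upstream file): if the
# source JSON contains {"tissues": {"<tissue_id>": {...attributes...}, ...}},
# each JSONL line written above is that per-tissue object with its key folded
# in, e.g. {"label": "...", "tissue_id": "<tissue_id>"}.
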
def get_subcellular_location(self, sub_location, output):
    filename = Downloads.dowload_staging_http(output.staging_dir, sub_location)
    filename_unzip = make_unzip_single_file(filename)
    gzip_filename = os.path.join(
        create_output_dir(os.path.join(output.prod_dir, sub_location.path)),
        sub_location.output_filename)
    make_gzip(filename_unzip, gzip_filename)

def extract_ensembl(self, ensembl, output, cmd):
    logger.info("Converting Ensembl json file into jsonl.")
    jq_cmd = Utils.check_path_command("jq", cmd.jq)
    resource_stage = Dict()
    resource_stage.uri = ensembl.uri.replace('{release}', str(ensembl.release))
    file_input = Downloads.dowload_staging_ftp(output.staging_dir,
                                               resource_stage)
    output_dir = os.path.join(output.prod_dir, ensembl.path)
    output_file = os.path.join(create_output_dir(output_dir),
                               ensembl.output_filename)
    with open(output_file, "wb") as jsonwrite:
        # Run jq over the downloaded file and write its stdout to the output.
        jqp = subprocess.Popen([jq_cmd, "-c", ensembl.jq, file_input],
                               stdout=subprocess.PIPE)
        jsonwrite.write(jqp.stdout.read())

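# A streaming variant of the jq call above (a sketch, not the project's code;
# jq_to_file is a hypothetical helper): shutil.copyfileobj copies the jq output
# in chunks instead of buffering it all in memory with read(), and the Popen
# context manager waits on the child process when the block exits.
import shutil
import subprocess

def jq_to_file(jq_cmd, jq_filter, file_input, output_file):
    with open(output_file, "wb") as out, \
            subprocess.Popen([jq_cmd, "-c", jq_filter, file_input],
                             stdout=subprocess.PIPE) as jqp:
        shutil.copyfileobj(jqp.stdout, out)
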
def owl_to_json(self, filename_input, output_dir, resource, riot):
    file_output_path = output_dir + "/" + resource.path
    create_output_dir(file_output_path)
    return riot.convert_owl_to_jsonld(filename_input, file_output_path,
                                      resource.owl_jq)

def download_indices(self, conf, output):
    output_dir = create_output_dir(output.prod_dir + "/" + conf.etl.chembl.path)
    es_files_written = self._handle_elasticsearch(conf.etl.chembl, output_dir)
    return es_files_written