def _process_sub_type(self, sub_type):
    """Load phenotype data for a single sub type.

    Reads the sub type's JSON file, writes one CSV per query template,
    and queues the (query, CSV) pairs for execution against Neo4j.
    """
    self.logger.info("Loading Phenotype Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    self.logger.info("Finished Loading Phenotype Data: %s", sub_type.get_data_provider())

    if data is None:
        self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    commit_size = self.data_type_config.get_neo4j_commit_size()
    # BUG FIX: batch_size was previously taken from get_neo4j_commit_size();
    # every sibling loader derives it from get_generator_batch_size().
    batch_size = self.data_type_config.get_generator_batch_size()

    data_provider = sub_type.get_data_provider()
    # Lazy %-args instead of eager string concatenation.
    self.logger.info("subtype: %s", data_provider)

    # Each entry: (query template, commit size, output CSV name); any extra
    # entries are ignored by process_query_params.
    query_template_list = [
        [self.execute_gene_query_template, commit_size,
         "phenotype_gene_data_" + data_provider + ".csv"],
        [self.execute_allele_query_template, commit_size,
         "phenotype_allele_data_" + data_provider + ".csv"],
        [self.execute_agm_query_template, commit_size,
         "phenotype_agm_data_" + data_provider + ".csv"],
        [self.execute_pges_allele_query_template, commit_size,
         "phenotype_pges_allele_data_" + data_provider + ".csv"],
        [self.execute_pges_agm_query_template, commit_size,
         "phenotype_pges_agm_data_" + data_provider + ".csv"]
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Phenotype-{}: ".format(data_provider))
def load_genes_mod(self, batch_size, testObject, bgiName, loadFile):
    """Download and extract a MOD data dump, then yield its gene lists."""
    download_dir = "tmp"
    # Fetch the archive from S3 and unpack it locally.
    S3File("mod-datadumps", loadFile, download_dir).download()
    TARFile(download_dir, loadFile).extract_all()
    # Parse the extracted BGI JSON and batch it for loading.
    bgi_json = JSONFile().get_data(download_dir + bgiName, 'BGI')
    batched_genes = BGIExt().get_data(bgi_json, batch_size, testObject)
    return self.yield_gene_lists(batched_genes)
def _process_sub_type(self, sub_type):
    """Load sequence targeting reagent / AGM data for one sub type."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Sequence Targeting Reagent Data: %s", provider)
    filepath = sub_type.get_filepath()
    self.logger.info(filepath)
    data = JSONFile().get_data(filepath)
    self.logger.info(
        "Finished Loading Sequence Targeting Reagent Data: %s", provider)

    if data is None:
        self.logger.warning("No Data found for %s skipping", provider)
        return

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # Pair each query template with the CSV name prefix it writes to.
    # The order matches the lists yielded from get_generators.
    template_prefixes = [
        (self.agm_query_template, "agm_data_"),
        (self.agm_secondary_ids_query_template, "agm_secondary_ids_"),
        (self.agm_synonyms_query_template, "agm_synonyms_"),
        (self.agm_components_query_template, "agm_components_"),
        (self.agm_sqtrs_query_template, "agm_sqtrs_"),
        (self.agm_backgrounds_query_template, "agm_backgrounds_"),
    ]
    # Entries are (template, commit size, CSV name); extras are ignored.
    query_template_list = [
        [template, commit_size, prefix + provider + ".csv"]
        for template, prefix in template_prefixes
    ]

    generators = self.get_generators(data, provider, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("AGM-{}: ".format(provider))
def load_allele_objects_mod(self, batch_size, testObject, alleleName, loadFile):
    """Download and extract a MOD data dump, then build the allele dictionary."""
    work_dir = "tmp"
    # Pull the archive from S3 and unpack it next to the loader.
    S3File("mod-datadumps", loadFile, work_dir).download()
    TARFile(work_dir, loadFile).extract_all()
    # Parse the extracted allele JSON and hand it to the extractor.
    raw_alleles = JSONFile().get_data(work_dir + alleleName, 'allele')
    return AlleleExt().get_alleles(raw_alleles, batch_size, testObject)
def load_disease_allele_objects_mod(self, batch_size, testObject, diseaseName, loadFile, graph):
    """Download and extract a MOD data dump, then build allele-disease data."""
    work_dir = "tmp"
    # Pull the archive from S3 and unpack it locally.
    S3File("mod-datadumps", loadFile, work_dir).download()
    TARFile(work_dir, loadFile).extract_all()
    # Parse the extracted disease JSON and transform it.
    # NOTE(review): testObject is accepted but unused here — kept for
    # signature compatibility with the sibling loaders.
    raw_disease = JSONFile().get_data(work_dir + diseaseName, 'disease')
    return DiseaseAlleleExt().get_allele_disease_data(
        raw_disease, batch_size, graph)
def _process_sub_type(self, sub_type):
    """Load HTP metadata dataset data for one sub type into Neo4j."""
    # FIX: use lazy %-args in logging calls instead of eager % formatting.
    logger.info("Loading HTP metadata Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)
    logger.info("Finished Loading HTP metadata Data: %s", sub_type.get_data_provider())

    if data is None:
        # FIX: logger.warn is deprecated; logger.warning is the supported API.
        logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_list = [
        [HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
         "htp_metadataset_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetETL.htp_category_tags_query_template, commit_size,
         "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
         "htp_metadataset_publications_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
         "htp_metadataset_xrefs_" + sub_type.get_data_provider() + ".csv"],
        [HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
         "htp_metadataset_secondaryIds_" + sub_type.get_data_provider() + ".csv"],
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)

    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type):
    """Load variant data for a single sub type and queue it for Neo4j."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Variation Data: %s", provider)
    data = JSONFile().get_data(sub_type.get_filepath())
    self.logger.info("Finished Loading Variation Data: %s", provider)

    if data is None:
        self.logger.warning("No Data found for %s skipping", provider)
        return

    # Order matches the lists yielded from get_generators.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # Entries are (template, commit size, CSV name); extras are ignored.
    query_template_list = [
        [self.variation_query_template, commit_size,
         "variation_data_" + provider + ".csv"],
        [self.genomic_locations_query_template, commit_size,
         "variant_genomiclocations_" + provider + ".csv"],
        [self.so_terms_query_template, commit_size,
         "variant_so_terms_" + provider + ".csv"],
        [self.xrefs_query_template, commit_size,
         "variant_xrefs_" + provider + ".csv"],
    ]

    generators = self.get_generators(data, batch_size)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Var-{}: ".format(provider))
def _process_sub_type(self, sub_type):
    """Load disease annotation data for one sub type and queue it for Neo4j."""
    provider = sub_type.get_data_provider()
    self.logger.info("Loading Disease Data: %s", provider)
    data = JSONFile().get_data(sub_type.get_filepath())
    self.logger.info("Finished Loading Disease Data: %s", provider)

    if data is None:
        self.logger.warning("No Data found for %s skipping", provider)
        return

    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # Pair each query template with the CSV name prefix it writes to;
    # the order matches the lists yielded from get_generators.
    template_prefixes = [
        (self.execute_allele_query_template, "disease_allele_data_"),
        (self.execute_gene_query_template, "disease_gene_data_"),
        (self.execute_agms_query_template, "disease_agms_data_"),
        (self.execute_pges_gene_query_template, "disease_pges_gene_data_"),
        (self.execute_pges_allele_query_template, "disease_pges_allele_data_"),
        (self.execute_pges_agm_query_template, "disease_pges_agms_data_"),
        (self.execute_withs_query_template, "disease_withs_data_"),
        (self.execute_ecode_query_template, "disease_evidence_code_data_"),
        (self.execute_annotation_xrefs_query_template, "disease_annotation_xrefs_data_"),
    ]
    # Entries are (template, commit size, CSV name); extras are ignored.
    query_template_list = [
        [template, commit_size, prefix + provider + ".csv"]
        for template, prefix in template_prefixes
    ]

    generators = self.get_generators(data, batch_size, provider)
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)

    self.error_messages("Disease-{}: ".format(provider))
    self.logger.info("Finished Loading Disease Data: %s", provider)
def _process_sub_type(self, sub_type):
    """Load HTP metadata sample data for one sub type into Neo4j."""
    # FIX: use lazy %-args in logging calls instead of eager % formatting.
    logger.info("Loading HTP metadata sample data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)
    logger.info("Finished Loading HTP metadata sample data: %s", sub_type.get_data_provider())

    if data is None:
        # FIX: logger.warn is deprecated; logger.warning is the supported API.
        logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    provider = sub_type.get_data_provider()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_list = [
        [HTPMetaDatasetSampleETL.htp_dataset_sample_query_template, commit_size,
         "htp_metadataset_sample_samples_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.htp_bio_entity_expression_query_template, commit_size,
         "htp_metadataset_sample_bioentities_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.htp_secondaryIds_query_template, commit_size,
         "htp_metadataset_sample_secondaryIds_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.htp_dataset_join_query_template, commit_size,
         "htp_metadataset_sample_datasets_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.htp_stages_query_template, commit_size,
         "htp_metadataset_sample_stages_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.ao_terms_query_template, commit_size,
         "htp_metadataset_sample_aoterms_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.ao_substructures_query_template, commit_size,
         "htp_metadataset_sample_ao_substructures_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.ao_qualifiers_query_template, commit_size,
         "htp_metadataset_sample_ao_qualifiers_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template, commit_size,
         "htp_metadataset_sample_ao_ss_qualifiers_" + provider + ".csv"],
        # FIX: added the missing "_" separator before the provider name
        # (was "...ccterms" + provider, inconsistent with every other file).
        [HTPMetaDatasetSampleETL.cc_term_query_template, commit_size,
         "htp_metadataset_sample_ccterms_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.ccq_expression_query_template, commit_size,
         "htp_metadataset_sample_ccqterms_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.uberon_ao_query_template, commit_size,
         "htp_metadataset_sample_uberon_ao_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.uberon_ao_other_query_template, commit_size,
         "htp_metadataset_sample_uberon_ao_other_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template, commit_size,
         "htp_metadataset_sample_agms_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.htp_dataset_sample_agmtext_query_template, commit_size,
         "htp_metadataset_sample_agmstext_" + provider + ".csv"],
        [HTPMetaDatasetSampleETL.htp_dataset_sample_assemblies_query_template, commit_size,
         "htp_metadataset_sample_assemblies_" + provider + ".csv"]
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)

    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def _process_sub_type(self, sub_type, query_tracking_list):
    """Load BGI (Basic Gene Information) data for one sub type.

    Reads the sub type's JSON file, writes one CSV per gene query template,
    and appends each (query, file) pair to query_tracking_list so the caller
    can execute the queries later (this method does not run them itself).
    """
    self.logger.info("Loading BGI Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    if filepath is None:
        # No input file means nothing can be loaded; abort the whole run.
        self.logger.error("Can't find input file for %s", sub_type)
        sys.exit()
    data = JSONFile().get_data(filepath)

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # gene_metadata, gene_dataset, secondary_ids, genomic_locations,
    # cross_references, synonyms.
    # This needs to be in this format (template, param1, params2) others will be ignored
    query_template_list = [
        [
            self.gene_metadata_query_template, commit_size,
            "gene_metadata_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.gene_query_template, commit_size,
            "gene_data_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.basic_gene_load_relations_query_template, commit_size,
            "gene_data_load_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.basic_gene_species_relations_query_template, commit_size,
            "gene_data_species_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.so_terms_query_template, commit_size,
            "gene_so_terms_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.chromosomes_query_template, commit_size,
            "gene_chromosomes_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.gene_secondary_ids_query_template, commit_size,
            "gene_secondary_ids_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.genomic_locations_query_template, commit_size,
            "gene_genomic_locations_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.xrefs_query_template, commit_size,
            "gene_cross_references_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            self.xrefs_relationships_query_template, commit_size,
            "gene_cross_references_relationships_" + sub_type.get_data_provider() + ".csv"
        ],
        [
            # NOTE(review): synonyms use a hard-coded 600000 commit size
            # instead of commit_size — presumably intentional for the much
            # larger synonym volume; confirm before changing.
            self.gene_synonyms_query_template, 600000,
            "gene_synonyms_" + sub_type.get_data_provider() + ".csv"
        ]
    ]

    # Obtain the generator
    generators = self.get_generators(data, sub_type.get_data_provider(), batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    # Queries are queued for the caller to execute, not run here.
    for item in query_and_file_list:
        query_tracking_list.append(item)

    self.error_messages("BGI-{}: ".format(sub_type.get_data_provider()))

    self.logger.info("Finished Loading BGI Data: %s", sub_type.get_data_provider())
def _process_sub_type(self, sub_type):
    """Load construct data for one sub type and queue it into Neo4j."""
    self.logger.info("Loading Construct Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    data = JSONFile().get_data(filepath)
    self.logger.info("Finished Loading Construct Data: %s", sub_type.get_data_provider())

    if data is None:
        self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    provider = sub_type.get_data_provider()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_template_list = [
        [ConstructETL.construct_query_template, commit_size,
         "Construct_data_" + provider + ".csv"],
        [ConstructETL.construct_secondary_ids_query_template, commit_size,
         "Construct_secondary_ids_" + provider + ".csv"],
        [ConstructETL.construct_synonyms_query_template, commit_size,
         "Construct_synonyms_" + provider + ".csv"],
        [ConstructETL.construct_xrefs_query_template, commit_size,
         "Construct_xrefs_" + provider + ".csv"],
        [ConstructETL.non_bgi_component_query_template, commit_size,
         "Construct_non_bgi_component_" + provider + ".csv"],
        # FIX: added the missing "_" separators before the provider name
        # (were "Construct_components_gene" + provider etc., inconsistent
        # with every other generated file name).
        [ConstructETL.construct_gene_component_query_template, commit_size,
         "Construct_components_gene_" + provider + ".csv"],
        [ConstructETL.construct_no_gene_component_query_template, commit_size,
         "Construct_components_no_gene_" + provider + ".csv"]
    ]

    # Obtain the generator
    generators = self.get_generators(data, provider, batch_size)

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
def __init__(self, config_file_loc):
    """Initialise the loader configuration.

    Loads the user config YAML and the validation schema, pulls the
    submission-system snapshot from the FMS API, merges in the local
    placeholder data files, and finally runs process_config().

    Exits the process with -1 if the FMS API does not return HTTP 200.
    """
    self.context_info = ContextInfo()

    # Load config yaml.
    # FIX: open files with context managers — the previous code leaked
    # both file handles (open() with no close()).
    logger.debug('Loading config file: %s', config_file_loc)
    with open(config_file_loc, 'r') as config_file:
        self.config_data = yaml.load(config_file, Loader=yaml.SafeLoader)
    logger.debug("Config Data: %s", self.config_data)

    # Load validation yaml.
    validation_yaml_file_loc = os.path.abspath('src/config/validation.yml')
    logger.debug('Loading validation schema: %s', validation_yaml_file_loc)
    with open(validation_yaml_file_loc, 'r') as validation_schema_file:
        self.validation_schema = yaml.load(validation_schema_file, Loader=yaml.SafeLoader)

    # Assign values for thread counts.
    self.FileTransactorThreads = self.config_data['FileTransactorThreads']

    # Loading a JSON blurb from a file as a placeholder for submission system query.
    other_file_meta_data = os.path.abspath('src/config/local_submission.json')
    self.non_submission_system_data = JSONFile().get_data(other_file_meta_data)

    urllib3.disable_warnings()
    self.http = urllib3.PoolManager()

    # use the recently created snapshot
    api_url = self.context_info.env[
        "FMS_API_URL"] + '/api/snapshot/release/' + self.context_info.env[
        "ALLIANCE_RELEASE"]
    logger.info(api_url)

    submission_data = self.http.request('GET', api_url)
    if submission_data.status != 200:
        logger.error("Status: %s", submission_data.status)
        logger.error("No Data came from API: %s", api_url)
        sys.exit(-1)

    self.snapshot_submission_system_data = json.loads(
        submission_data.data.decode('UTF-8'))
    logger.debug(self.snapshot_submission_system_data)

    # Merge local placeholder data files into the snapshot's file list.
    for dataFile in self.non_submission_system_data['snapShot']['dataFiles']:
        self.snapshot_submission_system_data['snapShot'][
            'dataFiles'].append(dataFile)
    logger.debug(self.snapshot_submission_system_data)

    # List used for MOD and data type objects.
    self.master_data_dictionary = {}

    # Dictionary for transformed submission system data.
    self.transformed_submission_system_data = {}

    # process config file during initialization
    self.process_config()
def _process_sub_type(self, sub_type):
    """Load allele data for one sub type and queue it into Neo4j."""
    # FIX: use lazy %-args in logging calls instead of eager % formatting.
    logger.info("Loading Allele Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_filepath()
    logger.info(filepath)
    data = JSONFile().get_data(filepath)

    if data is None:
        # FIX: logger.warn is deprecated; logger.warning is the supported API.
        logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
        return

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    # This needs to be in this format (template, param1, params2) others will be ignored
    query_list = [
        [AlleleETL.allele_gene_no_construct_query_template, commit_size,
         "allele_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_construct_gene_query_template, commit_size,
         "allele_construct_gene_data_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_construct_no_gene_query_template, commit_size,
         "allele_construct_no_gene_data_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_no_gene_no_construct_query_template, commit_size,
         "allele_no_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_secondaryids_template, commit_size,
         "allele_secondaryids_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_synonyms_template, commit_size,
         "allele_synonyms_" + sub_type.get_data_provider() + ".csv"],
        [AlleleETL.allele_xrefs_template, commit_size,
         "allele_xrefs_" + sub_type.get_data_provider() + ".csv"],
    ]

    # Obtain the generator
    generators = self.get_generators(data, batch_size)

    query_and_file_list = self.process_query_params(query_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.error_messages("Allele-{}: ".format(sub_type.get_data_provider()))

    logger.info("Finished Loading Allele Data: %s", sub_type.get_data_provider())