def run_loader(self):
    """Main function for running the loader."""
    if self.args.verbose:
        self.logger.warning('DEBUG mode enabled!')
        time.sleep(3)

    data_manager = DataFileManager(self.context_info.config_file_location)
    file_transactor = FileTransactor()

    file_transactor.start_threads(
        data_manager.get_file_transactor_thread_settings())
    data_manager.download_and_validate()
    self.logger.debug("finished downloading, now checking threads")
    file_transactor.check_for_thread_errors()
    self.logger.debug("finished threads, waiting for queues")
    file_transactor.wait_for_queues()
    self.logger.debug("finished queues, waiting for shutdown")
    file_transactor.shutdown()

    neo_transactor = Neo4jTransactor()
    neo_transactor.start_threads(
        data_manager.get_neo_transactor_thread_settings())
    self.logger.debug("finished starting neo threads")

    if not self.context_info.env["USING_PICKLE"]:
        self.logger.info("Creating indices.")
        Neo4jHelper.create_indices()

    etl_time_tracker_list = self.run_etl_groups(self.logger,
                                                data_manager,
                                                neo_transactor)
    neo_transactor.shutdown()

    elapsed_time = time.time() - self.start_time
    for time_item in etl_time_tracker_list:
        self.logger.info(time_item)
    self.logger.info('Loader finished. Elapsed time: %s',
                     time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
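# A minimal sketch of the elapsed-time formatting pattern run_loader uses
# above. format_elapsed is a hypothetical helper, not part of this module;
# it assumes durations under 24 hours, since time.gmtime wraps at one day.
import time

def format_elapsed(seconds):
    """Render a duration in seconds as HH:MM:SS (wraps past 24 hours)."""
    return time.strftime("%H:%M:%S", time.gmtime(seconds))

# Example: format_elapsed(3725.0) returns '01:02:05'.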
def _load_and_process_data(self):
    """Create the gene descriptions data manager and load common data."""
    context_info = ContextInfo()
    data_manager = DataFileManager(context_info.config_file_location)
    go_annot_config = data_manager.get_config('GAF')
    go_annot_sub_dict = {sub.get_data_provider(): sub
                         for sub in go_annot_config.get_sub_type_objects()}
    this_dir = os.path.split(__file__)[0]
    gd_config = GenedescConfigParser(os.path.join(this_dir, os.pardir, os.pardir,
                                                  "gene_descriptions.yml"))
    gd_data_manager = DataManager(do_relations=None,
                                  go_relations=["subClassOf", "BFO:0000050"])
    gd_data_manager.set_ontology(ontology_type=DataType.GO,
                                 ontology=self.get_ontology(data_type=DataType.GO),
                                 config=gd_config)
    gd_data_manager.set_ontology(ontology_type=DataType.DO,
                                 ontology=self.get_ontology(data_type=DataType.DO),
                                 config=gd_config)

    # Generate descriptions for each MOD.
    for prvdr in [sub_type.get_data_provider().upper()
                  for sub_type in self.data_type_config.get_sub_type_objects()]:
        gd_config_mod_specific = copy.deepcopy(gd_config)
        if prvdr == "WB":
            gd_config_mod_specific.config["expression_sentences_options"][
                "remove_children_if_parent_is_present"] = True
        self.logger.info("Generating gene descriptions for %s", prvdr)
        data_provider = prvdr if prvdr != "HUMAN" else "RGD"
        json_desc_writer = DescriptionsWriter()
        go_annot_path = "file://" + os.path.join(
            os.getcwd(), "tmp", go_annot_sub_dict[prvdr].file_to_download)
        gd_data_manager.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=go_annot_path,
            associations_cache_path=os.path.join(os.getcwd(), "tmp", "gd_cache",
                                                 "go_annot_" + prvdr + ".gaf"),
            config=gd_config_mod_specific)
        gd_data_manager.set_associations(
            associations_type=DataType.DO,
            associations=self.get_disease_annotations_from_db(
                data_provider=data_provider,
                gd_data_manager=gd_data_manager,
                logger=self.logger),
            config=gd_config_mod_specific)
        if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
            gd_data_manager.set_ontology(
                ontology_type=DataType.EXPR,
                ontology=self.get_ontology(data_type=DataType.EXPR, provider=prvdr),
                config=gd_config_mod_specific)
            gd_data_manager.set_associations(
                associations_type=DataType.EXPR,
                associations=self.get_expression_annotations_from_db(
                    data_provider=data_provider,
                    gd_data_manager=gd_data_manager,
                    logger=self.logger),
                config=gd_config_mod_specific)
        commit_size = self.data_type_config.get_neo4j_commit_size()
        generators = self.get_generators(prvdr, gd_data_manager,
                                         gd_config_mod_specific, json_desc_writer)
        query_template_list = [
            [self.gene_descriptions_query_template, commit_size,
             "genedescriptions_data_" + prvdr + ".csv"]
        ]
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.save_descriptions_report_files(data_provider=prvdr,
                                            json_desc_writer=json_desc_writer,
                                            context_info=context_info,
                                            gd_data_manager=gd_data_manager)
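# A minimal sketch of the per-provider config override pattern used in
# _load_and_process_data above: deep-copy the shared config so that a
# MOD-specific tweak (here, WB's expression-sentence option) cannot leak
# into the runs for other providers. The nested dict layout mirrors the
# keys accessed above; base_config itself is illustrative.
import copy

base_config = {"expression_sentences_options":
               {"remove_children_if_parent_is_present": False}}
wb_config = copy.deepcopy(base_config)
wb_config["expression_sentences_options"]["remove_children_if_parent_is_present"] = True

# The shared config is untouched; only the WB copy carries the override.
assert base_config["expression_sentences_options"][
    "remove_children_if_parent_is_present"] is False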
def get_generators(self, variant_data, batch_size):  # noqa
    """Get Generators."""
    data_providers = []
    release = ""
    variants = []
    variant_genomic_locations = []
    variant_so_terms = []
    cross_references = []
    counter = 0
    date_produced = variant_data['metaData']['dateProduced']
    self.data_providers_process(variant_data)
    load_key = date_produced + self.data_provider + "_VARIATION"
    if 'release' in variant_data['metaData']:
        release = variant_data['metaData']['release']

    assemblies = {}
    for allele_record in variant_data['data']:
        chromosome = allele_record["chromosome"]
        if chromosome.startswith("chr"):
            chromosome_str = chromosome[3:]
        else:
            chromosome_str = chromosome

        assembly = allele_record["assembly"]
        if assembly not in assemblies:
            self.logger.info(assembly)
            context_info = ContextInfo()
            data_manager = DataFileManager(context_info.config_file_location)
            assemblies[assembly] = AssemblySequenceHelper(assembly, data_manager)

        so_term_id = allele_record.get('type')
        genomic_reference_sequence = allele_record.get('genomicReferenceSequence')
        genomic_variant_sequence = allele_record.get('genomicVariantSequence')
        if genomic_reference_sequence == 'N/A':
            genomic_reference_sequence = ""
        if genomic_variant_sequence == 'N/A':
            genomic_variant_sequence = ""

        padding_left = ""
        padding_right = ""
        if allele_record.get('start') != "" and allele_record.get('end') != "":
            # SO:0000667 is an insertion, which has no reference span.
            if so_term_id != "SO:0000667" \
                    and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                genomic_reference_sequence = assemblies[assembly].get_sequence(
                    chromosome_str,
                    allele_record.get('start'),
                    allele_record.get('end'))

            if allele_record.get('start') < allele_record.get('end'):
                start = allele_record.get('start')
                end = allele_record.get('end')
            else:
                start = allele_record.get('end')
                end = allele_record.get('start')

            padding_width = 500
            if so_term_id != "SO:0000667":  # not an insertion
                start = start - 1
                end = end + 1

            left_padding_start = start - padding_width
            if left_padding_start < 1:
                left_padding_start = 1
            padding_left = assemblies[assembly].get_sequence(
                chromosome_str, left_padding_start, start)
            right_padding_end = end + padding_width
            padding_right = assemblies[assembly].get_sequence(
                chromosome_str, end, right_padding_end)

        counter = counter + 1
        global_id = allele_record.get('alleleId')
        mod_global_cross_ref_id = ""
        cross_references = []

        if self.test_object.using_test_data():
            is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
            if not is_it_test_entry:
                counter = counter - 1
                continue

        # Guard before splitting: the None check originally came after the
        # split, which would raise on records without an accession number.
        cross_ref_primary_id = allele_record.get('sequenceOfReferenceAccessionNumber')
        if cross_ref_primary_id is not None:
            prefix, local_cross_ref_id = cross_ref_primary_id.split(":")[:2]
            cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                local_cross_ref_id, prefix, global_id)
            xref_map = ETLHelper.get_xref_dict(
                local_cross_ref_id,
                prefix,
                "variant_sequence_of_reference",
                "sequence_of_reference_accession_number",
                global_id,
                cross_ref_complete_url,
                cross_ref_primary_id + "variant_sequence_of_reference")
            xref_map['dataId'] = global_id
            cross_references.append(xref_map)

        if genomic_reference_sequence is not None:
            if len(genomic_reference_sequence) > 1000 \
                    and allele_record.get('type') in ('SO:1000002', 'SO:1000008'):
                self.logger.debug("%s genomicReferenceSequence",
                                  allele_record.get('alleleId'))
        if genomic_variant_sequence is not None:
            if len(genomic_variant_sequence) > 1000 \
                    and allele_record.get('type') in ('SO:1000002', 'SO:1000008'):
                self.logger.debug("%s genomicVariantSequence",
                                  allele_record.get('alleleId'))

        hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
            allele_record.get('sequenceOfReferenceAccessionNumber'),
            allele_record.get('type'),
            allele_record.get('start'),
            allele_record.get('end'),
            genomic_reference_sequence,
            genomic_variant_sequence,
            allele_record.get('assembly'),
            chromosome_str)

        if (genomic_reference_sequence is not None
                and len(genomic_reference_sequence) > 30000) \
                or (genomic_variant_sequence is not None
                    and len(genomic_variant_sequence) > 30000):
            self.logger.debug("%s potentially has too long a sequence",
                              allele_record.get('alleleId'))

        # TODO: fix typo in MGI Submission for this variant so
        # that it doesn't list a 40K bp point mutation.
        if allele_record.get('alleleId') != 'MGI:6113870':
            variant_dataset = {
                "hgvs_nomenclature": hgvs_nomenclature,
                "genomicReferenceSequence": genomic_reference_sequence,
                "genomicVariantSequence": genomic_variant_sequence,
                "paddingLeft": padding_left,
                "paddingRight": padding_right,
                "alleleId": allele_record.get('alleleId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "dataProvider": self.data_provider,
                "variantHGVSSynonym": hgvs_synonym
            }

            variant_genomic_location_dataset = {
                "variantId": hgvs_nomenclature,
                "assembly": allele_record.get('assembly'),
                "chromosome": chromosome_str,
                "start": allele_record.get('start'),
                "end": allele_record.get('end'),
                "uuid": str(uuid.uuid4()),
                "dataProvider": self.data_provider
            }

            variant_so_term = {
                "variantId": hgvs_nomenclature,
                "soTermId": allele_record.get('type')
            }

            variant_so_terms.append(variant_so_term)
            variant_genomic_locations.append(variant_genomic_location_dataset)
            variants.append(variant_dataset)

        if counter == batch_size:
            yield [variants,
                   variant_genomic_locations,
                   variant_so_terms,
                   cross_references]
            counter = 0  # reset, otherwise later batches never reach batch_size
            variants = []
            variant_genomic_locations = []
            variant_so_terms = []
            cross_references = []

    if counter > 0:
        yield [variants,
               variant_genomic_locations,
               variant_so_terms,
               cross_references]
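# A minimal sketch of the batching pattern get_generators implements above:
# accumulate records, yield a full batch once batch_size is reached, reset
# the accumulators, and flush any partial batch at the end. batch_records
# is an illustrative stand-in, not part of this ETL.
def batch_records(records, batch_size):
    """Yield lists of at most batch_size records."""
    batch = []
    counter = 0
    for record in records:
        batch.append(record)
        counter += 1
        if counter == batch_size:
            yield batch
            batch = []
            counter = 0  # reset so the next batch can fill up again
    if counter > 0:
        yield batch  # flush the final partial batch

# Example: list(batch_records(range(5), 2)) -> [[0, 1], [2, 3], [4]]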
def run_preprocessor(self):
    """Main function for running the preprocessor."""
    if args.verbose:
        logger.warning('DEBUG mode enabled!')
        time.sleep(3)

    start_time = time.time()
    data_manager = DataFileManager(context_info.config_file_location)
    logger.info("config_file_location %s", context_info.config_file_location)

    ft = FileTransactor()
    ft.start_threads(data_manager.get_FT_thread_settings())
    data_manager.download_and_validate()
    logger.info("finished downloading, now checking threads")
    ft.check_for_thread_errors()
    logger.info("finished threads, waiting for queues")
    ft.wait_for_queues()
    logger.info("finished queues, waiting for shutdown")
    ft.shutdown()

    configs_dict = {
        'INTERACTION-SOURCE-MOL': ['INTERACTION-SOURCE', 'BGI'],
        'INTERACTION-SOURCE-GEN': ['INTERACTION-SOURCE', 'BGI']
    }
    processor_dispatch = {
        'INTERACTION-SOURCE-MOL': InteractionMolecularProcessor,
        'INTERACTION-SOURCE-GEN': InteractionGeneticProcessor
    }
    list_of_processor_groups = [
        ['INTERACTION-SOURCE-MOL', 'INTERACTION-SOURCE-GEN']
    ]

    processor_time_tracker_list = []
    for processor_group in list_of_processor_groups:
        processor_group_start_time = time.time()
        logger.info("Starting Processor group: %s", processor_group)
        thread_pool = []
        for processor_name in processor_group:
            logger.info("Processor Name: %s", processor_name)
            configs = []
            for config_type in configs_dict[processor_name]:
                config = data_manager.get_config(config_type)
                if config is not None:
                    configs.append(config)
                else:
                    logger.info("No Config found for: %s %s",
                                processor_name, config_type)
            if len(configs) > 0:
                processor = processor_dispatch[processor_name](configs)
                p = multiprocessing.Process(target=processor.run_processor)
                p.start()
                thread_pool.append(p)
            else:
                logger.info("No Configs found for: %s", processor_name)

        Processor.wait_for_threads(thread_pool)
        logger.info("Waiting for Queues to sync up")
        processor_elapsed_time = time.time() - processor_group_start_time
        processor_time_message = (
            "Finished Processor group: %s, Elapsed time: %s"
            % (processor_group,
               time.strftime("%H:%M:%S", time.gmtime(processor_elapsed_time))))
        logger.info(processor_time_message)
        processor_time_tracker_list.append(processor_time_message)

    elapsed_time = time.time() - start_time
    for time_item in processor_time_tracker_list:
        logger.info(time_item)
    logger.info('PreProcess finished. Elapsed time: %s',
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
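# A minimal sketch of the fan-out/join pattern run_preprocessor uses above:
# one multiprocessing.Process per processor in a group, started immediately,
# then joined as a group before the elapsed time is reported. run_one is an
# illustrative worker (the real target is processor.run_processor), and the
# plain p.join() loop stands in for Processor.wait_for_threads.
import multiprocessing

def run_one(name):
    """Illustrative worker body."""
    print("running", name)

if __name__ == "__main__":
    pool = []
    for name in ["INTERACTION-SOURCE-MOL", "INTERACTION-SOURCE-GEN"]:
        p = multiprocessing.Process(target=run_one, args=(name,))
        p.start()
        pool.append(p)
    for p in pool:
        p.join()  # block until every processor in the group finishes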