def _load_and_process_data(self):
    """Generate gene descriptions for every configured MOD and load them into Neo4j.

    Builds a shared genedescriptions ``DataManager`` with the GO and DO
    ontologies, then delegates the per-provider work (annotation loading,
    description generation, CSV/Neo4j persistence, report files) to
    :meth:`_generate_descriptions_for_provider`.
    """
    # Create gene descriptions data manager and load data common to all MODs.
    context_info = ContextInfo()
    data_manager = DataFileManager(context_info.config_file_location)
    go_annot_config = data_manager.get_config('GAF')
    # Map each data provider key (e.g. "WB") to its GAF sub-type config object.
    go_annot_sub_dict = {sub.get_data_provider(): sub
                         for sub in go_annot_config.get_sub_type_objects()}
    this_dir = os.path.dirname(__file__)
    gd_config = GenedescConfigParser(os.path.join(this_dir, os.pardir, os.pardir,
                                                  "gene_descriptions.yml"))
    gd_data_manager = DataManager(do_relations=None,
                                  go_relations=["subClassOf", "BFO:0000050"])
    gd_data_manager.set_ontology(ontology_type=DataType.GO,
                                 ontology=self.get_ontology(data_type=DataType.GO),
                                 config=gd_config)
    gd_data_manager.set_ontology(ontology_type=DataType.DO,
                                 ontology=self.get_ontology(data_type=DataType.DO),
                                 config=gd_config)
    # Generate descriptions for each MOD configured for this ETL.
    for prvdr in [sub_type.get_data_provider().upper()
                  for sub_type in self.data_type_config.get_sub_type_objects()]:
        self._generate_descriptions_for_provider(
            prvdr=prvdr,
            context_info=context_info,
            gd_config=gd_config,
            gd_data_manager=gd_data_manager,
            go_annot_sub_dict=go_annot_sub_dict)

def _generate_descriptions_for_provider(self, prvdr, context_info, gd_config,
                                        gd_data_manager, go_annot_sub_dict):
    """Load annotations and generate/persist gene descriptions for one MOD.

    :param prvdr: upper-cased data-provider key (e.g. "WB", "HUMAN")
    :param context_info: shared ContextInfo for report-file output
    :param gd_config: base GenedescConfigParser; deep-copied per provider
    :param gd_data_manager: shared genedescriptions DataManager (GO/DO preloaded)
    :param go_annot_sub_dict: provider -> GAF sub-type config mapping
    """
    gd_config_mod_specific = copy.deepcopy(gd_config)
    if prvdr == "WB":
        # WB-specific option: drop child expression terms when the parent is present.
        gd_config_mod_specific.config["expression_sentences_options"][
            "remove_children_if_parent_is_present"] = True
    self.logger.info("Generating gene descriptions for %s", prvdr)
    # HUMAN annotations are sourced through RGD.
    data_provider = prvdr if prvdr != "HUMAN" else "RGD"
    json_desc_writer = DescriptionsWriter()
    # Loop-invariant paths under the working directory's tmp folder.
    tmp_dir = os.path.join(os.getcwd(), "tmp")
    go_annot_path = "file://" + os.path.join(
        tmp_dir, go_annot_sub_dict[prvdr].file_to_download)
    gd_data_manager.load_associations_from_file(
        associations_type=DataType.GO,
        associations_url=go_annot_path,
        associations_cache_path=os.path.join(tmp_dir, "gd_cache",
                                             "go_annot_" + prvdr + ".gaf"),
        config=gd_config_mod_specific)
    gd_data_manager.set_associations(
        associations_type=DataType.DO,
        associations=self.get_disease_annotations_from_db(
            data_provider=data_provider,
            gd_data_manager=gd_data_manager,
            logger=self.logger),
        config=gd_config_mod_specific)
    if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
        # Only providers with expression data get the EXPR ontology/associations.
        gd_data_manager.set_ontology(
            ontology_type=DataType.EXPR,
            ontology=self.get_ontology(data_type=DataType.EXPR, provider=prvdr),
            config=gd_config_mod_specific)
        gd_data_manager.set_associations(
            associations_type=DataType.EXPR,
            associations=self.get_expression_annotations_from_db(
                data_provider=data_provider,
                gd_data_manager=gd_data_manager,
                logger=self.logger),
            config=gd_config_mod_specific)
    commit_size = self.data_type_config.get_neo4j_commit_size()
    generators = self.get_generators(prvdr, gd_data_manager,
                                     gd_config_mod_specific, json_desc_writer)
    query_template_list = [
        [self.gene_descriptions_query_template, commit_size,
         "genedescriptions_data_" + prvdr + ".csv"]
    ]
    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)
    Neo4jTransactor.execute_query_batch(query_and_file_list)
    self.save_descriptions_report_files(data_provider=prvdr,
                                        json_desc_writer=json_desc_writer,
                                        context_info=context_info,
                                        gd_data_manager=gd_data_manager)
def run_preprocessor(self):
    """Download/validate source files, then run the interaction pre-processors.

    Starts FileTransactor download threads, waits for them to drain, then runs
    each configured interaction processor group in parallel via
    ``multiprocessing``, logging per-group elapsed times and a final summary.

    NOTE(review): reads module-level ``args``, ``logger`` and ``context_info``
    defined elsewhere in this file — confirm they exist at call time.
    """
    if args.verbose:
        # logger.warn is deprecated; warning() is the supported spelling.
        logger.warning('DEBUG mode enabled!')
        time.sleep(3)

    start_time = time.time()

    data_manager = DataFileManager(context_info.config_file_location)
    # Lazy %-args let logging skip formatting when the level is disabled.
    logger.info("config_file_location %s", context_info.config_file_location)

    ft = FileTransactor()
    ft.start_threads(data_manager.get_FT_thread_settings())
    data_manager.download_and_validate()
    logger.info("finished downloading now doing thread")
    ft.check_for_thread_errors()
    logger.info("finished threads waiting for queues")
    ft.wait_for_queues()
    logger.info("finished queues waiting for shutdown")
    ft.shutdown()

    # Config types each processor needs; missing ones are logged and skipped.
    configs_dict = {
        'INTERACTION-SOURCE-MOL': ['INTERACTION-SOURCE', 'BGI'],
        'INTERACTION-SOURCE-GEN': ['INTERACTION-SOURCE', 'BGI']
    }
    processor_dispatch = {
        'INTERACTION-SOURCE-MOL': InteractionMolecularProcessor,
        'INTERACTION-SOURCE-GEN': InteractionGeneticProcessor
    }
    # Groups run sequentially; processors within a group run in parallel.
    list_of_processor_groups = [[
        'INTERACTION-SOURCE-MOL',
        'INTERACTION-SOURCE-GEN'
    ]]

    processor_time_tracker_list = []
    for processor_group in list_of_processor_groups:
        processor_group_start_time = time.time()
        logger.info("Starting Processor group: %s", processor_group)
        thread_pool = []
        for processor_name in processor_group:
            logger.info("Processor Name: %s", processor_name)
            configs = []
            for config_type in configs_dict[processor_name]:
                config = data_manager.get_config(config_type)
                if config is not None:
                    configs.append(config)
                else:
                    logger.info("No Config found for: %s %s",
                                processor_name, config_type)
            if configs:
                processor = processor_dispatch[processor_name](configs)
                p = multiprocessing.Process(target=processor.run_processor)
                p.start()
                thread_pool.append(p)
            else:
                logger.info("No Configs found for: %s", processor_name)
        Processor.wait_for_threads(thread_pool)
        logger.info("Waiting for Queues to sync up")
        processor_elapsed_time = time.time() - processor_group_start_time
        processor_time_message = (
            "Finished Processor group: %s, Elapsed time: %s"
            % (processor_group,
               time.strftime("%H:%M:%S", time.gmtime(processor_elapsed_time))))
        logger.info(processor_time_message)
        processor_time_tracker_list.append(processor_time_message)

    elapsed_time = time.time() - start_time
    for time_item in processor_time_tracker_list:
        logger.info(time_item)
    logger.info('PreProcess finished. Elapsed time: %s',
                time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))