def _load_and_process_data(self):
        """Generate gene descriptions for each data provider and load them.

        Common data (the GO and DO ontologies and the base
        ``gene_descriptions.yml`` configuration) is set up once. Then, for
        every sub type of ``self.data_type_config``, the method:

        * deep-copies the base configuration so provider-specific tweaks do
          not leak into other providers,
        * loads GO associations from the provider's GAF file under ``tmp/``
          and DO associations from the database,
        * sets the expression ontology and associations when the provider is
          listed in ``EXPRESSION_PRVD_SUBTYPE_MAP``,
        * writes the generated rows to a CSV file, executes the Neo4j query
          batch and saves the description report files.
        """
        context_info = ContextInfo()
        data_manager = DataFileManager(context_info.config_file_location)
        go_annot_config = data_manager.get_config('GAF')
        # NOTE(review): keys are stored exactly as returned by
        # get_data_provider(), but the lookup below uses the upper-cased
        # provider name -- confirm GAF providers are always upper case.
        go_annot_sub_dict = {
            sub.get_data_provider(): sub
            for sub in go_annot_config.get_sub_type_objects()
        }

        # gene_descriptions.yml lives two directories above this module.
        gd_config = GenedescConfigParser(
            os.path.join(os.path.dirname(__file__),
                         os.pardir,
                         os.pardir,
                         "gene_descriptions.yml"))

        gd_data_manager = DataManager(do_relations=None,
                                      go_relations=["subClassOf", "BFO:0000050"])
        for ontology_type in (DataType.GO, DataType.DO):
            gd_data_manager.set_ontology(
                ontology_type=ontology_type,
                ontology=self.get_ontology(data_type=ontology_type),
                config=gd_config)

        providers = [sub_type.get_data_provider().upper()
                     for sub_type in self.data_type_config.get_sub_type_objects()]
        # The commit size does not depend on the provider: look it up once.
        commit_size = self.data_type_config.get_neo4j_commit_size()

        # generate descriptions for each MOD
        for prvdr in providers:
            gd_config_mod_specific = copy.deepcopy(gd_config)
            if prvdr == "WB":
                expr_options = gd_config_mod_specific.config["expression_sentences_options"]
                expr_options["remove_children_if_parent_is_present"] = True
            self.logger.info("Generating gene descriptions for %s", prvdr)

            # Database annotations for HUMAN are queried under the RGD
            # provider; file names and reports keep using the original name.
            data_provider = prvdr if prvdr != "HUMAN" else "RGD"
            json_desc_writer = DescriptionsWriter()

            tmp_dir = os.path.join(os.getcwd(), "tmp")
            go_annot_path = "file://" + os.path.join(
                tmp_dir, go_annot_sub_dict[prvdr].file_to_download)
            gd_data_manager.load_associations_from_file(
                associations_type=DataType.GO,
                associations_url=go_annot_path,
                associations_cache_path=os.path.join(
                    tmp_dir, "gd_cache", "go_annot_" + prvdr + ".gaf"),
                config=gd_config_mod_specific)
            gd_data_manager.set_associations(
                associations_type=DataType.DO,
                associations=self.get_disease_annotations_from_db(
                    data_provider=data_provider,
                    gd_data_manager=gd_data_manager,
                    logger=self.logger),
                config=gd_config_mod_specific)

            # NOTE(review): gd_data_manager is shared across providers, so a
            # provider without expression data may still see the EXPR
            # ontology/associations set by a previous iteration -- confirm
            # this is intended.
            if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
                gd_data_manager.set_ontology(
                    ontology_type=DataType.EXPR,
                    ontology=self.get_ontology(data_type=DataType.EXPR,
                                               provider=prvdr),
                    config=gd_config_mod_specific)
                gd_data_manager.set_associations(
                    associations_type=DataType.EXPR,
                    associations=self.get_expression_annotations_from_db(
                        data_provider=data_provider,
                        gd_data_manager=gd_data_manager,
                        logger=self.logger),
                    config=gd_config_mod_specific)

            generators = self.get_generators(prvdr,
                                             gd_data_manager,
                                             gd_config_mod_specific,
                                             json_desc_writer)
            query_template_list = [
                [self.gene_descriptions_query_template, commit_size,
                 "genedescriptions_data_" + prvdr + ".csv"]
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
            self.save_descriptions_report_files(data_provider=prvdr,
                                                json_desc_writer=json_desc_writer,
                                                context_info=context_info,
                                                gd_data_manager=gd_data_manager)
Example #2
0
    def run_preprocessor(self):
        """Download the input files and run the interaction pre-processors.

        The method first starts the ``FileTransactor`` threads, triggers
        ``download_and_validate`` on the data file manager, checks the
        threads for errors, calls ``wait_for_queues`` and shuts the
        transactor down.

        It then walks the processor groups. Within a group, every processor
        that has at least one available config is started in its own
        ``multiprocessing.Process``; the group is awaited through
        ``Processor.wait_for_threads`` before the next group begins. The
        elapsed time of each group and of the whole run is logged.

        Relies on the module-level ``args``, ``logger`` and ``context_info``
        objects.
        """
        if args.verbose:
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning('DEBUG mode enabled!')
            time.sleep(3)

        start_time = time.time()
        data_manager = DataFileManager(context_info.config_file_location)
        logger.info("config_file_location %s",
                    context_info.config_file_location)

        ft = FileTransactor()

        ft.start_threads(data_manager.get_FT_thread_settings())
        data_manager.download_and_validate()
        logger.info("finished downloading now doing thread")
        ft.check_for_thread_errors()
        logger.info("finished threads waiting for queues")
        ft.wait_for_queues()

        logger.info("finished queues waiting for shutdown")
        ft.shutdown()

        # Config types each processor needs; a processor runs with whichever
        # of them the data manager can actually provide.
        configs_dict = {
            'INTERACTION-SOURCE-MOL': ['INTERACTION-SOURCE', 'BGI'],
            'INTERACTION-SOURCE-GEN': ['INTERACTION-SOURCE', 'BGI']
        }

        # Processor name -> class instantiated with the list of configs.
        processor_dispatch = {
            'INTERACTION-SOURCE-MOL': InteractionMolecularProcessor,
            'INTERACTION-SOURCE-GEN': InteractionGeneticProcessor
        }

        # Processors in the same inner list are started together; groups are
        # processed one after another.
        list_of_processor_groups = [[
            'INTERACTION-SOURCE-MOL', 'INTERACTION-SOURCE-GEN'
        ]]

        processor_time_tracker_list = []

        for processor_group in list_of_processor_groups:
            processor_group_start_time = time.time()
            logger.info("Starting Processor group: %s", processor_group)
            thread_pool = []
            for processor_name in processor_group:
                logger.info("Processor Name: %s", processor_name)

                configs = []
                for config_type in configs_dict[processor_name]:
                    config = data_manager.get_config(config_type)
                    if config is not None:
                        configs.append(config)
                    else:
                        logger.info("No Config found for: %s %s",
                                    processor_name, config_type)

                if configs:
                    processor = processor_dispatch[processor_name](configs)
                    p = multiprocessing.Process(target=processor.run_processor)
                    p.start()
                    thread_pool.append(p)
                else:
                    logger.info("No Configs found for: %s", processor_name)

            Processor.wait_for_threads(thread_pool)

            logger.info("Waiting for Queues to sync up")
            processor_elapsed_time = time.time() - processor_group_start_time
            # Formatted eagerly because the message is replayed in the final
            # summary. Note that %H:%M:%S wraps around after 24 hours.
            processor_time_message = (
                "Finished Processor group: %s, Elapsed time: %s" %
                (processor_group,
                 time.strftime("%H:%M:%S",
                               time.gmtime(processor_elapsed_time))))
            logger.info(processor_time_message)
            processor_time_tracker_list.append(processor_time_message)

        end_time = time.time()
        elapsed_time = end_time - start_time

        # Replay the per-group timings next to the overall total.
        for time_item in processor_time_tracker_list:
            logger.info(time_item)
        logger.info('PreProcess finished. Elapsed time: %s',
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))