Example #1
    def run_loader(self):
        """Main function for running loader"""

        if self.args.verbose:
            self.logger.warning('DEBUG mode enabled!')
            time.sleep(3)

        data_manager = DataFileManager(self.context_info.config_file_location)
        file_transactor = FileTransactor()

        file_transactor.start_threads(
            data_manager.get_file_transactor_thread_settings())

        data_manager.download_and_validate()
        self.logger.debug("finished downloading now doing thread")

        file_transactor.check_for_thread_errors()
        self.logger.debug("finished threads waiting for queues")

        file_transactor.wait_for_queues()
        self.logger.debug("finished queues waiting for shutdown")
        file_transactor.shutdown()

        neo_transactor = Neo4jTransactor()
        neo_transactor.start_threads(
            data_manager.get_neo_transactor_thread_settings())

        self.logger.debug("finished starting neo threads ")

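        # Index creation is skipped when the load runs from pickled data.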
        if not self.context_info.env["USING_PICKLE"]:
            self.logger.info("Creating indices.")
            Neo4jHelper.create_indices()

        etl_time_tracker_list = self.run_etl_groups(self.logger, data_manager,
                                                    neo_transactor)

        neo_transactor.shutdown()

        elapsed_time = time.time() - self.start_time

        for time_item in etl_time_tracker_list:
            self.logger.info(time_item)

        self.logger.info('Loader finished. Elapsed time: %s' %
                         time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
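
For reference, a minimal self-contained sketch of the elapsed-time reporting pattern used above (the sleep is a stand-in for the actual load work):

    import time

    start_time = time.time()
    time.sleep(1)  # stand-in for the actual load work
    elapsed_time = time.time() - start_time
    print('Loader finished. Elapsed time: %s'
          % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
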
Example #2

    def _load_and_process_data(self):
        """Create the gene descriptions data manager and load common data."""
        context_info = ContextInfo()
        data_manager = DataFileManager(context_info.config_file_location)
        #go_onto_config = data_manager.get_config('GO')
        go_annot_config = data_manager.get_config('GAF')
        #do_onto_config = data_manager.get_config('DOID')
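        # Map each data provider (MOD) to its GAF sub-type config.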
        go_annot_sub_dict = {sub.get_data_provider(): sub for sub in go_annot_config.get_sub_type_objects()}
        this_dir = os.path.split(__file__)[0]
        gd_config = GenedescConfigParser(os.path.join(this_dir,
                                                      os.pardir,
                                                      os.pardir,
                                                      "gene_descriptions.yml"))
        gd_data_manager = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        gd_data_manager.set_ontology(ontology_type=DataType.GO,
                                     ontology=self.get_ontology(data_type=DataType.GO),
                                     config=gd_config)
        gd_data_manager.set_ontology(ontology_type=DataType.DO,
                                     ontology=self.get_ontology(data_type=DataType.DO),
                                     config=gd_config)
        # generate descriptions for each MOD
        for prvdr in [sub_type.get_data_provider().upper()
                      for sub_type in self.data_type_config.get_sub_type_objects()]:
            gd_config_mod_specific = copy.deepcopy(gd_config)
            if prvdr == "WB":
                gd_config_mod_specific.config["expression_sentences_options"][
                    "remove_children_if_parent_is_present"] = True
            self.logger.info("Generating gene descriptions for %s", prvdr)
            data_provider = prvdr if prvdr != "HUMAN" else "RGD"
            json_desc_writer = DescriptionsWriter()
            go_annot_path = "file://" + os.path.join(os.getcwd(),
                                                     "tmp",
                                                     go_annot_sub_dict[prvdr].file_to_download)
            gd_data_manager.load_associations_from_file(
                associations_type=DataType.GO, associations_url=go_annot_path,
                associations_cache_path=os.path.join(os.getcwd(),
                                                     "tmp",
                                                     "gd_cache",
                                                     "go_annot_" + prvdr + ".gaf"),
                config=gd_config_mod_specific)
            gd_data_manager.set_associations(associations_type=DataType.DO,
                                             associations=self.get_disease_annotations_from_db(
                                                 data_provider=data_provider,
                                                 gd_data_manager=gd_data_manager,
                                                 logger=self.logger),
                                             config=gd_config_mod_specific)
            if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
                gd_data_manager.set_ontology(ontology_type=DataType.EXPR,
                                             ontology=self.get_ontology(data_type=DataType.EXPR,
                                                                        provider=prvdr),
                                             config=gd_config_mod_specific)
                gd_data_manager.set_associations(
                    associations_type=DataType.EXPR,
                    associations=self.get_expression_annotations_from_db(data_provider=data_provider,
                                                                         gd_data_manager=gd_data_manager,
                                                                         logger=self.logger),
                    config=gd_config_mod_specific)
            commit_size = self.data_type_config.get_neo4j_commit_size()
            generators = self.get_generators(prvdr,
                                             gd_data_manager,
                                             gd_config_mod_specific,
                                             json_desc_writer)
            query_template_list = [
                [self.gene_descriptions_query_template, commit_size,
                 "genedescriptions_data_" + prvdr + ".csv"]
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
            self.save_descriptions_report_files(data_provider=prvdr,
                                                json_desc_writer=json_desc_writer,
                                                context_info=context_info,
                                                gd_data_manager=gd_data_manager)
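
The loop above deep-copies the shared config so that WB's sentence option never leaks into the other providers' runs. A minimal sketch of that pattern, with a plain dict standing in for GenedescConfigParser's config attribute and an illustrative provider list:

    import copy

    base_config = {"expression_sentences_options":
                   {"remove_children_if_parent_is_present": False}}

    for provider in ("WB", "ZFIN", "MGI"):
        mod_config = copy.deepcopy(base_config)
        if provider == "WB":
            mod_config["expression_sentences_options"][
                "remove_children_if_parent_is_present"] = True
        print(provider, mod_config["expression_sentences_options"])
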
Example #3
    def get_generators(self, variant_data, batch_size):  # noqa
        """Get Generators."""

        data_providers = []
        release = ""
        variants = []
        variant_genomic_locations = []
        variant_so_terms = []
        cross_references = []
        counter = 0
        date_produced = variant_data['metaData']['dateProduced']

        self.data_providers_process(variant_data)
        load_key = date_produced + self.data_provider + "_VARIATION"

        if 'release' in variant_data['metaData']:
            release = variant_data['metaData']['release']

        assemblies = {}
        for allele_record in variant_data['data']:
            chromosome = allele_record["chromosome"]
            if chromosome.startswith("chr"):
                chromosome_str = chromosome[3:]
            else:
                chromosome_str = chromosome

            assembly = allele_record["assembly"]

            if assembly not in assemblies:
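                # Load the assembly once and cache the helper for reuse across records.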
                self.logger.info(assembly)
                context_info = ContextInfo()
                data_manager = DataFileManager(
                    context_info.config_file_location)
                assemblies[assembly] = AssemblySequenceHelper(
                    assembly, data_manager)

            so_term_id = allele_record.get('type')
            genomic_reference_sequence = allele_record.get(
                'genomicReferenceSequence')
            genomic_variant_sequence = allele_record.get(
                'genomicVariantSequence')

            if genomic_reference_sequence == 'N/A':
                genomic_reference_sequence = ""
            if genomic_variant_sequence == 'N/A':
                genomic_variant_sequence = ""

            padding_left = ""
            padding_right = ""
            if allele_record.get('start') != "" and allele_record.get(
                    'end') != "":

                # not insertion
                if so_term_id != "SO:0000667" and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                    genomic_reference_sequence = assemblies[
                        assembly].get_sequence(chromosome_str,
                                               allele_record.get('start'),
                                               allele_record.get('end'))

                if allele_record.get('start') < allele_record.get('end'):
                    start = allele_record.get('start')
                    end = allele_record.get('end')
                else:
                    start = allele_record.get('end')
                    end = allele_record.get('start')

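                # Pull 500 bp of flanking sequence on each side of the variant.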
                padding_width = 500
                if so_term_id != "SO:0000667":  # not insertion
                    start = start - 1
                    end = end + 1

                left_padding_start = start - padding_width
                if left_padding_start < 1:
                    left_padding_start = 1

                padding_left = assemblies[assembly].get_sequence(
                    chromosome_str, left_padding_start, start)
                right_padding_end = end + padding_width
                padding_right = assemblies[assembly].get_sequence(
                    chromosome_str, end, right_padding_end)
            counter += 1
            global_id = allele_record.get('alleleId')
            mod_global_cross_ref_id = ""
            cross_references = []

            if self.test_object.using_test_data():
                if not self.test_object.check_for_test_id_entry(global_id):
                    counter -= 1
                    continue

            cross_ref_primary_id = allele_record.get(
                'sequenceOfReferenceAccessionNumber')
            # Guard before splitting: a missing accession number would
            # otherwise raise an AttributeError on split().
            if cross_ref_primary_id is not None:
                local_cross_ref_id = cross_ref_primary_id.split(":")[1]
                prefix = cross_ref_primary_id.split(":")[0]

                cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                    local_cross_ref_id, prefix, global_id)
                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix, "variant_sequence_of_reference",
                    "sequence_of_reference_accession_number", global_id,
                    cross_ref_complete_url,
                    cross_ref_primary_id + "variant_sequence_of_reference")

                xref_map['dataId'] = global_id
                cross_references.append(xref_map)

            if (genomic_reference_sequence is not None
                    and len(genomic_reference_sequence) > 1000
                    and allele_record.get('type') in ('SO:1000002',
                                                      'SO:1000008')):
                self.logger.debug("%s genomicReferenceSequence",
                                  allele_record.get('alleleId'))

            if (genomic_variant_sequence is not None
                    and len(genomic_variant_sequence) > 1000
                    and allele_record.get('type') in ('SO:1000002',
                                                      'SO:1000008')):
                self.logger.debug("%s genomicVariantSequence",
                                  allele_record.get('alleleId'))

            hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
                allele_record.get('sequenceOfReferenceAccessionNumber'),
                allele_record.get('type'), allele_record.get('start'),
                allele_record.get('end'),
                genomic_reference_sequence, genomic_variant_sequence,
                allele_record.get('assembly'), chromosome_str)

            if ((genomic_reference_sequence is not None
                 and len(genomic_reference_sequence) > 30000)
                    or (genomic_variant_sequence is not None
                        and len(genomic_variant_sequence) > 30000)):
                self.logger.debug(
                    "%s potentially has too long a sequence",
                    allele_record.get('alleleId'))

            # TODO: fix typo in MGI Submission for this variant so
            # that it doesn't list a 40K bp point mutation.
            if allele_record.get('alleleId') != 'MGI:6113870':

                variant_dataset = {
                    "hgvs_nomenclature": hgvs_nomenclature,
                    "genomicReferenceSequence": genomic_reference_sequence,
                    "genomicVariantSequence": genomic_variant_sequence,
                    "paddingLeft": padding_left,
                    "paddingRight": padding_right,
                    "alleleId": allele_record.get('alleleId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": load_key,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "dataProvider": self.data_provider,
                    "variantHGVSSynonym": hgvs_synonym
                }

                variant_genomic_location_dataset = {
                    "variantId": hgvs_nomenclature,
                    "assembly": allele_record.get('assembly'),
                    "chromosome": chromosome_str,
                    "start": allele_record.get('start'),
                    "end": allele_record.get('end'),
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": self.data_provider
                }

                variant_so_term = {
                    "variantId": hgvs_nomenclature,
                    "soTermId": allele_record.get('type')
                }

                variant_so_terms.append(variant_so_term)
                variant_genomic_locations.append(
                    variant_genomic_location_dataset)
                variants.append(variant_dataset)

            if counter == batch_size:
                counter = 0
                yield [
                    variants, variant_genomic_locations, variant_so_terms,
                    cross_references
                ]
                variants = []
                variant_genomic_locations = []
                variant_so_terms = []
                cross_references = []

        if counter > 0:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]
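
Stripped of the variant-specific bookkeeping, the batch-and-flush shape of get_generators() reduces to this self-contained sketch:

    def batch(records, batch_size):
        """Yield full batches of records, flushing any partial batch at the end."""
        chunk = []
        for record in records:
            chunk.append(record)
            if len(chunk) == batch_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk

    for group in batch(range(10), 4):
        print(group)  # [0, 1, 2, 3], then [4, 5, 6, 7], then [8, 9]
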
Example #4
    def run_preprocessor(self):
        """Main function for running the preprocessor."""

        if args.verbose:
            logger.warning('DEBUG mode enabled!')
            time.sleep(3)

        start_time = time.time()
        data_manager = DataFileManager(context_info.config_file_location)
        logger.info("config_file_location %s" %
                    (context_info.config_file_location))

        ft = FileTransactor()

        ft.start_threads(data_manager.get_FT_thread_settings())
        data_manager.download_and_validate()
        logger.info("finished downloading now doing thread")
        ft.check_for_thread_errors()
        logger.info("finished threads waiting for queues")
        ft.wait_for_queues()

        logger.info("finished queues waiting for shutdown")
        ft.shutdown()

        configs_dict = {
            'INTERACTION-SOURCE-MOL': ['INTERACTION-SOURCE', 'BGI'],
            'INTERACTION-SOURCE-GEN': ['INTERACTION-SOURCE', 'BGI']
        }

        config_dict = {
            'INTERACTION-SOURCE-MOL': 'INTERACTION-SOURCE',
            'INTERACTION-SOURCE-GEN': 'INTERACTION-SOURCE'
        }

        processor_dispatch = {
            'INTERACTION-SOURCE-MOL': InteractionMolecularProcessor,
            'INTERACTION-SOURCE-GEN': InteractionGeneticProcessor
        }

        list_of_processor_groups = [[
            'INTERACTION-SOURCE-MOL', 'INTERACTION-SOURCE-GEN'
        ]]

        processor_time_tracker_list = []

        for processor_group in list_of_processor_groups:
            processor_group_start_time = time.time()
            logger.info("Starting Processor group: %s" % processor_group)
            thread_pool = []
            for processor_name in processor_group:
                logger.info("Processor Name: %s" % processor_name)

                configs = []
                for config_type in configs_dict[processor_name]:
                    config = data_manager.get_config(config_type)
                    if config is not None:
                        configs.append(config)
                    else:
                        logger.info("No Config found for: %s %s" %
                                    (processor_name, config_type))

                if len(configs) > 0:
                    processor = processor_dispatch[processor_name](configs)
                    p = multiprocessing.Process(target=processor.run_processor)
                    p.start()
                    thread_pool.append(p)
                else:
                    logger.info("No Configs found for: %s" % processor_name)

            Processor.wait_for_threads(thread_pool)

            logger.info("Waiting for Queues to sync up")
            processor_elapsed_time = time.time() - processor_group_start_time
            processor_time_message = (
                "Finished Processor group: %s, Elapsed time: %s" %
                (processor_group,
                 time.strftime("%H:%M:%S",
                               time.gmtime(processor_elapsed_time))))
            logger.info(processor_time_message)
            processor_time_tracker_list.append(processor_time_message)

        end_time = time.time()
        elapsed_time = end_time - start_time

        for time_item in processor_time_tracker_list:
            logger.info(time_item)
        logger.info('PreProcess finished. Elapsed time: %s' %
                    time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))