Esempio n. 1
0
    def _process_sub_type(self, sub_type):
        """Load phenotype data for a single sub type into Neo4j.

        Reads the sub type's JSON file, builds the gene/allele/AGM and
        PGES CSV query templates, runs the generators through the CSV
        transactor, and executes the resulting query batch.
        """
        self.logger.info("Loading Phenotype Data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Phenotype Data: %s", sub_type.get_data_provider())
        if data is None:
            self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        # FIX: batch_size previously (and incorrectly) reused
        # get_neo4j_commit_size(); it must come from the generator batch
        # size setting, consistent with every other loader in this file.
        batch_size = self.data_type_config.get_generator_batch_size()
        data_provider = sub_type.get_data_provider()
        self.logger.info("subtype: " + data_provider)

        # Each entry must be (template, commit size, output CSV name);
        # the order matches the lists yielded by get_generators.
        query_template_list = [
                [self.execute_gene_query_template, commit_size,
                 "phenotype_gene_data_" + sub_type.get_data_provider() + ".csv"],
                [self.execute_allele_query_template, commit_size,
                 "phenotype_allele_data_" + sub_type.get_data_provider() + ".csv"],
                [self.execute_agm_query_template, commit_size,
                 "phenotype_agm_data_" + sub_type.get_data_provider() + ".csv"],
                [self.execute_pges_allele_query_template, commit_size,
                 "phenotype_pges_allele_data_" + sub_type.get_data_provider() + ".csv"],
                [self.execute_pges_agm_query_template, commit_size,
                 "phenotype_pges_agm_data_" + sub_type.get_data_provider() + ".csv"]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Phenotype-{}: ".format(sub_type.get_data_provider()))
    def _process_sub_type(self, sub_type):
        """Load transcript, exon, and genomic-location data for one sub type."""
        self.logger.info("Loading Transcript Data: %s", sub_type.get_data_provider())
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()
        filepath = sub_type.get_filepath()

        provider = sub_type.get_data_provider()

        # This needs to be in this format (template, param1, params2) others will be ignored
        template_prefix_pairs = [
            (self.transcript_alternate_id_query_template, "transcript_gff3ID_data_"),
            (self.transcript_query_template, "transcript_data_"),
            (self.chromosomes_query_template, "transcript_data_chromosome_"),
            (self.genomic_locations_query_template, "transcript_genomic_locations_"),
            (self.exon_query_template, "exon_data_"),
            (self.exon_genomic_locations_template, "exon_genomic_location_data_"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in template_prefix_pairs
        ]

        # Row generator over the source file.
        generators = self.get_generators(filepath, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Esempio n. 3
0
    def _process_sub_type(self, sub_type, query_tracking_list):
        """Load GO annotation data for one sub type.

        Opens the downloaded annotation file, runs it through the
        generators, writes the resulting CSV, and appends the query/file
        pairs to *query_tracking_list* for later batched execution by the
        caller.
        """
        self.logger.info("Loading GOAnnot Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_file_to_download()
        filepath = os.path.join('tmp/', filepath)
        self.logger.info("goannot path: %s", filepath)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # FIX: the file handle was previously opened and never closed; a
        # context manager guarantees release even if an exception occurs.
        # save_file_static consumes the generators before this block exits,
        # so closing the file afterwards is safe — TODO confirm nothing
        # downstream re-reads the handle.
        with open(filepath, "r") as file:
            self.logger.info("Finished Loading GOAnnot Data: %s",
                             sub_type.get_data_provider())

            # This order is the same as the lists yielded from the
            # get_generators function. A list of tuples.
            generators = self.get_generators(
                    file,
                    ETLHelper.go_annot_prefix_lookup(sub_type.get_data_provider()),
                    batch_size)

            query_template_list = [
                [
                    self.main_query_template, commit_size,
                    "go_annot_" + sub_type.get_data_provider() + ".csv"
                ],
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)

        # Queries are executed later by the caller, not here.
        for item in query_and_file_list:
            query_tracking_list.append(item)
Esempio n. 4
0
    def _process_sub_type(self, sub_type):
        """Build and run the is_a/part_of closure queries for one provider."""
        data_provider = sub_type.get_data_provider()
        self.logger.info(data_provider)
        # The Disease Ontology is keyed as 'DO' in the graph, not 'DOID'.
        if data_provider == 'DOID':
            data_provider = 'DO'

        self.logger.debug("Starting isa_partof_ Closure for: %s",
                          data_provider)

        generators = self.get_closure_terms(data_provider)

        # (template, commit size, csv, provider, provider) — extra entries
        # beyond the first three are query parameters.
        closure_queries = [
            [
                self.insert_isa_partof_closure_query_template, "100000",
                "isa_partof_closure_" + data_provider + ".csv", data_provider,
                data_provider
            ],
        ]

        query_and_file_list = self.process_query_params(closure_queries)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

        self.error_messages("Closure-{}: ".format(data_provider))
        self.logger.debug("Finished isa_partof Closure for: %s", data_provider)
Esempio n. 5
0
    def _process_sub_type(self, sub_type):
        """Load ECO mapping ontology data for one sub type into Neo4j."""
        provider = sub_type.get_data_provider()
        self.logger.info("Loading ECOMAP Ontology Data: %s",
                         provider)

        filepath = sub_type.get_filepath()
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        eco_queries = [
            [self.eco_query_template, commit_size,
             "ecomap_data_" + provider + ".csv"],
        ]

        # Row generator over the source file.
        generators = self.get_generators(filepath, batch_size)

        query_and_file_list = self.process_query_params(eco_queries)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

        self.error_messages("Ecomap-{}: ".format(provider))
        self.logger.info("Finished Loading ECOMAP Data: %s",
                         provider)
Esempio n. 6
0
    def _load_and_process_data(self):
        """Load Disease Ontology terms, relations, synonyms, and xrefs."""
        filepath = self.data_type_config.get_single_filepath()
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(filepath, batch_size)

        # (query template, output CSV) pairs, in generator-yield order.
        template_file_pairs = [
            (self.do_query_template, "do_term_data.csv"),
            (self.doterm_isas_query_template, "do_isas_data.csv"),
            (self.doterm_synonyms_query_template, "do_synonyms_data.csv"),
            (self.xrefs_query_template, "do_xrefs_data.csv"),
            (self.doterm_alt_ids_query_template, "do_alt_ids_data.csv"),
        ]
        query_template_list = [
            [template, commit_size, filename]
            for template, filename in template_file_pairs
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("DO-?: ")
    def _load_and_process_data(self):
        """Load GEO cross-reference data for every configured sub type.

        For each sub type, looks up its species name, URL-encodes it for
        the GEO query, and writes/executes the xref CSV batch.
        """
        # FIX: these configuration reads are loop-invariant; hoisted out of
        # the per-sub-type loop instead of being re-evaluated each pass.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        # batch_size = self.data_type_config.get_generator_batch_size()
        batch_size = 100000

        for sub_type in self.data_type_config.get_sub_type_objects():

            # Species name must be URL-encoded for the GEO request.
            species_encoded = urllib.parse.quote_plus(
                ETLHelper.species_lookup_by_data_provider(
                    sub_type.get_data_provider()))

            generators = self.get_generators(sub_type, batch_size,
                                             species_encoded)

            query_template_list = [
                [
                    self.geo_xref_query_template, commit_size,
                    "geo_xref_data_" + sub_type.get_data_provider() + ".csv"
                ],
            ]

            query_and_file_list = self.process_query_params(
                query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
    def _process_sub_type(self, sub_type):
        """Load AGM data (plus components, backgrounds, STRs) for a sub type.

        NOTE(review): the log messages refer to "Sequence Targeting Reagent"
        while every query template loads AGM data — confirm the message text
        against the project's intent before changing it.
        """
        self.logger.info("Loading Sequence Targeting Reagent Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        self.logger.info(filepath)
        data = JSONFile().get_data(filepath)
        self.logger.info(
            "Finished Loading Sequence Targeting Reagent Data: %s",
            sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        provider = sub_type.get_data_provider()

        # This order is the same as the lists yielded from the get_generators function.
        # This needs to be in this format (template, param1, params2) others will be ignored
        template_prefix_pairs = [
            (self.agm_query_template, "agm_data_"),
            (self.agm_secondary_ids_query_template, "agm_secondary_ids_"),
            (self.agm_synonyms_query_template, "agm_synonyms_"),
            (self.agm_components_query_template, "agm_components_"),
            (self.agm_sqtrs_query_template, "agm_sqtrs_"),
            (self.agm_backgrounds_query_template, "agm_backgrounds_"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in template_prefix_pairs
        ]

        # Obtain the generator of row batches.
        generators = self.get_generators(data, provider, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("AGM-{}: ".format(provider))
Esempio n. 9
0
    def _load_and_process_data(self):
        """Load molecular interaction (MI) ontology terms into Neo4j."""
        source_path = self.data_type_config.get_single_filepath()
        generators = self.get_generators(source_path)

        query_template_list = [
            [self.main_query_template, 10000, "mi_term_data.csv"],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Esempio n. 10
0
    def _load_and_process_data(self):
        """Load the Alliance species list from the agr_schemas repository."""
        # Species metadata is fetched directly from the schema repo on GitHub.
        species_url = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/master/ingest/species/species.yaml'
        generators = self.get_generators(species_url)

        query_template_list = [
            [self.main_query_template, 10000, "species_data.csv"],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Esempio n. 11
0
    def _process_sub_type(self, sub_type):
        """Load HTP metadata (dataset) records for a single sub type.

        Reads the sub type's JSON file, builds the dataset, category-tag,
        publication, xref, and secondary-id CSVs, and executes the batch.
        """
        # FIX: logger calls now pass lazy %-style arguments instead of eager
        # string interpolation, and the deprecated Logger.warn alias is
        # replaced with Logger.warning.
        logger.info("Loading HTP metadata Data: %s",
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata Data: %s",
                    sub_type.get_data_provider())

        if data is None:
            logger.warning("No Data found for %s skipping",
                           sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [
                HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
                "htp_metadataset_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_category_tags_query_template,
                commit_size,
                "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
                "htp_metadataset_publications_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
                "htp_metadataset_xrefs_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
                "htp_metadataset_secondaryIds_" +
                sub_type.get_data_provider() + ".csv"
            ],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
    def _load_and_process_data(self):
        """Create ribbonless expression-BioEntity (EBE) ribbon relationships."""
        self.logger.info("Starting Expression Ribbon Data")

        ribbon_queries = [
            [
                self.insert_ribonless_ebes_query_template, "30000",
                "expression_ribbonless_ebes.csv"
            ],
        ]

        generators = self.get_ribbon_terms()

        query_and_file_list = self.process_query_params(ribbon_queries)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

        self.logger.info("Finished Expression Ribbon Data")
Esempio n. 13
0
    def _process_sub_type(self, sub_type):
        """Load a generic ontology: terms, is_a, part_of, synonyms, alt ids."""
        self.logger.info("Loading Generic Ontology Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        ont_type = sub_type.get_data_provider()

        # This needs to be in this format (template, param1, params2) others will be ignored
        # Terms (600000) and synonyms (400000) use hand-tuned commit sizes.
        query_template_list = [
            [self.generic_ontology_term_query_template, 600000,
             "generic_ontology_term_" + ont_type + ".csv", ont_type],
            [self.generic_ontology_isas_query_template, commit_size,
             "generic_ontology_isas_" + ont_type + ".csv", ont_type,
             ont_type],
            [self.generic_ontology_partofs_query_template, commit_size,
             "generic_ontology_partofs_" + ont_type + ".csv", ont_type,
             ont_type],
            [self.generic_ontology_synonyms_query_template, 400000,
             "generic_ontology_synonyms_" + ont_type + ".csv", ont_type],
            [self.generic_ontology_altids_query_template, commit_size,
             "generic_ontology_altids_" + ont_type + ".csv", ont_type],
        ]

        # Row generator over the ontology file.
        generators = self.get_generators(filepath, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

        self.logger.info("Finished Loading Generic Ontology Data: %s",
                         sub_type.get_data_provider())
Esempio n. 14
0
    def _process_sub_type(self, sub_type, sub_types, query_tracking_list):
        """Load orthology data for one MOD against every other MOD.

        Resulting query/file pairs are appended to *query_tracking_list*
        for execution by the caller.
        """
        provider = sub_type.get_data_provider()
        self.logger.info("Loading Orthology Data: %s",
                         provider)
        filepath = sub_type.get_filepath()

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(filepath,
                                         provider,
                                         sub_types, batch_size)

        # One pairwise orthology query per partner MOD (self excluded).
        query_template_list = [
            [self.main_query_template, "100000",
             "orthology_data_" + provider + "_" + mod_sub_type + ".csv"]
            for mod_sub_type in sub_types
            if mod_sub_type != provider
        ]

        # Algorithm matched / not-matched / not-called bookkeeping queries.
        query_template_list.append([
            self.matched_algorithm_query_template, commit_size,
            "orthology_matched_algorithm_data_{}.csv".format(provider)
        ])
        query_template_list.append([
            self.not_matched_algorithm_query_template, commit_size,
            "orthology_not_matched_algorithm_data_" + provider + ".csv"
        ])
        query_template_list.append([
            self.not_called_algorithm_query_template, commit_size,
            "orthology_not_called_algorithm_data_" + provider + ".csv"
        ])

        query_and_file_list = self.process_query_params(query_template_list)

        CSVTransactor.save_file_static(generators, query_and_file_list)

        for item in query_and_file_list:
            query_tracking_list.append(item)

        self.error_messages("Ortho-{}: ".format(provider))
        self.logger.info("Finished Loading Orthology Data: %s",
                         provider)
Esempio n. 15
0
    def _process_sub_type(self, sub_type):
        """Load stub data from the sub type's single filepath into Neo4j."""
        stub_path = sub_type.get_single_filepath()

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        stub_queries = [
            [self.query_template, commit_size, "stub_data.csv"],
        ]

        generators = self.get_generators(stub_path, batch_size)

        query_and_file_list = self.process_query_params(stub_queries)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
Esempio n. 16
0
    def _process_sub_type(self, sub_type):
        """Load variation data (variants, locations, SO terms, xrefs)."""
        self.logger.info("Loading Variation Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Variation Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        provider = sub_type.get_data_provider()

        # This order is the same as the lists yielded from the get_generators function.
        # This needs to be in this format (template, param1, params2) others will be ignored
        template_prefix_pairs = [
            (self.variation_query_template, "variation_data_"),
            (self.genomic_locations_query_template, "variant_genomiclocations_"),
            (self.so_terms_query_template, "variant_so_terms_"),
            (self.xrefs_query_template, "variant_xrefs_"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in template_prefix_pairs
        ]

        generators = self.get_generators(data, batch_size)
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Var-{}: ".format(provider))
Esempio n. 17
0
    def _process_sub_type(self, sub_type):
        """Load VEP transcript-consequence data for one sub type."""
        self.logger.info("Loading VEP Data: %s", sub_type.get_data_provider())
        commit_size = self.data_type_config.get_neo4j_commit_size()
        filepath = sub_type.get_filepath()

        # This needs to be in this format (template, param1, params2) others will be ignored
        vep_queries = [
            [
                self.vep_transcript_query_template, commit_size,
                "vep_transcript_data_" + sub_type.get_data_provider() + ".csv"
            ],
        ]

        # Row generator over the VEP output file.
        generators = self.get_generators(filepath)

        query_and_file_list = self.process_query_params(vep_queries)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
    def _process_sub_type(self, subtype):
        """Create gene-disease annotations inferred through orthology."""
        self.logger.info("Starting Gene Disease Ortho Data: %s", subtype)

        ortho_queries = [
            [
                self.insert_gene_disease_ortho_query_template, "10000",
                "gene_disease_by_orthology.csv"
            ],
        ]

        self.logger.info("gene disease ortho pub created")

        generators = self.retrieve_gene_disease_ortho()

        query_and_file_list = self.process_query_params(ortho_queries)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

        self.logger.info("Finished Gene Disease Ortho Data")
Esempio n. 19
0
    def _process_sub_type(self, sub_type):
        """Load disease annotation data (gene/allele/AGM, PGES, withs, xrefs)."""
        self.logger.info("Loading Disease Data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Disease Data: %s", sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        provider = sub_type.get_data_provider()

        # This needs to be in this format (template, param1, params2) others will be ignored
        template_prefix_pairs = [
            (self.execute_allele_query_template, "disease_allele_data_"),
            (self.execute_gene_query_template, "disease_gene_data_"),
            (self.execute_agms_query_template, "disease_agms_data_"),
            (self.execute_pges_gene_query_template, "disease_pges_gene_data_"),
            (self.execute_pges_allele_query_template, "disease_pges_allele_data_"),
            (self.execute_pges_agm_query_template, "disease_pges_agms_data_"),
            (self.execute_withs_query_template, "disease_withs_data_"),
            (self.execute_ecode_query_template, "disease_evidence_code_data_"),
            (self.execute_annotation_xrefs_query_template, "disease_annotation_xrefs_data_"),
        ]
        query_template_list = [
            [template, commit_size, prefix + provider + ".csv"]
            for template, prefix in template_prefix_pairs
        ]

        # Row generator over the parsed JSON payload.
        generators = self.get_generators(data, batch_size, provider)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Disease-{}: ".format(provider))
        self.logger.info("Finished Loading Disease Data: %s", provider)
Esempio n. 20
0
    def _load_and_process_data(self):
        """Load GO terms plus their relations, synonyms, and secondary ids."""
        filepath = self.data_type_config.get_single_filepath()
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(filepath, batch_size)

        # (query template, output CSV) pairs, in generator-yield order.
        template_file_pairs = [
            (self.main_query_template, "go_term_data.csv"),
            (self.goterm_isas_query_template, "go_isas_data.csv"),
            (self.goterm_partofs_query_template, "go_partofs_data.csv"),
            (self.goterm_synonyms_query_template, "go_synonym_data.csv"),
            (self.goterm_regulates_query_template, "go_regulates_data.csv"),
            (self.goterm_negatively_regulates_query_template,
             "go_negatively_regulates_data.csv"),
            (self.goterm_positively_regulates_query_template,
             "go_positively_regulates_data.csv"),
            (self.goterm_secondary_query_template,
             "goterm_secondary_data.csv"),
        ]
        query_template_list = [
            [template, commit_size, filename]
            for template, filename in template_file_pairs
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages()
    def _load_and_process_data(self):
        """Load molecular interaction data plus its xrefs into Neo4j."""
        # filepath = self.data_type_config.get_single_filepath()
        # Temporary fix for 3.0 release.
        interactions_path = 'tmp/alliance_molecular_interactions.tsv'

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(interactions_path, batch_size)

        # (query template, output CSV) pairs, in generator-yield order.
        template_file_pairs = [
            (self.main_query_template, "mol_int_data.csv"),
            (self.xref_query_template, "mol_int_xref.csv"),
            (self.mod_xref_query_template, "mol_int_mod_xref.csv"),
        ]
        query_template_list = [
            [template, commit_size, filename]
            for template, filename in template_file_pairs
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
    def _process_sub_type(self, sub_type, ensg_to_gene_primary_id_map):
        """Attach Expression Atlas cross-references for one data provider."""
        data_provider = sub_type.get_data_provider()
        expression_atlas_gene_pages = self._get_expression_atlas_gene_pages(
            sub_type, data_provider, ensg_to_gene_primary_id_map)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(expression_atlas_gene_pages,
                                         data_provider, batch_size)

        atlas_queries = [
            [
                self.add_expression_atlas_crossreferences_query_template,
                commit_size, "expression_atlas_" + data_provider + "_data.csv"
            ],
        ]

        query_and_file_list = self.process_query_params(atlas_queries)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("ExpAtlas-{}: ".format(
            sub_type.get_data_provider()))
Esempio n. 23
0
    def _process_sub_type(self, sub_type):
        """Load allele data for a single sub type into Neo4j.

        Reads the sub type's JSON file, builds the four allele/gene/construct
        combination CSVs plus secondary-id, synonym, and xref CSVs, and
        executes the resulting query batch.
        """
        # FIX: logger calls now pass lazy %-style arguments instead of eager
        # string interpolation, and the deprecated Logger.warn alias is
        # replaced with Logger.warning.
        logger.info("Loading Allele Data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)

        if data is None:
            logger.warning("No Data found for %s skipping",
                           sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [
                AlleleETL.allele_gene_no_construct_query_template, commit_size,
                "allele_gene_no_construct_data_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_construct_gene_query_template, commit_size,
                "allele_construct_gene_data_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                AlleleETL.allele_construct_no_gene_query_template, commit_size,
                "allele_construct_no_gene_data_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_no_gene_no_construct_query_template,
                commit_size, "allele_no_gene_no_construct_data_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_secondaryids_template, commit_size,
                "allele_secondaryids_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_synonyms_template, commit_size,
                "allele_synonyms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                AlleleETL.allele_xrefs_template, commit_size,
                "allele_xrefs_" + sub_type.get_data_provider() + ".csv"
            ],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Allele-{}: ".format(sub_type.get_data_provider()))
        logger.info("Finished Loading Allele Data: %s",
                    sub_type.get_data_provider())
    def _process_sub_type(self, sub_type):
        """Load HTP metadata dataset-sample data for a single sub type.

        Reads the sub type's JSON file, builds the (query template,
        commit size, CSV filename) list, generates the CSV files via
        get_generators(), and executes the Neo4j query batch.
        """

        logger.info("Loading HTP metadata sample data: %s",
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata sample data: %s",
                    sub_type.get_data_provider())

        if data is None:
            # Logger.warn() is deprecated; use warning() with lazy %-args.
            logger.warning("No Data found for %s skipping",
                           sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()
        data_provider = sub_type.get_data_provider()

        # (template, CSV filename prefix) pairs, in the exact order the
        # generators yield their lists.  Expanded below into the
        # (template, param1, param2) shape process_query_params expects;
        # extra params would be ignored.
        template_prefix_pairs = [
            (HTPMetaDatasetSampleETL.htp_dataset_sample_query_template,
             "htp_metadataset_sample_samples_"),
            (HTPMetaDatasetSampleETL.htp_bio_entity_expression_query_template,
             "htp_metadataset_sample_bioentities_"),
            (HTPMetaDatasetSampleETL.htp_secondaryIds_query_template,
             "htp_metadataset_sample_secondaryIds_"),
            (HTPMetaDatasetSampleETL.htp_dataset_join_query_template,
             "htp_metadataset_sample_datasets_"),
            (HTPMetaDatasetSampleETL.htp_stages_query_template,
             "htp_metadataset_sample_stages_"),
            (HTPMetaDatasetSampleETL.ao_terms_query_template,
             "htp_metadataset_sample_aoterms_"),
            (HTPMetaDatasetSampleETL.ao_substructures_query_template,
             "htp_metadataset_sample_ao_substructures_"),
            (HTPMetaDatasetSampleETL.ao_qualifiers_query_template,
             "htp_metadataset_sample_ao_qualifiers_"),
            (HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template,
             "htp_metadataset_sample_ao_ss_qualifiers_"),
            # NOTE(review): prefix below has no trailing underscore, unlike
            # its siblings — kept as-is since downstream may match the name.
            (HTPMetaDatasetSampleETL.cc_term_query_template,
             "htp_metadataset_sample_ccterms"),
            (HTPMetaDatasetSampleETL.ccq_expression_query_template,
             "htp_metadataset_sample_ccqterms_"),
            (HTPMetaDatasetSampleETL.uberon_ao_query_template,
             "htp_metadataset_sample_uberon_ao_"),
            (HTPMetaDatasetSampleETL.uberon_ao_other_query_template,
             "htp_metadataset_sample_uberon_ao_other_"),
            (HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template,
             "htp_metadataset_sample_agms_"),
            (HTPMetaDatasetSampleETL.htp_dataset_sample_agmtext_query_template,
             "htp_metadataset_sample_agmstext_"),
            (HTPMetaDatasetSampleETL.htp_dataset_sample_assemblies_query_template,
             "htp_metadataset_sample_assemblies_"),
        ]
        query_list = [
            [template, commit_size, prefix + data_provider + ".csv"]
            for template, prefix in template_prefix_pairs
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        # NOTE(review): unlike sibling loaders, there is no
        # self.error_messages(...) call here — confirm that is intentional.
    def _load_and_process_data(self):
        """Generate and load automated gene descriptions for each MOD.

        Builds a genedescriptions DataManager with the GO and DO ontologies,
        then for every configured data provider: loads GO annotations from
        the downloaded GAF file, pulls DO (and, where available, expression)
        annotations from the database, runs the description generators,
        writes the results to CSV/Neo4j, and saves the report files.
        """
        # create gene descriptions data manager and load common data
        context_info = ContextInfo()
        data_manager = DataFileManager(context_info.config_file_location)
        #go_onto_config = data_manager.get_config('GO')
        go_annot_config = data_manager.get_config('GAF')
        #do_onto_config = data_manager.get_config('DOID')
        # Map data-provider name -> GAF sub-type config, used below to find
        # each provider's downloaded annotation file.
        go_annot_sub_dict = {sub.get_data_provider(): sub for sub in go_annot_config.get_sub_type_objects()}
        this_dir = os.path.split(__file__)[0]
        gd_config = GenedescConfigParser(os.path.join(this_dir,
                                                      os.pardir,
                                                      os.pardir,
                                                      "gene_descriptions.yml"))
        gd_data_manager = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        gd_data_manager.set_ontology(ontology_type=DataType.GO,
                                     ontology=self.get_ontology(data_type=DataType.GO),
                                     config=gd_config)
        gd_data_manager.set_ontology(ontology_type=DataType.DO,
                                     ontology=self.get_ontology(data_type=DataType.DO),
                                     config=gd_config)
        # generate descriptions for each MOD
        for prvdr in [sub_type.get_data_provider().upper() \
                      for sub_type in self.data_type_config.get_sub_type_objects()]:
            # Deep-copy the config so per-MOD tweaks don't leak across
            # iterations.
            gd_config_mod_specific = copy.deepcopy(gd_config)
            if prvdr == "WB":
                gd_config_mod_specific.config["expression_sentences_options"][
                    "remove_children_if_parent_is_present"] = True
            self.logger.info("Generating gene descriptions for %s", prvdr)
            # HUMAN disease/expression annotations are queried under the RGD
            # provider name in the database.
            data_provider = prvdr if prvdr != "HUMAN" else "RGD"
            json_desc_writer = DescriptionsWriter()
            go_annot_path = "file://" + os.path.join(os.getcwd(),
                                                     "tmp",
                                                     go_annot_sub_dict[prvdr].file_to_download)
            gd_data_manager.load_associations_from_file(
                associations_type=DataType.GO, associations_url=go_annot_path,
                associations_cache_path=os.path.join(os.getcwd(),
                                                     "tmp",
                                                     "gd_cache",
                                                     "go_annot_" + prvdr + ".gaf"),
                config=gd_config_mod_specific)
            gd_data_manager.set_associations(associations_type=DataType.DO,
                                             associations=self.get_disease_annotations_from_db(
                                                 data_provider=data_provider,
                                                 gd_data_manager=gd_data_manager,
                                                 logger=self.logger),
                                             config=gd_config_mod_specific)
            # Expression-based sentences only exist for a subset of providers.
            if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
                gd_data_manager.set_ontology(ontology_type=DataType.EXPR,
                                             ontology=self.get_ontology(data_type=DataType.EXPR,
                                                                        provider=prvdr),
                                             config=gd_config_mod_specific)
                gd_data_manager.set_associations(
                    associations_type=DataType.EXPR,
                    associations=self.get_expression_annotations_from_db(data_provider=data_provider,
                                                                         gd_data_manager=gd_data_manager,
                                                                         logger=self.logger),
                    config=gd_config_mod_specific)
            commit_size = self.data_type_config.get_neo4j_commit_size()
            generators = self.get_generators(prvdr,
                                             gd_data_manager,
                                             gd_config_mod_specific,
                                             json_desc_writer)
            query_template_list = [
                [self.gene_descriptions_query_template, commit_size,
                 "genedescriptions_data_" + prvdr + ".csv"]
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
            self.save_descriptions_report_files(data_provider=prvdr,
                                                json_desc_writer=json_desc_writer,
                                                context_info=context_info,
                                                gd_data_manager=gd_data_manager)
Esempio n. 26
0
    def _process_sub_type(self, sub_type, query_tracking_list):
        """Load BGI (basic gene information) data for a single sub type.

        Reads the sub type's JSON file, writes the per-query CSV files and
        appends the resulting query/file entries to query_tracking_list for
        later execution.
        """
        provider = sub_type.get_data_provider()
        self.logger.info("Loading BGI Data: %s", provider)

        filepath = sub_type.get_filepath()
        if filepath is None:
            self.logger.error("Can't find input file for %s", sub_type)
            sys.exit()

        data = JSONFile().get_data(filepath)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # (template, commit size, CSV filename prefix) triples, in the same
        # order as the lists yielded by get_generators().  Expanded below
        # into the (template, param1, param2) shape process_query_params
        # expects; extra params would be ignored.
        spec = [
            (self.gene_metadata_query_template, commit_size,
             "gene_metadata_"),
            (self.gene_query_template, commit_size, "gene_data_"),
            (self.basic_gene_load_relations_query_template, commit_size,
             "gene_data_load_"),
            (self.basic_gene_species_relations_query_template, commit_size,
             "gene_data_species_"),
            (self.so_terms_query_template, commit_size, "gene_so_terms_"),
            (self.chromosomes_query_template, commit_size,
             "gene_chromosomes_"),
            (self.gene_secondary_ids_query_template, commit_size,
             "gene_secondary_ids_"),
            (self.genomic_locations_query_template, commit_size,
             "gene_genomic_locations_"),
            (self.xrefs_query_template, commit_size,
             "gene_cross_references_"),
            (self.xrefs_relationships_query_template, commit_size,
             "gene_cross_references_relationships_"),
            # Synonyms use a larger fixed commit size than the configured one.
            (self.gene_synonyms_query_template, 600000, "gene_synonyms_"),
        ]
        query_template_list = [
            [template, size, prefix + provider + ".csv"]
            for template, size, prefix in spec
        ]

        # Obtain the generator
        generators = self.get_generators(data, provider, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)

        for entry in query_and_file_list:
            query_tracking_list.append(entry)

        self.error_messages("BGI-{}: ".format(provider))
        self.logger.info("Finished Loading BGI Data: %s", provider)
Esempio n. 27
0
    def _process_sub_type(self, sub_type, query_tracking_list):
        """Load expression data for a single sub type.

        Writes the per-query CSV files and appends the resulting query/file
        entries to query_tracking_list for later execution.  SGD uses its own
        cellular-component expression template; all other providers share the
        generic one.
        """
        data_provider = sub_type.get_data_provider()
        self.logger.info("Loading Expression Data: %s", data_provider)

        data_file = sub_type.get_filepath()
        if data_file is None:
            self.logger.warning("No Data found for %s skipping",
                                data_provider)
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # (template, CSV filename prefix) pairs, in the same order as the
        # lists yielded by get_generators().  Expanded below into the
        # (template, param1, param2) shape process_query_params expects;
        # extra params would be ignored.
        spec = [
            (self.bio_entity_expression_query_template,
             "expression_entities_"),
            (self.bio_entity_gene_ao_query_template, "expression_gene_ao_"),
            (self.bio_entity_gene_expression_join_query_template,
             "expression_entity_joins_"),
            (self.ao_expression_query_template, "expression_ao_expression_"),
        ]

        # SGD gets a provider-specific cellular-component template.
        if data_provider == 'SGD':
            spec.append((self.sgd_cc_expression_query_template,
                         "expression_SGD_cc_expression_"))
        else:
            spec.append((self.cc_expression_query_template,
                         "expression_cc_expression_"))

        spec += [
            (self.ao_cc_expression_query_template,
             "expression_ao_cc_expression_"),
            (self.eas_qualified_query_template,
             "expression_eas_qualified_"),
            (self.eas_substructure_query_template,
             "expression_eas_substructure_"),
            (self.eass_qualified_query_template,
             "expression_eass_qualified_"),
            (self.ccq_expression_query_template,
             "expression_ccq_expression_"),
            (self.stage_expression_query_template,
             "expression_stage_expression_"),
            (self.uberon_stage_query_template,
             "expression_uberon_stage_"),
            (self.uberon_ao_query_template, "expression_uberon_ao_"),
            (self.uberon_ao_other_query_template,
             "expression_uberon_ao_other_"),
            (self.uberon_stage_other_query_template,
             "expression_uberon_stage_other_"),
            (self.xrefs_query_template, "expression_cross_references_"),
            (self.add_pubs_query_template, "expression_add_pubs_"),
        ]

        query_template_list = [
            [template, commit_size, prefix + data_provider + ".csv"]
            for template, prefix in spec
        ]

        # Obtain the generator
        generators = self.get_generators(data_file, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)

        for entry in query_and_file_list:
            query_tracking_list.append(entry)

        self.logger.info("Finished Loading Expression Data: %s",
                         data_provider)
Esempio n. 28
0
    def _process_sub_type(self, sub_type):

        self.logger.info("Loading Construct Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Construct Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_template_list = [
            [
                ConstructETL.construct_query_template, commit_size,
                "Construct_data_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_secondary_ids_query_template,
                commit_size, "Construct_secondary_ids_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_synonyms_query_template, commit_size,
                "Construct_synonyms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_xrefs_query_template, commit_size,
                "Construct_xrefs_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.non_bgi_component_query_template, commit_size,
                "Construct_non_bgi_component_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                ConstructETL.construct_gene_component_query_template,
                commit_size, "Construct_components_gene" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_no_gene_component_query_template,
                commit_size, "Construct_components_no_gene" +
                sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data, sub_type.get_data_provider(),
                                         batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)