def _load_and_process_data(self):
    """Generate, save and load the GEO cross-reference data for every sub type.

    For each sub type returned by the data type config, the species name
    looked up from the sub type's data provider is URL-encoded and handed to
    ``get_generators``. The resulting generators are written out through
    ``CSVTransactor.save_file_static`` and the matching queries are executed
    with ``Neo4jTransactor.execute_query_batch``.
    """
    for sub_type in self.data_type_config.get_sub_type_objects():
        species_name = ETLHelper.species_lookup_by_data_provider(
            sub_type.get_data_provider())
        species_encoded = urllib.parse.quote_plus(species_name)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        # A fixed batch size is used here; the generator batch size from
        # the data type config is intentionally not consulted.
        batch_size = 100000

        generators = self.get_generators(sub_type, batch_size, species_encoded)

        # One [query template, commit size, CSV file name] entry per query.
        csv_file_name = "geo_xref_data_" + sub_type.get_data_provider() + ".csv"
        query_template_list = [
            [self.geo_xref_query_template, commit_size, csv_file_name],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
def save_descriptions_report_files(self, data_provider, json_desc_writer, context_info, gd_data_manager):
    """Save Descriptions Report Files.

    Writes the gene description report for ``data_provider`` as JSON, plain
    text and TSV files under ``tmp``, named ``<cur_date>_<data_provider>``.
    A header produced by ``create_header`` is added to the text and TSV
    files through ``add_header_to_file``. When ``GENERATE_REPORTS`` in
    ``context_info.env`` is truthy, ``upload_files_to_fms`` is called with
    the extension-less file path.
    """
    # Only the first two dot-separated components of the release are kept.
    release_parts = context_info.env["ALLIANCE_RELEASE"].split(".")[:2]
    overall = json_desc_writer.overall_properties
    overall.species = data_provider
    overall.release_version = ".".join(release_parts)
    overall.date = self.cur_date

    file_path = os.path.join("tmp", "_".join((self.cur_date, data_provider)))

    json_desc_writer.write_json(
        file_path=f"{file_path}.json",
        pretty=True,
        include_single_gene_stats=True,
        data_manager=gd_data_manager,
    )
    json_desc_writer.write_plain_text(file_path=f"{file_path}.txt")

    readme = (
        "This file contains the following fields: gene ID, gene name, and gene description. The gene "
        "descriptions are generated by an algorithm developed by the Alliance that uses highly structured "
        "gene data such as associations to various ontology terms (e.g., Gene Ontology terms) and the "
        "Alliance strict orthology set. The original set of ontology terms that a gene is annotated to may "
        "have been trimmed to an ancestor term in the ontology, in order to balance readability with the "
        "amount of information in the description. The complete set of annotations to any gene in this file "
        "may be found in the relevant data tables on the Alliance gene page."
    )
    species = ETLHelper.species_lookup_by_data_provider(data_provider)
    taxon_id = ETLHelper.get_taxon_from_mod(data_provider)

    def _prepend_header(data_format):
        # Build the header for this format, drop blank lines and
        # surrounding whitespace, then add it to the matching report file.
        raw_header = create_header(
            file_type='Gene Descriptions',
            database_version=context_info.env["ALLIANCE_RELEASE"],
            data_format=data_format,
            readme=readme,
            species=species,
            taxon_ids='# TaxonIDs:NCBITaxon:' + taxon_id,
        )
        stripped_lines = (line.strip() for line in raw_header.splitlines())
        header = "\n".join(line for line in stripped_lines if line)
        self.add_header_to_file(file_path=f"{file_path}.{data_format}", header=header)

    _prepend_header('txt')

    # The TSV report is written only after the text report has its header.
    json_desc_writer.write_tsv(file_path=f"{file_path}.tsv")
    _prepend_header('tsv')

    if context_info.env["GENERATE_REPORTS"]:
        self.upload_files_to_fms(file_path, context_info, data_provider, self.logger)