def insert_browsable_files(self): with self.metadata_connection_handle as conn: # insert into browsable file table, if files not already there files_query = f"select file_id, filename from evapro.browsable_file " \ f"where project_accession = '{self.project_accession}';" rows = get_all_results_for_query(conn, files_query) if len(rows) > 0: self.info('Browsable files already inserted, skipping') return self.info('Inserting browsable files...') insert_query = "insert into browsable_file (file_id,ena_submission_file_id,filename,project_accession,assembly_set_id) " \ "select file.file_id,ena_submission_file_id,filename,project_accession,assembly_set_id " \ "from (select * from analysis_file af " \ "join analysis a on a.analysis_accession = af.analysis_accession " \ "join project_analysis pa on af.analysis_accession = pa.analysis_accession " \ f"where pa.project_accession = '{self.project_accession}' ) myfiles " \ "join file on file.file_id = myfiles.file_id where file.file_type ilike 'vcf';" execute_query(conn, insert_query) # update loaded and release date release_date = self.eload_cfg.query('brokering', 'ena', 'hold_date') release_update = f"update evapro.browsable_file " \ f"set loaded = true, eva_release = '{release_date.strftime('%Y%m%d')}' " \ f"where project_accession = '{self.project_accession}';" execute_query(conn, release_update) # update FTP file paths rows = get_all_results_for_query(conn, files_query) if len(rows) == 0: raise ValueError('Something went wrong with loading from ENA') for file_id, filename in rows: ftp_update = f"update evapro.file " \ f"set ftp_file = '/ftp.ebi.ac.uk/pub/databases/eva/{self.project_accession}/{filename}' " \ f"where file_id = '{file_id}';" execute_query(conn, ftp_update)
def get_snpmapinfo_asm_columns(species_info, snpmapinfo_table_name): accession_columns = ['asm_acc', 'asm_version'] assembly_name_columns = ['assembly'] with get_db_conn_for_species(species_info) as pg_conn_for_species: accession_columns_query = "select distinct table_schema from information_schema.columns " \ "where table_schema = 'dbsnp_{0}' and table_name = '{1}' " \ "and column_name in ({2});".format(species_info["database_name"], snpmapinfo_table_name, ",".join(["'{0}'".format(name) for name in accession_columns])) assembly_name_columns_query = "select distinct table_schema from information_schema.columns " \ "where table_schema = 'dbsnp_{0}' and table_name = '{1}' " \ "and column_name in ({2});".format(species_info["database_name"], snpmapinfo_table_name, ",".join(["'{0}'".format(name) for name in assembly_name_columns])) available_column_names = get_all_results_for_query(pg_conn_for_species, accession_columns_query) if len(available_column_names) > 0: return [accession_columns[0], accession_columns[1]] available_column_names = get_all_results_for_query(pg_conn_for_species, assembly_name_columns_query) if len(available_column_names) > 0: return [assembly_name_columns[0]] else: logger.error("No assembly related column names found for the table dbsnp_{0}.{1}" .format(species_info["database_name"], snpmapinfo_table_name)) return []
def get_species_info(self): """Adds species info into the config: taxonomy id and scientific name, and assembly accession, fasta, and report.""" with self.metadata_connection_handle as conn: query = f"select a.taxonomy_id, b.scientific_name " \ f"from project_taxonomy a " \ f"join taxonomy b on a.taxonomy_id=b.taxonomy_id " \ f"where a.project_accession='{self.project_accession}';" rows = get_all_results_for_query(conn, query) if len(rows) < 1: raise ValueError(f'No taxonomy for {self.project_accession} found in metadata DB.') elif len(rows) > 1: raise ValueError(f'Multiple taxonomy for {self.project_accession} found in metadata DB.') tax_id, sci_name = rows[0] self.eload_cfg.set('submission', 'taxonomy_id', value=tax_id) self.eload_cfg.set('submission', 'scientific_name', value=sci_name) with self.metadata_connection_handle as conn: query = f"select distinct b.analysis_accession, b.vcf_reference_accession " \ f"from project_analysis a " \ f"join analysis b on a.analysis_accession=b.analysis_accession " \ f"where a.project_accession='{self.project_accession}' and b.hidden_in_eva=0;" rows = get_all_results_for_query(conn, query) if len(rows) < 1: raise ValueError(f'No reference accession for {self.project_accession} found in metadata DB.') for analysis_accession, asm_accession in rows: self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_accession', value=asm_accession) fasta_path, report_path = get_reference_fasta_and_report(sci_name, asm_accession) self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_fasta', value=fasta_path) self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_report', value=report_path)
def insert_browsable_files(self): with self.metadata_connection_handle as conn: # insert into browsable file table, if files not already there files_query = ( f"select file_id, ena_submission_file_id,filename,project_accession,assembly_set_id " f"from evapro.browsable_file " f"where project_accession = '{self.project_accession}';") rows_in_table = get_all_results_for_query(conn, files_query) find_browsable_files_query = ( "select file.file_id,ena_submission_file_id,filename,project_accession,assembly_set_id " "from (select * from analysis_file af " "join analysis a on a.analysis_accession = af.analysis_accession " "join project_analysis pa on af.analysis_accession = pa.analysis_accession " f"where pa.project_accession = '{self.project_accession}' ) myfiles " "join file on file.file_id = myfiles.file_id where file.file_type ilike 'vcf';" ) rows_expected = get_all_results_for_query(conn, files_query) if len(rows_in_table) > 0: if set(rows_in_table) == set(rows_expected): self.info('Browsable files already inserted, skipping') else: self.warning( f'Found {len(rows_in_table)} browsable file rows in the table but they are different ' f'from the expected ones: ' f'{os.linesep + os.linesep.join([str(row) for row in rows_expected])}' ) else: self.info('Inserting browsable files...') insert_query = ( "insert into browsable_file (file_id,ena_submission_file_id,filename,project_accession," "assembly_set_id) " + find_browsable_files_query) execute_query(conn, insert_query)
def analysis_accessions(self): if self._preset_analysis_accessions: with self.metadata_connection_handle as conn: query = ( f"select distinct analysis_accession from analysis " f"where analysis in {list_to_sql_in_list(self._preset_analysis_accessions)}" f" and hidden_in_eva=0;") rows = get_all_results_for_query(conn, query) if len(rows) != len(self._preset_analysis_accessions): raise ValueError( f"Some analysis accession could not be found for analyses " f"{', '.join(self._preset_analysis_accessions)} in metadata DB." ) else: with self.metadata_connection_handle as conn: query = ( f"select distinct b.analysis_accession from project_analysis a " f"join analysis b on a.analysis_accession=b.analysis_accession " f"where a.project_accession='{self.project_accession}' and b.hidden_in_eva=0;" ) rows = get_all_results_for_query(conn, query) if len(rows) == 0: raise ValueError( f'No analysis accession could be found for project {self.project_accession} ' f'in metadata DB.') return [row[0] for row in rows]
def get_assembly_list_for_taxonomy(private_config_xml_file, taxonomy, release_version): assembly_list = set() with get_metadata_connection_handle("production", private_config_xml_file) as pg_conn: query = f'SELECT assembly_accession from evapro.assembly where taxonomy_id = {taxonomy}' for assembly in get_all_results_for_query(pg_conn, query): assembly_list.add(assembly[0]) query = (f"SELECT distinct assembly_accession " f"from eva_progress_tracker.clustering_release_tracker " f"where taxonomy={taxonomy} and release_version={release_version} and assembly_accession!='Unmapped'") for assembly in get_all_results_for_query(pg_conn, query): assembly_list.add(assembly[0]) return list(assembly_list)
def get_contigs_accessions_for(pg_conn, accession): db_query = ("select distinct contig_accession " "from eva_tasks.eva2150_mongo_genbank_contigs " "where source='%s' AND assembly_accession='%s'") all_eva_contigs = [ contig for contig, in get_all_results_for_query( pg_conn, db_query % ('submittedVariantEntity', accession)) ] all_dbsnp_contigs = [ contig for contig, in get_all_results_for_query( pg_conn, db_query % ('dbsnpSubmittedVariantOperationEntity', accession)) ] return all_eva_contigs, all_dbsnp_contigs
def project_accession(self): with self.metadata_connection_handle as conn: query = f"select project_accession from evapro.project_eva_submission where eload_id={self.eload_num};" rows = get_all_results_for_query(conn, query) if len(rows) != 1: raise ValueError(f'No project accession for {self.eload} found in metadata DB.') return rows[0][0]
def get_assemblies_from_evapro(profile, private_config_xml_file): with get_metadata_connection_handle(profile, private_config_xml_file) as pg_conn: query = "select distinct assembly_accession from evapro.accessioned_assembly where assembly_accession like 'GCA%'" \ " union " \ "select distinct assembly_accession from eva_progress_tracker.remapping_tracker where assembly_accession like 'GCA%'" evapro_assemblies = get_all_results_for_query(pg_conn, query) return [asm[0] for asm in evapro_assemblies]
def get_study_name(self): with self.metadata_connection_handle as conn: query = f"SELECT title FROM evapro.project WHERE project_accession='{self.project_accession}';" rows = get_all_results_for_query(conn, query) if len(rows) != 1: raise ValueError(f'More than one project with accession {self.project_accession} found in metadata DB.') return rows[0][0]
def fill_in_table_from_remapping(private_config_xml_file, release_version, reference_directory): query_retrieve_info = ( "select taxonomy, scientific_name, assembly_accession, string_agg(distinct source, ', '), sum(num_ss_ids)" "from eva_progress_tracker.remapping_tracker " f"where release_version={release_version} " "group by taxonomy, scientific_name, assembly_accession") with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn: for taxonomy, scientific_name, assembly_accession, sources, num_ss_id in get_all_results_for_query( pg_conn, query_retrieve_info): if num_ss_id == 0: # Do not release species with no data continue should_be_clustered = True should_be_released = True ncbi_assembly = NCBIAssembly(assembly_accession, scientific_name, reference_directory) fasta_path = ncbi_assembly.assembly_fasta_path report_path = ncbi_assembly.assembly_report_path tempmongo_instance = get_tempmongo_instance(pg_conn, taxonomy) release_folder_name = normalise_taxon_scientific_name( scientific_name) query_insert = ( 'INSERT INTO eva_progress_tracker.clustering_release_tracker ' '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, ' 'fasta_path, report_path, tempmongo_instance, should_be_released, release_folder_name) ' f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{assembly_accession}', {release_version}, " f"{should_be_clustered}, '{fasta_path}', '{report_path}', '{tempmongo_instance}', {should_be_released}, " f"'{release_folder_name}') ON CONFLICT DO NOTHING") execute_query(pg_conn, query_insert)
def update_assembly_set_in_analysis(self): taxonomy = self.eload_cfg.query('submission', 'taxonomy_id') analyses = self.eload_cfg.query('submission', 'analyses') with self.metadata_connection_handle as conn: for analysis_alias, analysis_data in analyses.items(): assembly_accession = analysis_data['assembly_accession'] assembly_set_id = get_assembly_set(conn, taxonomy, assembly_accession) analysis_accession = self.eload_cfg.query( 'brokering', 'ena', 'ANALYSIS', analysis_alias) # Check if the update is needed check_query = ( f"select assembly_set_id from evapro.analysis " f"where analysis_accession = '{analysis_accession}';") res = get_all_results_for_query(conn, check_query) if res and res[0][0] != assembly_set_id: if res[0][0]: self.error( f'Previous assembly_set_id {res[0][0]} for {analysis_accession} was wrong and ' f'will be updated to {assembly_set_id}') analysis_update = ( f"update evapro.analysis " f"set assembly_set_id = '{assembly_set_id}' " f"where analysis_accession = '{analysis_accession}';") execute_query(conn, analysis_update)
def find_variants_studies_eligible_for_migration(private_config_xml_file, migration_start_time, migration_end_time): with psycopg2.connect(get_pg_uri_for_variant_profile( "production", private_config_xml_file), user="******") as metadata_connection_handle: query_string = f"select bjep.job_execution_id, bjep.key_name, bjep.string_val, bje.start_time \ from batch_job_execution bje join batch_job_execution_params bjep \ on bje.job_execution_id=bjep.job_execution_id \ where bjep.key_name in ('{study_key}', '{analysis_key}', '{mongodb_key}') \ and bje.start_time between '{migration_start_time}' and '{migration_end_time}'\ order by bjep.job_execution_id desc , bjep.key_name" query_result = get_all_results_for_query(metadata_connection_handle, query_string) logger.info(f"\nStudies eligible for migration : {query_result}") job_parameter_combine = defaultdict(dict) for job_id, key_name, key_value, start_time in query_result: job_parameter_combine[job_id][key_name] = key_value db_study_dict = defaultdict(set) for key, val in job_parameter_combine.items(): db_study_dict[val[mongodb_key]].add( (val[study_key], val[analysis_key])) return db_study_dict
def restore_table(self, table_name, where_clause): query = f"select * from {table_name} where {where_clause}" rows = get_all_results_for_query(self.prod_conn, query) for row in rows: query = f'insert into {table_name} VALUES ' + serialise_tuple(row) print(query) self.dev_cursor.execute(query)
def get_assemblies_from_evapro(private_config_xml_file): metadata_handle = psycopg2.connect(get_pg_metadata_uri_for_eva_profile( "development", private_config_xml_file), user="******") query = "select assembly_accession from accessioned_assembly where assembly_accession like 'GCA%'" evapro_assemblies = get_all_results_for_query(metadata_handle, query) return [asm[0] for asm in evapro_assemblies]
def get_project_alias(project_accession): with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn: query = f"select alias from evapro.project where project_accession='{project_accession}';" rows = get_all_results_for_query(conn, query) if len(rows) != 1: raise ValueError(f'No project alias for {project_accession} found in metadata DB.') return rows[0][0]
def get_assemblies_to_import_for_dbsnp_species(metadata_connection_handle, dbsnp_species_taxonomy, release_version): query = "select distinct assembly_accession from dbsnp_ensembl_species.release_assemblies " \ "where release_version='{0}' " \ "and data_source='dbSNP' and tax_id='{1}'".format(release_version, dbsnp_species_taxonomy) results = get_all_results_for_query(metadata_connection_handle, query) if len(results) > 0: return [result[0] for result in results] return []
def check_if_entry_exists_for_taxonomy_and_assembly(pg_conn, taxonomy, assembly): logger.info( f'check if entry exists for taxonomy({taxonomy}) and assembly({assembly})' ) query_check = f"""SELECT count(*) from eva_progress_tracker.clustering_release_tracker WHERE taxonomy={taxonomy} and assembly_accession='{assembly}'""" result = get_all_results_for_query(pg_conn, query_check) return True if result[0][0] != 0 else False
def get_all_taxonomies_from_eva(private_config_xml_file): taxonomy_list = [] with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn: query = 'SELECT DISTINCT taxonomy_id FROM evapro.taxonomy' for taxonomy in get_all_results_for_query(pg_conn, query): taxonomy_list.append(taxonomy[0]) return taxonomy_list
def info_for_release_version(version): results = get_all_results_for_query( metadata_connection_handle, "select distinct release_folder_name from {0} " "where taxonomy = '{1}' " "and release_version = {2}".format( release_properties.release_species_inventory_table, taxonomy_id, version)) return results[0][0] if len(results) > 0 else None
def get_assemblies_with_multimap_snps_for_species(metadata_connection_handle): assembly_GCA_accession_map = defaultdict(dict) query = "select distinct database_name, assembly, assembly_accession " \ "from dbsnp_ensembl_species.EVA2015_snpmapinfo_asm_lookup " \ "where assembly_accession is not null" for result in get_all_results_for_query(metadata_connection_handle, query): species_name, assembly, GCA_accession = result assembly_GCA_accession_map[species_name][assembly] = GCA_accession return assembly_GCA_accession_map
def get_tempmongo_instance(pg_conn, taxonomy_id): if not taxonomy_tempmongo_instance: query = "select taxonomy, tempmongo_instance from eva_progress_tracker.clustering_release_tracker where tempmongo_instance is not null" for taxonomy, tempmongo_instance in get_all_results_for_query(pg_conn, query): taxonomy_tempmongo_instance[taxonomy] = tempmongo_instance if taxonomy_id not in taxonomy_tempmongo_instance: taxonomy_tempmongo_instance[taxonomy_id] = next(tempmongo_instances) return taxonomy_tempmongo_instance[taxonomy_id]
def get_assemblies_and_scientific_name_from_taxonomy(taxonomy_id, metadata_connection_handle, clustering_tracking_table, release_version): query = (f"SELECT assembly_accession, scientific_name FROM {clustering_tracking_table} " f"WHERE taxonomy = '{taxonomy_id}' " f"and release_version = {release_version} " f"and assembly_accession <> 'Unmapped' " f"and should_be_clustered = 't'") results = get_all_results_for_query(metadata_connection_handle, query) if len(results) == 0: raise Exception("Could not find assemblies pertaining to taxonomy ID: " + taxonomy_id) return [result[0] for result in results], results[0][1]
def lookup_GCA_assembly(species_name, snpmapinfo_table_name, asm, metadata_connection_handle): query = "select assembly_accession from dbsnp_ensembl_species.EVA2015_snpmapinfo_asm_lookup " \ "where database_name = '{0}' and snpmapinfo_table_name = '{1}' and assembly = '{2}' " \ "and assembly_accession <> 'unresolved' and assembly_accession is not null"\ .format(species_name, snpmapinfo_table_name, asm) results = get_all_results_for_query(metadata_connection_handle, query) if len(results) == 0: raise Exception("Could not look up GCA accession for {0} table for the species {1}" .format(snpmapinfo_table_name, species_name)) return results[0][0]
def get_release_assemblies_for_taxonomy(taxonomy_id, release_species_inventory_table, release_version, metadata_connection_handle): results = get_all_results_for_query(metadata_connection_handle, "select assembly from {0} " "where taxonomy_id = '{1}' " "and release_version = {2} and should_be_processed " "and number_variants_to_process > 0" .format(release_species_inventory_table, taxonomy_id, release_version)) if len(results) == 0: raise Exception("Could not find assemblies pertaining to taxonomy ID: " + taxonomy_id) return [result[0] for result in results]
def get_most_recent_timestamp(private_config_xml_file, ftp_table_name): with get_metadata_connection_handle( 'development', private_config_xml_file) as metadata_connection_handle: results = get_all_results_for_query( metadata_connection_handle, f"select max(event_ts_txt) as recent_ts from {ftp_table_name};") if results and results[0][0]: return results[0][0] return None
def get_file_id_from_md5(md5): with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn: query = f"select file_id from file where file_md5='{md5}'" rows = get_all_results_for_query(conn, query) file_ids = [file_id for file_id, in rows] if len(file_ids) > 1: raise ValueError(f'Multiple file found with md5 {md5}') elif len(file_ids) == 0: return None return file_ids[0]
def process_one_assembly(assembly, resume): # Check the original remapping tracking table for the appropriate taxonomies and target assemblies query = ( 'SELECT taxonomy, scientific_name, assembly_accession ' 'FROM eva_progress_tracker.remapping_tracker ' f"WHERE origin_assembly_accession='{assembly}' " "AND source='DBSNP'" ) with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn: for taxid, scientific_name, target_assembly in get_all_results_for_query(pg_conn, query): process_one_taxonomy(assembly, taxid, scientific_name, target_assembly, resume)
def get_tax_asm_source_from_eva(private_config_xml_file): eva_tax_asm = {} with get_metadata_connection_handle("development", private_config_xml_file) as pg_conn: query = f"""SELECT DISTINCT taxonomy_id, source, assembly_id FROM {remapping_genome_target_table} WHERE current=TRUE""" for tax_id, source, assembly in get_all_results_for_query( pg_conn, query): eva_tax_asm[tax_id] = {'assembly': assembly, 'source': source} return eva_tax_asm
def get_snpmapinfo_table_names_for_species(species_info): query = """select table_name from information_schema.tables where lower(table_name) like '%snpmapinfo%' and table_schema = 'dbsnp_{0}' """ with get_db_conn_for_species(species_info) as pg_conn_for_species: results = get_all_results_for_query(pg_conn_for_species, query.format(species_info["database_name"])) if len(results) > 0: if len(results[0]) > 0: return [result[0] for result in results] return []