Esempio n. 1
0
    def insert_browsable_files(self):
        with self.metadata_connection_handle as conn:
            # insert into browsable file table, if files not already there
            files_query = f"select file_id, filename from evapro.browsable_file " \
                          f"where project_accession = '{self.project_accession}';"
            rows = get_all_results_for_query(conn, files_query)
            if len(rows) > 0:
                self.info('Browsable files already inserted, skipping')
                return
            self.info('Inserting browsable files...')
            insert_query = "insert into browsable_file (file_id,ena_submission_file_id,filename,project_accession,assembly_set_id) " \
                           "select file.file_id,ena_submission_file_id,filename,project_accession,assembly_set_id " \
                           "from (select * from analysis_file af " \
                           "join analysis a on a.analysis_accession = af.analysis_accession " \
                           "join project_analysis pa on af.analysis_accession = pa.analysis_accession " \
                           f"where pa.project_accession = '{self.project_accession}' ) myfiles " \
                           "join file on file.file_id = myfiles.file_id where file.file_type ilike 'vcf';"
            execute_query(conn, insert_query)

            # update loaded and release date
            release_date = self.eload_cfg.query('brokering', 'ena', 'hold_date')
            release_update = f"update evapro.browsable_file " \
                             f"set loaded = true, eva_release = '{release_date.strftime('%Y%m%d')}' " \
                             f"where project_accession = '{self.project_accession}';"
            execute_query(conn, release_update)

            # update FTP file paths
            rows = get_all_results_for_query(conn, files_query)
            if len(rows) == 0:
                raise ValueError('Something went wrong with loading from ENA')
            for file_id, filename in rows:
                ftp_update = f"update evapro.file " \
                             f"set ftp_file = '/ftp.ebi.ac.uk/pub/databases/eva/{self.project_accession}/{filename}' " \
                             f"where file_id = '{file_id}';"
                execute_query(conn, ftp_update)
def get_snpmapinfo_asm_columns(species_info, snpmapinfo_table_name):
    accession_columns = ['asm_acc', 'asm_version']
    assembly_name_columns = ['assembly']
    with get_db_conn_for_species(species_info) as pg_conn_for_species:
        accession_columns_query = "select distinct table_schema from information_schema.columns " \
                                  "where table_schema = 'dbsnp_{0}' and table_name = '{1}' " \
                                  "and column_name in ({2});".format(species_info["database_name"],
                                                                     snpmapinfo_table_name,
                                                                     ",".join(["'{0}'".format(name)
                                                                               for name in accession_columns]))
        assembly_name_columns_query = "select distinct table_schema from information_schema.columns " \
                                      "where table_schema = 'dbsnp_{0}' and table_name = '{1}' " \
                                      "and column_name in ({2});".format(species_info["database_name"],
                                                                         snpmapinfo_table_name,
                                                                         ",".join(["'{0}'".format(name)
                                                                                   for name in assembly_name_columns]))
        available_column_names = get_all_results_for_query(pg_conn_for_species, accession_columns_query)
        if len(available_column_names) > 0:
            return [accession_columns[0], accession_columns[1]]

        available_column_names = get_all_results_for_query(pg_conn_for_species, assembly_name_columns_query)
        if len(available_column_names) > 0:
            return [assembly_name_columns[0]]
        else:
            logger.error("No assembly related column names found for the table dbsnp_{0}.{1}"
                         .format(species_info["database_name"], snpmapinfo_table_name))
            return []
    def get_species_info(self):
        """Adds species info into the config: taxonomy id and scientific name,
        and assembly accession, fasta, and report."""
        with self.metadata_connection_handle as conn:
            query = f"select a.taxonomy_id, b.scientific_name " \
                    f"from project_taxonomy a " \
                    f"join taxonomy b on a.taxonomy_id=b.taxonomy_id " \
                    f"where a.project_accession='{self.project_accession}';"
            rows = get_all_results_for_query(conn, query)
        if len(rows) < 1:
            raise ValueError(f'No taxonomy for {self.project_accession} found in metadata DB.')
        elif len(rows) > 1:
            raise ValueError(f'Multiple taxonomy for {self.project_accession} found in metadata DB.')
        tax_id, sci_name = rows[0]
        self.eload_cfg.set('submission', 'taxonomy_id', value=tax_id)
        self.eload_cfg.set('submission', 'scientific_name', value=sci_name)

        with self.metadata_connection_handle as conn:
            query = f"select distinct b.analysis_accession, b.vcf_reference_accession " \
                    f"from project_analysis a " \
                    f"join analysis b on a.analysis_accession=b.analysis_accession " \
                    f"where a.project_accession='{self.project_accession}' and b.hidden_in_eva=0;"
            rows = get_all_results_for_query(conn, query)
        if len(rows) < 1:
            raise ValueError(f'No reference accession for {self.project_accession} found in metadata DB.')
        for analysis_accession, asm_accession in rows:
            self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_accession', value=asm_accession)
            fasta_path, report_path = get_reference_fasta_and_report(sci_name, asm_accession)
            self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_fasta', value=fasta_path)
            self.eload_cfg.set('submission', 'analyses', analysis_accession, 'assembly_report', value=report_path)
Esempio n. 4
0
 def insert_browsable_files(self):
     with self.metadata_connection_handle as conn:
         # insert into browsable file table, if files not already there
         files_query = (
             f"select file_id, ena_submission_file_id,filename,project_accession,assembly_set_id "
             f"from evapro.browsable_file "
             f"where project_accession = '{self.project_accession}';")
         rows_in_table = get_all_results_for_query(conn, files_query)
         find_browsable_files_query = (
             "select file.file_id,ena_submission_file_id,filename,project_accession,assembly_set_id "
             "from (select * from analysis_file af "
             "join analysis a on a.analysis_accession = af.analysis_accession "
             "join project_analysis pa on af.analysis_accession = pa.analysis_accession "
             f"where pa.project_accession = '{self.project_accession}' ) myfiles "
             "join file on file.file_id = myfiles.file_id where file.file_type ilike 'vcf';"
         )
         rows_expected = get_all_results_for_query(conn, files_query)
         if len(rows_in_table) > 0:
             if set(rows_in_table) == set(rows_expected):
                 self.info('Browsable files already inserted, skipping')
             else:
                 self.warning(
                     f'Found {len(rows_in_table)} browsable file rows in the table but they are different '
                     f'from the expected ones: '
                     f'{os.linesep + os.linesep.join([str(row) for row in rows_expected])}'
                 )
         else:
             self.info('Inserting browsable files...')
             insert_query = (
                 "insert into browsable_file (file_id,ena_submission_file_id,filename,project_accession,"
                 "assembly_set_id) " + find_browsable_files_query)
             execute_query(conn, insert_query)
Esempio n. 5
0
 def analysis_accessions(self):
     if self._preset_analysis_accessions:
         with self.metadata_connection_handle as conn:
             query = (
                 f"select distinct analysis_accession from analysis "
                 f"where analysis in {list_to_sql_in_list(self._preset_analysis_accessions)}"
                 f" and hidden_in_eva=0;")
             rows = get_all_results_for_query(conn, query)
             if len(rows) != len(self._preset_analysis_accessions):
                 raise ValueError(
                     f"Some analysis accession could not be found for analyses "
                     f"{', '.join(self._preset_analysis_accessions)} in metadata DB."
                 )
     else:
         with self.metadata_connection_handle as conn:
             query = (
                 f"select distinct b.analysis_accession from project_analysis a "
                 f"join analysis b on a.analysis_accession=b.analysis_accession "
                 f"where a.project_accession='{self.project_accession}' and b.hidden_in_eva=0;"
             )
             rows = get_all_results_for_query(conn, query)
             if len(rows) == 0:
                 raise ValueError(
                     f'No analysis accession could be found for project {self.project_accession} '
                     f'in metadata DB.')
     return [row[0] for row in rows]
def get_assembly_list_for_taxonomy(private_config_xml_file, taxonomy, release_version):
    assembly_list = set()
    with get_metadata_connection_handle("production", private_config_xml_file) as pg_conn:
        query = f'SELECT assembly_accession from evapro.assembly where taxonomy_id = {taxonomy}'
        for assembly in get_all_results_for_query(pg_conn, query):
            assembly_list.add(assembly[0])
        query = (f"SELECT distinct assembly_accession "
                 f"from eva_progress_tracker.clustering_release_tracker "
                 f"where taxonomy={taxonomy} and release_version={release_version} and assembly_accession!='Unmapped'")
        for assembly in get_all_results_for_query(pg_conn, query):
            assembly_list.add(assembly[0])
    return list(assembly_list)
def get_contigs_accessions_for(pg_conn, accession):
    db_query = ("select distinct contig_accession "
                "from eva_tasks.eva2150_mongo_genbank_contigs "
                "where source='%s' AND assembly_accession='%s'")

    all_eva_contigs = [
        contig for contig, in get_all_results_for_query(
            pg_conn, db_query % ('submittedVariantEntity', accession))
    ]
    all_dbsnp_contigs = [
        contig for contig, in get_all_results_for_query(
            pg_conn, db_query %
            ('dbsnpSubmittedVariantOperationEntity', accession))
    ]
    return all_eva_contigs, all_dbsnp_contigs
 def project_accession(self):
     with self.metadata_connection_handle as conn:
         query = f"select project_accession from evapro.project_eva_submission where eload_id={self.eload_num};"
         rows = get_all_results_for_query(conn, query)
     if len(rows) != 1:
         raise ValueError(f'No project accession for {self.eload} found in metadata DB.')
     return rows[0][0]
def get_assemblies_from_evapro(profile, private_config_xml_file):
    with get_metadata_connection_handle(profile, private_config_xml_file) as pg_conn:
        query = "select distinct assembly_accession from evapro.accessioned_assembly where assembly_accession like 'GCA%'" \
                " union " \
                "select distinct assembly_accession from eva_progress_tracker.remapping_tracker where assembly_accession like 'GCA%'"
        evapro_assemblies = get_all_results_for_query(pg_conn, query)
        return [asm[0] for asm in evapro_assemblies]
Esempio n. 10
0
 def get_study_name(self):
     with self.metadata_connection_handle as conn:
         query = f"SELECT title FROM evapro.project WHERE project_accession='{self.project_accession}';"
         rows = get_all_results_for_query(conn, query)
     if len(rows) != 1:
         raise ValueError(f'More than one project with accession {self.project_accession} found in metadata DB.')
     return rows[0][0]
def fill_in_table_from_remapping(private_config_xml_file, release_version,
                                 reference_directory):
    query_retrieve_info = (
        "select taxonomy, scientific_name, assembly_accession, string_agg(distinct source, ', '), sum(num_ss_ids)"
        "from eva_progress_tracker.remapping_tracker "
        f"where release_version={release_version} "
        "group by taxonomy, scientific_name, assembly_accession")
    with get_metadata_connection_handle("development",
                                        private_config_xml_file) as pg_conn:
        for taxonomy, scientific_name, assembly_accession, sources, num_ss_id in get_all_results_for_query(
                pg_conn, query_retrieve_info):
            if num_ss_id == 0:
                # Do not release species with no data
                continue

            should_be_clustered = True
            should_be_released = True
            ncbi_assembly = NCBIAssembly(assembly_accession, scientific_name,
                                         reference_directory)
            fasta_path = ncbi_assembly.assembly_fasta_path
            report_path = ncbi_assembly.assembly_report_path
            tempmongo_instance = get_tempmongo_instance(pg_conn, taxonomy)
            release_folder_name = normalise_taxon_scientific_name(
                scientific_name)
            query_insert = (
                'INSERT INTO eva_progress_tracker.clustering_release_tracker '
                '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
                'fasta_path, report_path, tempmongo_instance, should_be_released, release_folder_name) '
                f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{assembly_accession}', {release_version}, "
                f"{should_be_clustered}, '{fasta_path}', '{report_path}', '{tempmongo_instance}', {should_be_released}, "
                f"'{release_folder_name}') ON CONFLICT DO NOTHING")
            execute_query(pg_conn, query_insert)
Esempio n. 12
0
 def update_assembly_set_in_analysis(self):
     taxonomy = self.eload_cfg.query('submission', 'taxonomy_id')
     analyses = self.eload_cfg.query('submission', 'analyses')
     with self.metadata_connection_handle as conn:
         for analysis_alias, analysis_data in analyses.items():
             assembly_accession = analysis_data['assembly_accession']
             assembly_set_id = get_assembly_set(conn, taxonomy,
                                                assembly_accession)
             analysis_accession = self.eload_cfg.query(
                 'brokering', 'ena', 'ANALYSIS', analysis_alias)
             # Check if the update is needed
             check_query = (
                 f"select assembly_set_id from evapro.analysis "
                 f"where analysis_accession = '{analysis_accession}';")
             res = get_all_results_for_query(conn, check_query)
             if res and res[0][0] != assembly_set_id:
                 if res[0][0]:
                     self.error(
                         f'Previous assembly_set_id {res[0][0]} for {analysis_accession} was wrong and '
                         f'will be updated to {assembly_set_id}')
                 analysis_update = (
                     f"update evapro.analysis "
                     f"set assembly_set_id = '{assembly_set_id}' "
                     f"where analysis_accession = '{analysis_accession}';")
                 execute_query(conn, analysis_update)
Esempio n. 13
0
def find_variants_studies_eligible_for_migration(private_config_xml_file,
                                                 migration_start_time,
                                                 migration_end_time):
    with psycopg2.connect(get_pg_uri_for_variant_profile(
            "production", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        query_string = f"select bjep.job_execution_id, bjep.key_name, bjep.string_val, bje.start_time \
                        from batch_job_execution bje join batch_job_execution_params bjep \
                        on bje.job_execution_id=bjep.job_execution_id \
                        where bjep.key_name in ('{study_key}', '{analysis_key}', '{mongodb_key}')  \
                        and bje.start_time between '{migration_start_time}' and '{migration_end_time}'\
                        order by bjep.job_execution_id desc , bjep.key_name"

    query_result = get_all_results_for_query(metadata_connection_handle,
                                             query_string)
    logger.info(f"\nStudies eligible for migration : {query_result}")

    job_parameter_combine = defaultdict(dict)
    for job_id, key_name, key_value, start_time in query_result:
        job_parameter_combine[job_id][key_name] = key_value

    db_study_dict = defaultdict(set)
    for key, val in job_parameter_combine.items():
        db_study_dict[val[mongodb_key]].add(
            (val[study_key], val[analysis_key]))

    return db_study_dict
Esempio n. 14
0
 def restore_table(self, table_name, where_clause):
     query = f"select * from {table_name} where {where_clause}"
     rows = get_all_results_for_query(self.prod_conn, query)
     for row in rows:
         query = f'insert into {table_name} VALUES ' + serialise_tuple(row)
         print(query)
         self.dev_cursor.execute(query)
Esempio n. 15
0
def get_assemblies_from_evapro(private_config_xml_file):
    metadata_handle = psycopg2.connect(get_pg_metadata_uri_for_eva_profile(
        "development", private_config_xml_file),
                                       user="******")
    query = "select assembly_accession from accessioned_assembly where assembly_accession like 'GCA%'"
    evapro_assemblies = get_all_results_for_query(metadata_handle, query)
    return [asm[0] for asm in evapro_assemblies]
Esempio n. 16
0
def get_project_alias(project_accession):
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as conn:
        query = f"select alias from evapro.project where project_accession='{project_accession}';"
        rows = get_all_results_for_query(conn, query)
    if len(rows) != 1:
        raise ValueError(f'No project alias for {project_accession} found in metadata DB.')
    return rows[0][0]
Esempio n. 17
0
def get_assemblies_to_import_for_dbsnp_species(metadata_connection_handle, dbsnp_species_taxonomy, release_version):
    query = "select distinct assembly_accession from dbsnp_ensembl_species.release_assemblies " \
            "where release_version='{0}' " \
            "and data_source='dbSNP' and tax_id='{1}'".format(release_version, dbsnp_species_taxonomy)
    results = get_all_results_for_query(metadata_connection_handle, query)
    if len(results) > 0:
        return [result[0] for result in results]
    return []
def check_if_entry_exists_for_taxonomy_and_assembly(pg_conn, taxonomy,
                                                    assembly):
    logger.info(
        f'check if entry exists for taxonomy({taxonomy}) and assembly({assembly})'
    )
    query_check = f"""SELECT count(*) from eva_progress_tracker.clustering_release_tracker
                                 WHERE taxonomy={taxonomy} and assembly_accession='{assembly}'"""
    result = get_all_results_for_query(pg_conn, query_check)
    return True if result[0][0] != 0 else False
Esempio n. 19
0
def get_all_taxonomies_from_eva(private_config_xml_file):
    taxonomy_list = []
    with get_metadata_connection_handle("development",
                                        private_config_xml_file) as pg_conn:
        query = 'SELECT DISTINCT taxonomy_id FROM evapro.taxonomy'
        for taxonomy in get_all_results_for_query(pg_conn, query):
            taxonomy_list.append(taxonomy[0])

    return taxonomy_list
Esempio n. 20
0
 def info_for_release_version(version):
     results = get_all_results_for_query(
         metadata_connection_handle,
         "select distinct release_folder_name from {0} "
         "where taxonomy = '{1}' "
         "and release_version = {2}".format(
             release_properties.release_species_inventory_table,
             taxonomy_id, version))
     return results[0][0] if len(results) > 0 else None
def get_assemblies_with_multimap_snps_for_species(metadata_connection_handle):
    assembly_GCA_accession_map = defaultdict(dict)
    query = "select distinct database_name, assembly, assembly_accession " \
            "from dbsnp_ensembl_species.EVA2015_snpmapinfo_asm_lookup " \
            "where assembly_accession is not null"
    for result in get_all_results_for_query(metadata_connection_handle, query):
        species_name, assembly, GCA_accession = result
        assembly_GCA_accession_map[species_name][assembly] = GCA_accession
    return assembly_GCA_accession_map
def get_tempmongo_instance(pg_conn, taxonomy_id):
    if not taxonomy_tempmongo_instance:
        query = "select taxonomy, tempmongo_instance from eva_progress_tracker.clustering_release_tracker where tempmongo_instance is not null"
        for taxonomy, tempmongo_instance in get_all_results_for_query(pg_conn, query):
            taxonomy_tempmongo_instance[taxonomy] = tempmongo_instance

    if taxonomy_id not in taxonomy_tempmongo_instance:
        taxonomy_tempmongo_instance[taxonomy_id] = next(tempmongo_instances)
    return taxonomy_tempmongo_instance[taxonomy_id]
Esempio n. 23
0
def get_assemblies_and_scientific_name_from_taxonomy(taxonomy_id, metadata_connection_handle, clustering_tracking_table, release_version):
    query = (f"SELECT assembly_accession, scientific_name FROM {clustering_tracking_table} "
             f"WHERE taxonomy = '{taxonomy_id}' "
             f"and release_version = {release_version} "
             f"and assembly_accession <> 'Unmapped' "
             f"and should_be_clustered = 't'")
    results = get_all_results_for_query(metadata_connection_handle, query)
    if len(results) == 0:
        raise Exception("Could not find assemblies pertaining to taxonomy ID: " + taxonomy_id)
    return [result[0] for result in results], results[0][1]
def lookup_GCA_assembly(species_name, snpmapinfo_table_name, asm, metadata_connection_handle):
    query = "select assembly_accession from dbsnp_ensembl_species.EVA2015_snpmapinfo_asm_lookup " \
            "where database_name = '{0}' and snpmapinfo_table_name = '{1}' and assembly = '{2}' " \
            "and assembly_accession <> 'unresolved' and assembly_accession is not null"\
        .format(species_name, snpmapinfo_table_name, asm)
    results = get_all_results_for_query(metadata_connection_handle, query)
    if len(results) == 0:
        raise Exception("Could not look up GCA accession for {0} table for the species {1}"
                        .format(snpmapinfo_table_name, species_name))
    return results[0][0]
Esempio n. 25
0
def get_release_assemblies_for_taxonomy(taxonomy_id, release_species_inventory_table,
                                        release_version, metadata_connection_handle):
    results = get_all_results_for_query(metadata_connection_handle, "select assembly from {0} "
                                                                    "where taxonomy_id = '{1}' "
                                                                    "and release_version = {2} and should_be_processed "
                                                                    "and number_variants_to_process > 0"
                                        .format(release_species_inventory_table, taxonomy_id, release_version))
    if len(results) == 0:
        raise Exception("Could not find assemblies pertaining to taxonomy ID: " + taxonomy_id)
    return [result[0] for result in results]
Esempio n. 26
0
def get_most_recent_timestamp(private_config_xml_file, ftp_table_name):
    with get_metadata_connection_handle(
            'development',
            private_config_xml_file) as metadata_connection_handle:
        results = get_all_results_for_query(
            metadata_connection_handle,
            f"select max(event_ts_txt) as recent_ts from {ftp_table_name};")
        if results and results[0][0]:
            return results[0][0]
    return None
Esempio n. 27
0
def get_file_id_from_md5(md5):
    with get_metadata_connection_handle(cfg['maven']['environment'],
                                        cfg['maven']['settings_file']) as conn:
        query = f"select file_id from file where file_md5='{md5}'"
        rows = get_all_results_for_query(conn, query)
        file_ids = [file_id for file_id, in rows]
        if len(file_ids) > 1:
            raise ValueError(f'Multiple file found with md5 {md5}')
        elif len(file_ids) == 0:
            return None
        return file_ids[0]
Esempio n. 28
0
def process_one_assembly(assembly, resume):
    # Check the original remapping tracking table for the appropriate taxonomies and target assemblies
    query = (
        'SELECT taxonomy, scientific_name, assembly_accession '
        'FROM eva_progress_tracker.remapping_tracker '
        f"WHERE origin_assembly_accession='{assembly}' "
        "AND source='DBSNP'"
    )
    with get_metadata_connection_handle(cfg['maven']['environment'], cfg['maven']['settings_file']) as pg_conn:
        for taxid, scientific_name, target_assembly in get_all_results_for_query(pg_conn, query):
            process_one_taxonomy(assembly, taxid, scientific_name, target_assembly, resume)
Esempio n. 29
0
def get_tax_asm_source_from_eva(private_config_xml_file):
    eva_tax_asm = {}
    with get_metadata_connection_handle("development",
                                        private_config_xml_file) as pg_conn:
        query = f"""SELECT DISTINCT taxonomy_id, source, assembly_id FROM {remapping_genome_target_table} 
        WHERE current=TRUE"""
        for tax_id, source, assembly in get_all_results_for_query(
                pg_conn, query):
            eva_tax_asm[tax_id] = {'assembly': assembly, 'source': source}

    return eva_tax_asm
def get_snpmapinfo_table_names_for_species(species_info):
    query = """select table_name from information_schema.tables 
    where lower(table_name) like '%snpmapinfo%' and table_schema = 'dbsnp_{0}'
    """
    with get_db_conn_for_species(species_info) as pg_conn_for_species:
        results = get_all_results_for_query(pg_conn_for_species, query.format(species_info["database_name"]))
        if len(results) > 0:
            if len(results[0]) > 0:
                return [result[0] for result in results]

    return []