Ejemplo n.º 1
0
def main(assembly_accession, species_name, output_directory,
         private_config_file, clear):
    """Run ``download_or_construct`` for a single NCBI assembly.

    The ``eutils_api_key`` entry is read from the arguments parsed out of
    ``private_config_file`` and handed to ``NCBIAssembly``. ``clear`` is
    forwarded as the ``overwrite`` flag of ``download_or_construct``.
    """
    api_key = get_args_from_private_config_file(
        private_config_file)['eutils_api_key']
    NCBIAssembly(
        assembly_accession,
        species_name,
        output_directory,
        eutils_api_key=api_key,
    ).download_or_construct(overwrite=clear)
Ejemplo n.º 2
0
def get_genome_fasta_and_report(species_name,
                                assembly_accession,
                                output_directory=None,
                                overwrite=False):
    """Return ``(fasta_path, report_path)`` for an assembly.

    When ``output_directory`` is falsy it is looked up in ``cfg`` under
    ``genome_downloader`` / ``output_directory``. ``download_or_construct``
    is only invoked when one of the two files is missing on disk or when
    ``overwrite`` is truthy.
    """
    if not output_directory:
        output_directory = cfg.query('genome_downloader', 'output_directory')
    api_key = cfg['eutils_api_key']
    assembly = NCBIAssembly(assembly_accession,
                            species_name,
                            output_directory,
                            eutils_api_key=api_key)

    # The report path is only checked when the FASTA is already present.
    both_files_present = (os.path.isfile(assembly.assembly_fasta_path)
                          and os.path.isfile(assembly.assembly_report_path))
    if overwrite or not both_files_present:
        assembly.download_or_construct(overwrite=overwrite)

    return assembly.assembly_fasta_path, assembly.assembly_report_path
Ejemplo n.º 3
0
def download_assembly(scientific_name,
                      assembly_accession,
                      download_dir,
                      assembly_report=None):
    """Run ``download_or_construct`` for an assembly and return its file paths.

    The ``eutils_api_key`` is read from ``private-config.json`` located under
    ``eva_accession_path``. When ``assembly_report`` is given, that file is
    copied to the assembly's report path before ``download_or_construct`` is
    called.

    Returns a ``(fasta_path, report_path)`` tuple.
    """
    config_path = os.path.join(eva_accession_path, "private-config.json")
    with open(config_path) as config_handle:
        api_key = json.load(config_handle)['eutils_api_key']

    ncbi_assembly = NCBIAssembly(assembly_accession, scientific_name,
                                 download_dir, api_key)
    if assembly_report:
        # Copy the caller-supplied report to the assembly's report path.
        shutil.copyfile(assembly_report, ncbi_assembly.assembly_report_path)
    ncbi_assembly.download_or_construct()

    return (ncbi_assembly.assembly_fasta_path,
            ncbi_assembly.assembly_report_path)
def fill_in_table_from_remapping(private_config_xml_file, release_version,
                                 reference_directory):
    """Populate ``clustering_release_tracker`` from ``remapping_tracker``.

    Rows of ``eva_progress_tracker.remapping_tracker`` for ``release_version``
    are grouped by (taxonomy, scientific_name, assembly_accession). For each
    group one row is inserted into
    ``eva_progress_tracker.clustering_release_tracker`` with both
    ``should_be_clustered`` and ``should_be_released`` set to ``True``.
    Groups whose summed ``num_ss_ids`` equals 0 are skipped, and rows that
    conflict with existing ones are ignored (``ON CONFLICT DO NOTHING``).

    :param private_config_xml_file: settings file used to open the
        "development" metadata connection.
    :param release_version: release number used both to filter the source
        rows and as the inserted ``release_version``.
    :param reference_directory: directory handed to ``NCBIAssembly`` to
        derive the FASTA and assembly report paths.
    """
    # Each fragment ends with a space so the implicitly concatenated
    # literals never fuse two SQL tokens together.
    query_retrieve_info = (
        "select taxonomy, scientific_name, assembly_accession, string_agg(distinct source, ', '), sum(num_ss_ids) "
        "from eva_progress_tracker.remapping_tracker "
        f"where release_version={release_version} "
        "group by taxonomy, scientific_name, assembly_accession")
    with get_metadata_connection_handle("development",
                                        private_config_xml_file) as pg_conn:
        for taxonomy, scientific_name, assembly_accession, sources, num_ss_id in get_all_results_for_query(
                pg_conn, query_retrieve_info):
            if num_ss_id == 0:
                # Do not release species with no data
                continue

            should_be_clustered = True
            should_be_released = True
            ncbi_assembly = NCBIAssembly(assembly_accession, scientific_name,
                                         reference_directory)
            fasta_path = ncbi_assembly.assembly_fasta_path
            report_path = ncbi_assembly.assembly_report_path
            tempmongo_instance = get_tempmongo_instance(pg_conn, taxonomy)
            release_folder_name = normalise_taxon_scientific_name(
                scientific_name)
            # NOTE(review): values are interpolated straight into the SQL
            # text; a value containing a single quote would break the
            # statement. Consider a parameterised query if execute_query
            # can accept parameters.
            query_insert = (
                'INSERT INTO eva_progress_tracker.clustering_release_tracker '
                '(sources, taxonomy, scientific_name, assembly_accession, release_version, should_be_clustered, '
                'fasta_path, report_path, tempmongo_instance, should_be_released, release_folder_name) '
                f"VALUES ('{sources}', {taxonomy}, '{scientific_name}', '{assembly_accession}', {release_version}, "
                f"{should_be_clustered}, '{fasta_path}', '{report_path}', '{tempmongo_instance}', {should_be_released}, "
                f"'{release_folder_name}') ON CONFLICT DO NOTHING")
            execute_query(pg_conn, query_insert)
def insert_new_entry_for_taxonomy_assembly(pg_conn, sources, rs_count, release_version, taxonomy, assembly, reference_directory):
    """Insert one ``clustering_release_tracker`` row for a taxonomy/assembly.

    The scientific name and tempmongo instance are looked up through
    ``pg_conn``; FASTA and report paths come from an ``NCBIAssembly`` rooted
    at ``reference_directory``. The row is inserted with
    ``should_be_clustered`` False and ``should_be_released`` True, and an
    existing conflicting row is left untouched. ``rs_count`` is only logged.
    """
    logger.info(f'inserting rs count({rs_count}) for taxonomy({taxonomy}) and assembly({assembly})')

    scientific_name = get_scientific_name(pg_conn, taxonomy)
    release_folder_name = normalise_taxon_scientific_name(scientific_name)
    assembly_files = NCBIAssembly(assembly, scientific_name, reference_directory)
    fasta_path = assembly_files.assembly_fasta_path
    report_path = assembly_files.assembly_report_path
    tempmongo_instance = get_tempmongo_instance(pg_conn, taxonomy)

    # (column, SQL literal) pairs in the order they appear in the statement.
    column_values = (
        ('sources', f"'{sources}'"),
        ('taxonomy', f"{taxonomy}"),
        ('scientific_name', f"'{scientific_name}'"),
        ('assembly_accession', f"'{assembly}'"),
        ('release_version', f"{release_version}"),
        ('should_be_clustered', f"{False}"),
        ('fasta_path', f"'{fasta_path}'"),
        ('report_path', f"'{report_path}'"),
        ('tempmongo_instance', f"'{tempmongo_instance}'"),
        ('should_be_released', f"{True}"),
        ('release_folder_name', f"'{release_folder_name}'"),
    )
    columns = ', '.join(column for column, _ in column_values)
    values = ', '.join(value for _, value in column_values)
    insert_statement = (
        'INSERT INTO eva_progress_tracker.clustering_release_tracker '
        f"({columns}) VALUES ({values}) ON CONFLICT DO NOTHING")
    execute_query(pg_conn, insert_statement)
Ejemplo n.º 6
0
def _does_contig_exist_in_assembly(contig_accession: str,
                                   assembly_accession: str) -> bool:
    """Tell whether a contig accession appears in an assembly's report.

    The assembly report is downloaded from ``assembly_report_url`` and
    searched with ``grep -w`` for ``contig_accession``.

    :param contig_accession: contig accession to look for (whole-word match).
    :param assembly_accession: accession of the assembly whose report is
        searched.
    :return: ``True`` when grep produced any output; ``False`` when it
        produced none or when any step raised (the error is logged).
    """
    logger.info(f"Obtaining assembly report for {assembly_accession}...")
    asm = NCBIAssembly(assembly_accession,
                       species_scientific_name=None,
                       reference_directory=None)
    try:
        # The report is expected to land in the current working directory
        # under the basename of its URL, which is the path grep reads below.
        assembly_report_file_name = os.path.basename(asm.assembly_report_url)
        # Remove any stale copy without going through a shell.
        try:
            os.remove(assembly_report_file_name)
        except FileNotFoundError:
            pass
        wget.download(asm.assembly_report_url)
        # "| cat" is kept from the original command; presumably it masks
        # grep's non-zero exit status on "no match" - confirm against
        # run_command_with_output.
        output = run_command_with_output(
            f"Checking if contig {contig_accession} exists in assembly {assembly_accession}",
            f'grep -w "{contig_accession}" "{assembly_report_file_name}" | cat',
            return_process_output=True)
        return output.strip() != ""
    except Exception as ex:
        logger.error(
            f"Could not download assembly report for {assembly_accession} due to: "
            + str(ex))
        return False
Ejemplo n.º 7
0
def collect_assembly_report_genbank_contigs(private_config_xml_file,
                                            assembly_accession):
    """Load the contigs listed in an assembly report into the metadata DB.

    The assembly report is downloaded from ``assembly_report_url`` and read
    line by line. For every non-comment, non-blank line the tab-separated
    columns 0, 4, 5 and 6 are taken as chromosome name, GenBank accession,
    equivalence relationship and RefSeq accession. Rows are passed to
    ``insert_contigs_to_db`` in chunks of 100, followed by one final call
    with the remainder.

    Any exception is caught and its traceback logged; nothing is returned.

    :param private_config_xml_file: settings file used to resolve the
        "development" metadata database URI.
    :param assembly_accession: accession of the assembly whose report is
        loaded.
    """
    try:
        with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", private_config_xml_file),
                              user="******") \
                as metadata_connection_handle:
            asm = NCBIAssembly(assembly_accession,
                               species_scientific_name=None,
                               reference_directory=None)
            # The report is expected to land in the current working directory
            # under the basename of its URL, which is the path opened below.
            assembly_report_file_name = os.path.basename(
                asm.assembly_report_url)
            # Remove any stale copy without going through a shell.
            try:
                os.remove(assembly_report_file_name)
            except FileNotFoundError:
                pass
            wget.download(asm.assembly_report_url)

            insert_chunk_size = 100
            contig_info_list = []
            # Context manager guarantees the report file handle is closed.
            with open(assembly_report_file_name, 'r') as assembly_report:
                for line in assembly_report:
                    stripped_line = line.strip()
                    # Skip comment lines, and blank lines that would
                    # otherwise fail the column indexing below.
                    if not stripped_line or stripped_line.startswith("#"):
                        continue
                    line_components = stripped_line.split("\t")
                    chromosome_name, genbank_accession, accession_equivalence, refseq_accession = \
                        line_components[0], line_components[4], line_components[5], line_components[6]
                    # Equivalence "Relationship" column in the assembly report indicates if
                    # Genbank and RefSeq contig accessions are equivalent
                    is_equivalent_genbank_available = (
                        accession_equivalence.strip() == "=")
                    contig_info_list.append(
                        (assembly_accession, genbank_accession,
                         chromosome_name, is_equivalent_genbank_available,
                         refseq_accession))
                    if len(contig_info_list) == insert_chunk_size:
                        insert_contigs_to_db(metadata_connection_handle,
                                             contig_info_list)
                        contig_info_list = []
            # Flush whatever is left over after the last full chunk.
            insert_contigs_to_db(metadata_connection_handle, contig_info_list)
    except Exception:
        logger.error(traceback.format_exc())
Ejemplo n.º 8
0
def get_assembly_report_url(assembly_accession):
    """Return the ``assembly_report_url`` of the given assembly accession."""
    ncbi_assembly = NCBIAssembly(assembly_accession)
    return ncbi_assembly.assembly_report_url