    def run_nextflow_copy(self, project_accession=None):
        migrate_params = {
            'eload': self.eload,
            'old_eloads_dir': cfg['noah']['eloads_new_mnt'],
            'new_eloads_dir': cfg['eloads_dir'],
            'old_projects_dir': cfg['noah']['projects_new_mnt'],
            'new_projects_dir': cfg['projects_dir'],
        }
        if project_accession:
            migrate_params['project_accession'] = project_accession
        work_dir = self.create_nextflow_temp_output_directory()
        params_file = os.path.join(self.eload_dir, 'migrate_params.yaml')
        # Use a specific log file so we don't overwrite when we sync
        log_file = os.path.join(self.eload_dir, 'migrate_nextflow.log')

        with open(params_file, 'w') as open_file:
            yaml.safe_dump(migrate_params, open_file)
        nextflow_script = os.path.join(NEXTFLOW_DIR, 'migrate.nf')

        try:
            command_utils.run_command_with_output(
                f'Nextflow migrate process', ' '.join(
                    ('export NXF_OPTS="-Xms1g -Xmx8g"; ',
                     cfg['executable']['nextflow'], '-log', log_file, 'run',
                     nextflow_script, '-params-file', params_file, '-work-dir',
                     work_dir)))
            shutil.rmtree(work_dir)
        except subprocess.CalledProcessError:
            raise
def get_rs_with_non_nucleotide_letters(missing_rs_ids_file, assembly_accession,
                                       mongo_database_handle):
    results_from_sve_file = get_ids_from_mongo_for_category(
        missing_rs_ids_file,
        assembly_accession,
        mongo_database_handle,
        aggregate_query_to_use=get_rs_with_non_nucleotide_letters_query_SVE,
        rs_id_attribute_path="rs",
        collections_to_query=[dbsnp_sve_collection_name, sve_collection_name],
        attribution_category="rs_with_non_nucleotide_letters_SVE")
    results_from_svoe_file = get_ids_from_mongo_for_category(
        missing_rs_ids_file,
        assembly_accession,
        mongo_database_handle,
        aggregate_query_to_use=get_rs_with_non_nucleotide_letters_query_SVOE,
        rs_id_attribute_path="inactiveObjects.rs",
        collections_to_query=[
            dbsnp_svoe_collection_name, svoe_collection_name
        ],
        attribution_category="rs_with_non_nucleotide_letters_SVOE")

    final_result_file = results_from_sve_file.replace("_SVE", "")
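    # Concatenate, sort and de-duplicate the per-collection results into a single file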
    run_command_with_output(
        "Concatenate SVE and SVOE results for RS IDs with non-nucleotide letters",
        "(cat {0} {1} | sort | uniq > {2})".format(results_from_sve_file,
                                                   results_from_svoe_file,
                                                   final_result_file))
    return final_result_file
def export_all_multimap_snps_from_dbsnp_dumps(private_config_xml_file):
    result_file = "all_multimap_snp_ids_from_dbsnp_dumps.txt"
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("production", private_config_xml_file), user="******") \
        as metadata_connection_handle:
        assembly_GCA_accession_map = get_assemblies_with_multimap_snps_for_species(
            metadata_connection_handle)
        for species_info in get_species_info(metadata_connection_handle):
            species_name = species_info["database_name"]
            logger.info("Processing species {0}...".format(species_name))
            if species_name in assembly_GCA_accession_map:
                with get_db_conn_for_species(
                        species_info) as species_connection_handle:
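                    # Export multimap SNP IDs only for the flagged assemblies of this species, mapping each assembly name to its GCA accession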
                    export_query = "select snp_id, assembly from dbsnp_{0}.multimap_snps " \
                                   "where assembly in ({1})"\
                        .format(species_name,",".join(["'{0}'".format(assembly) for assembly in
                                                       assembly_GCA_accession_map[species_name].keys()]))
                    logger.info("Running export query: " + export_query)
                    with open(result_file, 'a') as result_file_handle:
                        for snp_id, assembly in get_result_cursor(
                                species_connection_handle, export_query):
                            result_file_handle.write("{0},{1}\n".format(
                                snp_id,
                                assembly_GCA_accession_map[species_name]
                                [assembly]))

    run_command_with_output(
        "Sorting multimap SNP IDs from dbSNP source dumps...",
        "sort -u {0} -o {0}".format(result_file))
def run_release_for_assembly(private_config_xml_file, taxonomy_id,
                             assembly_accession,
                             release_species_inventory_table, release_version,
                             species_release_folder, release_jar_path,
                             job_repo_url, memory):
    exit_code = 0
    port_forwarding_process_id = None
    try:
        port_forwarding_process_id, mongo_port = open_mongo_port_to_tempmongo(
            private_config_xml_file, taxonomy_id,
            release_species_inventory_table, release_version)
        release_properties_file = create_release_properties_file_for_assembly(
            private_config_xml_file, taxonomy_id, assembly_accession,
            release_species_inventory_table, release_version,
            species_release_folder, job_repo_url)
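        # Run the release pipeline against the temporary MongoDB instance via the locally forwarded port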
        release_command = 'java -Xmx{0}g -jar {1} --spring.config.location="{2}" -Dspring.data.mongodb.port={3}'\
            .format(memory, release_jar_path, release_properties_file, mongo_port)
        run_command_with_output(
            "Running release pipeline for assembly: " + assembly_accession,
            release_command)
    except Exception:
        logger.error(
            "Encountered an error while running release for assembly: " +
            assembly_accession + "\n" + traceback.format_exc())
        exit_code = -1
    finally:
        if port_forwarding_process_id is not None:
            close_mongo_port_to_tempmongo(port_forwarding_process_id)
        logger.info("Java release pipeline run completed with exit_code: " +
                    str(exit_code))
        sys.exit(exit_code)
def check_submitted_variant_flanks(mongo_client, ssid):
    samtools = cfg.query('executable', 'samtools', ret_default='samtools')
    sve_collection = mongo_client['eva_accession_sharded']['dbsnpSubmittedVariantEntity']
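    # Fetch only the original (non-remapped) submitted variant records for this SS accession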
    cursor = sve_collection.find({'accession': int(ssid), 'remappedFrom': {'$exists': False}})
    flank_size = 50
    variant_records = list(cursor)
    id_2_info = {}
    for variant_rec in variant_records:
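        # Build samtools faidx region strings for the 50 bp immediately upstream and downstream of the variant start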
        flank_up_coord = f"{variant_rec['contig']}:{variant_rec['start'] - flank_size}-{variant_rec['start'] - 1}"
        flank_down_coord = f"{variant_rec['contig']}:{variant_rec['start'] + 1}-{variant_rec['start'] + flank_size}"
        genome_assembly_fasta = get_genome(assembly_accession=variant_rec['seq'], taxonomy=variant_rec['tax'])
        command = f"{samtools} faidx {genome_assembly_fasta} {flank_up_coord} | grep -v '^>' | sed 's/\\n//' "
        flank_up = run_command_with_output(f'Extract upstream sequence using {flank_up_coord}',  command, return_process_output=True).strip().upper()
        command = f"{samtools} faidx {genome_assembly_fasta} {flank_down_coord} | grep -v '^>' | sed 's/\\n//' "
        flank_down = run_command_with_output(f'Extract downstream sequence using {flank_down_coord}',  command, return_process_output=True).strip().upper()
        id_2_info[variant_rec['_id']] = {'variant_rec': variant_rec, 'flank_up': flank_up, 'flank_down': flank_down}

    for variant_id1, variant_id2 in list(itertools.combinations(id_2_info, 2)):
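        # Align the flank_up + ref + flank_down sequences of the two records and report the relative strand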
        alignment, strand = compare_variant_flanks(
            id_2_info[variant_id1]['flank_up'] + id_2_info[variant_id1]['variant_rec']['ref'] + id_2_info[variant_id1]['flank_down'],
            id_2_info[variant_id2]['flank_up'] + id_2_info[variant_id2]['variant_rec']['ref'] + id_2_info[variant_id2]['flank_down']
        )
        output = format_output(
            ssid, id_2_info[variant_id1]['variant_rec'], id_2_info[variant_id2]['variant_rec'], alignment, strand,
            id_2_info[variant_id1]['flank_up'], id_2_info[variant_id1]['flank_down'],
            id_2_info[variant_id2]['flank_up'], id_2_info[variant_id2]['flank_down']
        )
        print(output)
def get_multimap_snps_from_mongo(private_config_xml_file,
                                 collection_to_validate):
    #  Dirty hack: since mongoexport does not allow switching databases
    #  replace admin in the URI with the database name and relegate admin to authSource
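    #  e.g. mongodb://host:port/admin -> mongodb://host:port/eva_accession_sharded?authSource=admin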
    production_mongo_uri = get_mongo_uri_for_eva_profile("production", private_config_xml_file) \
        .replace("/admin", "/eva_accession_sharded?authSource=admin")
    output_file = collection_to_validate + "_multimap_snp_ids.txt"
    accession_attribute = collection_attribute_paths[collection_to_validate][
        "rs_accession_attribute_name"].replace("inactiveObjects.",
                                               "inactiveObjects.0.")
    assembly_attribute = collection_attribute_paths[collection_to_validate][
        "assembly_attribute_name"].replace("inactiveObjects.",
                                           "inactiveObjects.0.")

    export_command = 'mongoexport --uri "{0}" --collection {1} --type=csv --fields \'{2},{3}\' ' \
                     '--query \'{{"{4}": {{$exists: true}}}}\' --noHeaderLine --out {5}' \
        .format(production_mongo_uri, collection_to_validate,
                accession_attribute, assembly_attribute,
                collection_attribute_paths[collection_to_validate]["mapping_weight_attribute_path"]
                .replace("$.", ""), output_file)
    # Mongoexport is one of those brain-damaged commands that outputs progress to stderr.
    # So, log error stream to output.
    run_command_with_output("Export multimap SNP IDs in collection: " +
                            collection_to_validate,
                            export_command,
                            log_error_stream_to_output=True)
    run_command_with_output(
        "Sorting multimap SNP IDs from collection: " + collection_to_validate,
        "sort -u {0} -o {0}".format(output_file))
    return output_file
 def _run_validation_workflow(self):
     output_dir = self.create_nextflow_temp_output_directory()
     validation_config = {
         'metadata_file': self.eload_cfg.query('submission', 'metadata_spreadsheet'),
         'vcf_files': self.eload_cfg.query('submission', 'vcf_files'),
         'reference_fasta': self.eload_cfg.query('submission', 'assembly_fasta'),
         'reference_report': self.eload_cfg.query('submission', 'assembly_report'),
         'output_dir': output_dir,
         'executable': cfg['executable']
     }
     # run the validation
     validation_confg_file = os.path.join(self.eload_dir, 'validation_confg_file.yaml')
     with open(validation_confg_file, 'w') as open_file:
         yaml.safe_dump(validation_config, open_file)
     validation_script = os.path.join(ROOT_DIR, 'nextflow', 'validation.nf')
     try:
         command_utils.run_command_with_output(
             'Nextflow Validation process',
             ' '.join((
                 'export NXF_OPTS="-Xms1g -Xmx8g"; ',
                 cfg['executable']['nextflow'], validation_script,
                 '-params-file', validation_confg_file,
                 '-work-dir', output_dir
             ))
         )
     except subprocess.CalledProcessError:
         self.error('Nextflow pipeline failed: results might not be complete')
     return output_dir
def hardlink_to_previous_release_assembly_files_in_ftp(
        current_release_assembly_info, release_properties):
    assembly_accession = current_release_assembly_info["assembly_accession"]
    public_current_release_assembly_folder = \
        get_folder_path_for_assembly(release_properties.public_ftp_current_release_folder, assembly_accession)
    public_previous_release_assembly_folder = \
        get_folder_path_for_assembly(release_properties.public_ftp_previous_release_folder, assembly_accession)

    if os.path.exists(public_previous_release_assembly_folder):
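        # Reuse the previous release's files by hardlinking them into a freshly recreated current-release folder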
        recreate_public_release_assembly_folder(
            assembly_accession, public_current_release_assembly_folder)
        for filename in get_release_file_list_for_assembly(
                current_release_assembly_info) + ["md5checksums.txt"]:
            file_to_hardlink = "{0}/{1}".format(
                public_previous_release_assembly_folder, filename)
            if os.path.exists(file_to_hardlink):
                run_command_with_output(
                    "Creating hardlink from previous release assembly folder {0} "
                    "to current release assembly folder {1}".format(
                        public_previous_release_assembly_folder,
                        public_current_release_assembly_folder),
                    'ln -f {0} {1}'.format(
                        file_to_hardlink,
                        public_current_release_assembly_folder))
    else:
        raise Exception(
            "Previous release folder {0} does not exist for assembly!".format(
                public_previous_release_assembly_folder))
def publish_assembly_release_files_to_ftp(current_release_assembly_info,
                                          release_properties):
    assembly_accession = current_release_assembly_info["assembly_accession"]
    public_release_assembly_folder = \
        get_folder_path_for_assembly(release_properties.public_ftp_current_release_folder, assembly_accession)
    # If a species was processed during this release, copy current release data to FTP
    if current_release_assembly_info["should_be_released"] and \
            current_release_assembly_info["num_rs_to_release"] > 0:
        copy_current_assembly_data_to_ftp(current_release_assembly_info,
                                          release_properties,
                                          public_release_assembly_folder)
    else:
        # Since the assembly data is unchanged from the last release, hard-link instead of symlink to older release data
        # so that deleting data in older releases does not impact the newer releases
        # (hard-linking preserves the underlying data for a link until all links to that data are deleted)
        hardlink_to_previous_release_assembly_files_in_ftp(
            current_release_assembly_info, release_properties)

    # Symlink to release README_general_info file - See layout in the link below:
    # https://docs.google.com/presentation/d/1cishRa6P6beIBTP8l1SgJfz71vQcCm5XLmSA8Hmf8rw/edit#slide=id.g63fd5cd489_0_0
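    # cd into the assembly folder and create relative symlinks up to the release-level README files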
    run_command_with_output(
        "Symlinking to release level {0} and {1} files for assembly {1}".
        format(readme_general_info_file, readme_known_issues_file,
               assembly_accession),
        'bash -c "cd {1} && ln -sfT {0}/{2} {1}/{2} && ln -sfT {0}/{3} {1}/{3}"'
        .format(
            os.path.relpath(
                release_properties.public_ftp_current_release_folder,
                public_release_assembly_folder),
            public_release_assembly_folder, readme_general_info_file,
            readme_known_issues_file))
    # Create a link from species folder ex: by_species/ovis_aries to point to this assembly folder
    create_symlink_to_assembly_folder_from_species_folder(
        current_release_assembly_info, release_properties,
        public_release_assembly_folder)
def copy_current_assembly_data_to_ftp(current_release_assembly_info,
                                      release_properties,
                                      public_release_assembly_folder):
    assembly_accession = current_release_assembly_info["assembly_accession"]
    species_release_folder_name = current_release_assembly_info[
        "release_folder_name"]
    md5sum_output_file = os.path.join(public_release_assembly_folder,
                                      "md5checksums.txt")
    run_command_with_output(
        "Removing md5 checksum file {0} for assembly if it exists...".format(
            md5sum_output_file), "rm -f " + md5sum_output_file)
    recreate_public_release_assembly_folder(assembly_accession,
                                            public_release_assembly_folder)

    for filename in get_release_file_list_for_assembly(
            current_release_assembly_info):
        source_file_path = os.path.join(
            release_properties.staging_release_folder,
            species_release_folder_name, assembly_accession, filename)
        run_command_with_output(
            "Copying {0} to {1}...".format(filename,
                                           public_release_assembly_folder),
            "cp {0} {1}".format(source_file_path,
                                public_release_assembly_folder))
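        # Append an md5 checksum entry to md5checksums.txt for each file type that requires one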
        if filename.endswith(release_file_types_to_be_checksummed):
            md5sum_output = run_command_with_output(
                "Checksumming file {0}...".format(filename),
                "(md5sum {0} | awk '{{ print $1 }}')".format(source_file_path),
                return_process_output=True)
            with open(md5sum_output_file, "a") as md5sum_file_handle:
                md5sum_file_handle.write(md5sum_output.strip() + "\t" +
                                         os.path.basename(source_file_path) + "\n")
def count_rs_ids_in_release_files(count_ids_script_path, assembly_accession,
                                  species_release_folder):
    release_count_filename = os.path.join(species_release_folder,
                                          assembly_accession,
                                          "README_rs_ids_counts.txt")
    with open(release_count_filename, "w") as release_count_file_handle:
        release_count_file_handle.write("# Unique RS ID counts\n")
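        # Count RS IDs in the VCF release files with the dedicated script, and in the text release files via their first column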
        for vcf_file_category in release_vcf_file_categories:
            release_vcf_file_name = get_release_vcf_file_name(
                species_release_folder, assembly_accession, vcf_file_category)
            num_ids_in_file = run_command_with_output(
                "Counting RS IDs in file: " + release_vcf_file_name,
                "{0} {1}.gz".format(count_ids_script_path,
                                    release_vcf_file_name),
                return_process_output=True)
            release_count_file_handle.write(num_ids_in_file)
        for text_release_file_category in release_text_file_categories:
            text_release_file_name = get_release_text_file_name(
                species_release_folder, assembly_accession,
                text_release_file_category)
            num_ids_in_file = run_command_with_output(
                "Counting RS IDs in file: " + text_release_file_name,
                "zcat {0}.gz | cut -f1 | uniq | wc -l".format(
                    text_release_file_name),
                return_process_output=True)
            release_count_file_handle.write("{0}.gz\t{1}".format(
                os.path.basename(text_release_file_name),
                str(num_ids_in_file)))
def cluster_one(source, vcf_file, project_accession, assembly_accession, private_config_xml_file,
                profile, output_directory, clustering_artifact, only_printing, memory, dependency):
    properties_path = create_properties_file(source, vcf_file, project_accession, assembly_accession,
                                             private_config_xml_file, profile, output_directory)
    command = generate_bsub_command(assembly_accession, properties_path, clustering_artifact, memory, dependency)
    if not only_printing:
        run_command_with_output('Run clustering command', command, return_process_output=True)
def migrate_artifacts(python_path, cloudsmith_path, artifact_source_dir):
    # only consider directories with actual artifacts in them i.e., directories with version number names
    artifact_dirname_pattern = re.compile(r'[0-9]+\.[0-9]+.*')
    for dir_path, _, file_names in os.walk(artifact_source_dir):
        if artifact_dirname_pattern.match(os.path.basename(dir_path)):
            # Snapshot JARs and POMs are named in a sorted fashion but we only need the latest snapshot
            jar_file_list, pom_file_list = glob.glob(
                dir_path + "/*.jar"), glob.glob(dir_path + "/*.pom")
            if len(pom_file_list) > 0:
                pom_file_to_upload = sorted(pom_file_list)[-1]
                # If JAR is available for an artifact upload JAR with POM as reference
                # (ex: component libraries like accession-commons-mongodb)
                if len(jar_file_list) > 0:
                    jar_file_to_upload = sorted(jar_file_list)[-1]
                    try:
                        run_command_with_output(
                            "Migrating files {0} and {1}...".format(
                                jar_file_to_upload, pom_file_to_upload),
                            "{0} {1} push maven ebivariation/packages {2} --pom-file={3}"
                            .format(python_path, cloudsmith_path,
                                    jar_file_to_upload, pom_file_to_upload))
                    except subprocess.CalledProcessError as ex:
                        logger.error(ex)
                # If only POM is available, upload just the POM file
                # (ex: top-level libraries like accession-commons)
                else:
                    try:
                        run_command_with_output(
                            "Migrating files {0}...".format(
                                pom_file_to_upload),
                            "{0} {1} push maven ebivariation/packages {2}".
                            format(python_path, cloudsmith_path,
                                   pom_file_to_upload))
                    except subprocess.CalledProcessError as ex:
                        logger.error(ex)
 def _run_brokering_prep_workflow(self):
     output_dir = self.create_nextflow_temp_output_directory()
     brokering_config = {
         'vcf_files': self._get_valid_vcf_files(),
         'output_dir': output_dir,
         'executable': cfg['executable']
     }
     # write the brokering preparation config and run the workflow
     brokering_config_file = os.path.join(self.eload_dir, 'brokering_config_file.yaml')
     with open(brokering_config_file, 'w') as open_file:
         yaml.safe_dump(brokering_config, open_file)
     validation_script = os.path.join(NEXTFLOW_DIR, 'prepare_brokering.nf')
     try:
         command_utils.run_command_with_output(
             'Nextflow brokering preparation process',
             ' '.join((
                 cfg['executable']['nextflow'], validation_script,
                 '-params-file', brokering_config_file,
                 '-work-dir', output_dir
             ))
         )
     except subprocess.CalledProcessError as e:
         self.error('Nextflow pipeline failed: aborting brokering')
         raise e
     return output_dir
def sort_bgzip_tabix_release_files(bgzip_path, tabix_path, vcf_sort_script_path, assembly_accession,
                                   species_release_folder):
    commands = []
    # These files are left behind by the sort_vcf_sorted_chromosomes.sh script
    # To be idempotent, remove such files
    commands.append("rm -f {0}/{1}/*.chromosomes".format(species_release_folder, assembly_accession))
    for vcf_file_category in release_vcf_file_categories:
        unsorted_release_file_name = get_unsorted_release_vcf_file_name(species_release_folder, assembly_accession,
                                                                        vcf_file_category)
        sorted_release_file_name = get_release_vcf_file_name(species_release_folder, assembly_accession,
                                                             vcf_file_category)
        commands.append("rm -f {2} && {0} -f {1} {2}".format(vcf_sort_script_path,
                                                             unsorted_release_file_name,
                                                             sorted_release_file_name))
        commands.extend(get_bgzip_tabix_commands_for_file(bgzip_path, tabix_path, sorted_release_file_name))
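    # Text release files only need sorting and de-duplication followed by plain gzip (no bgzip/tabix)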
    for text_release_file_category in release_text_file_categories:
        unsorted_release_file_name = get_unsorted_release_text_file_name(species_release_folder, assembly_accession,
                                                                         text_release_file_category)
        sorted_release_file_name = get_release_text_file_name(species_release_folder, assembly_accession,
                                                              text_release_file_category)
        commands.append("(sort -V {1} | uniq > {2})".format(vcf_sort_script_path,
                                                            unsorted_release_file_name,
                                                            sorted_release_file_name))
        commands.append("(gzip < {0} > {0}.gz)".format(sorted_release_file_name))
    command = " && ".join(commands)
    run_command_with_output("Sort, bgzip and tabix release files for assembly: " + assembly_accession,
                            command)
 def download_assembly_fasta(self, overwrite=False):
     if not os.path.isfile(self.assembly_fasta_path) or overwrite:
         self._download_file(self.assembly_compressed_fasta_path,
                             self.assembly_fasta_url)
         run_command_with_output(
             'Uncompress {}'.format(self.assembly_compressed_fasta_path),
             'gunzip -f {}'.format(self.assembly_compressed_fasta_path))
def merge_dbsnp_eva_vcf_headers(file1, file2, output_file):
    import tempfile
    run_command_with_output(
        "Removing output file {0} if it already exists...".format(output_file),
        "rm -f " + output_file)
    working_folder = os.path.dirname(file1)
    # Write content for each meta info category in the header to a specific temp file
    metainfo_category_tempfile_map = collections.OrderedDict([
        ("fileformat", None), ("info", None), ("contig", None),
        ("reference", None)
    ])
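    # The insertion order above determines the order of the header sections in the merged output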
    for category in metainfo_category_tempfile_map.keys():
        metainfo_category_tempfile_map[category] = tempfile.NamedTemporaryFile(
            mode="a+", prefix=category, dir=working_folder, delete=False)
    with open(file1) as file1_handle, open(file2) as file2_handle:
        for file_handle in [file1_handle, file2_handle]:
            for line in file_handle:
                if line.startswith("##"):
                    metainfo_category = line.split("=")[0].split(
                        "##")[-1].lower()
                    metainfo_category_tempfile_map[metainfo_category].write(
                        line)
                else:
                    break
    for metainfo_category, tempfile_handle in metainfo_category_tempfile_map.items(
    ):
        tempfile_handle.flush()
        # Sorting needs to happen by ID field for the headers
        # ex: ##contig=<ID=1,accession="CM000994.2">
        run_command_with_output(
            "Merging header section ##{0} ...".format(metainfo_category),
            "sort -t ',' -k1 -V {0} | uniq >> {1}".format(
                tempfile_handle.name, output_file))
        tempfile_handle.close()
        os.remove(tempfile_handle.name)
 def _run_validation_workflow(self):
     output_dir = self.create_nextflow_temp_output_directory()
     vcf_files_mapping_csv = self._generate_csv_mappings()
     validation_config = {
         'vcf_files_mapping': vcf_files_mapping_csv,
         'output_dir': output_dir,
         'executable': cfg['executable']
     }
     # run the validation
     validation_confg_file = os.path.join(self.eload_dir,
                                          'validation_confg_file.yaml')
     with open(validation_confg_file, 'w') as open_file:
         yaml.safe_dump(validation_config, open_file)
     validation_script = os.path.join(NEXTFLOW_DIR, 'validation.nf')
     try:
         command_utils.run_command_with_output(
             'Nextflow Validation process', ' '.join(
                 ('export NXF_OPTS="-Xms1g -Xmx8g"; ',
                  cfg['executable']['nextflow'], validation_script,
                  '-params-file', validation_confg_file, '-work-dir',
                  output_dir)))
     except subprocess.CalledProcessError:
         self.error(
             'Nextflow pipeline failed: results might not be complete')
     return output_dir
def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path,
                                  bcftools_path, vcf_sort_script_path,
                                  taxonomy_id, assembly_accession,
                                  release_species_inventory_table,
                                  release_version, species_release_folder):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file), user="******") \
        as metadata_connection_handle:
        release_info = get_release_inventory_info_for_assembly(
            taxonomy_id, assembly_accession, release_species_inventory_table,
            release_version, metadata_connection_handle)
        merge_commands = []
        for vcf_file_category in release_vcf_file_categories:
            merge_commands.extend(
                merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path,
                                          vcf_sort_script_path,
                                          assembly_accession,
                                          species_release_folder,
                                          vcf_file_category,
                                          release_info["sources"]))
        for text_release_file_category in release_text_file_categories:
            merge_commands.extend(
                merge_dbsnp_eva_text_files(assembly_accession,
                                           species_release_folder,
                                           text_release_file_category,
                                           release_info["sources"]))
        final_merge_command = " && ".join(merge_commands)
        run_command_with_output(
            "Merging dbSNP and EVA release files for assembly: " +
            assembly_accession, final_merge_command)
 def load_from_ena(self):
     """
     Loads project metadata from ENA into EVADEV.
     """
     try:
         command_utils.run_command_with_output(
             'Load metadata from ENA to EVADEV',
             ' '.join((
                 'perl',
                 cfg['executable']['load_from_ena'],
                 '-p',
                 self.project_accession,
                 # Current submission process never changes -c or -v
                 '-c',
                 'submitted',
                 '-v',
                 '1',
                 # -l is only checked for when -c=eva_value_added, so in reality never used
                 '-l',
                 self._get_dir('scratch'),
                 '-e',
                 str(self.eload_num))))
         self.eload_cfg.set(self.config_section,
                            'ena_load',
                            value='success')
     except subprocess.CalledProcessError as e:
         self.error('ENA metadata load failed: aborting ingestion.')
         self.eload_cfg.set(self.config_section,
                            'ena_load',
                            value='failure')
         raise e
 def run_variant_load_workflow(self, vep_version, vep_cache_version, skip_annotation, vcf_files_to_ingest):
     output_dir = self.create_nextflow_temp_output_directory(base=self.project_dir)
     job_props = variant_load_props_template(
             project_accession=self.project_accession,
             aggregation=self.eload_cfg.query(self.config_section, 'aggregation'),
             study_name=self.get_study_name(),
             output_dir=self.project_dir.joinpath(project_dirs['transformed']),
             annotation_dir=self.project_dir.joinpath(project_dirs['annotation']),
             stats_dir=self.project_dir.joinpath(project_dirs['stats']),
             vep_species=self.get_vep_species(),
             vep_version=vep_version,
             vep_cache_version=vep_cache_version,
             annotation_skip=skip_annotation
     )
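      # If annotation is requested but no VEP version was supplied, fall back to the versions recorded in the study's MongoDB database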
     if skip_annotation is False and vep_version is None:
         coll_name = job_props['db.collections.annotations.name']
         vep = get_vep_and_vep_cache_version_from_db(self.mongo_uri, self.eload_cfg.query(self.config_section, 'database', 'db_name'), coll_name)
         vep_version = vep['vep_version']
         vep_cache_version = vep['vep_cache_version']
         if not vep_version or not vep_cache_version:
              raise Exception(f'No vep_version or vep_cache_version was provided and none could be found in the DB. '
                              f'To process without annotation, please use the --skip_annotation parameter.')
         self.eload_cfg.set(self.config_section, 'variant_load', 'vep', 'version', value=vep_version)
         self.eload_cfg.set(self.config_section, 'variant_load', 'vep', 'cache_version', value=vep_cache_version)
         job_props.update({
             'app.vep.version': vep_version,
             'app.vep.cache.version': vep_cache_version,
             'annotation.skip': True if vep_cache_version is None else False
         })
     load_config = {
         'valid_vcfs': vcf_files_to_ingest,
         'aggregation_type': self.eload_cfg.query(self.config_section, 'aggregation'),
         'load_job_props': job_props,
         'project_accession': self.project_accession,
         'project_dir': str(self.project_dir),
         'logs_dir': os.path.join(self.project_dir, project_dirs['logs']),
         'eva_pipeline_props': cfg['eva_pipeline_props'],
         'executable': cfg['executable'],
         'jar': cfg['jar'],
     }
     load_config_file = os.path.join(self.project_dir, 'load_config_file.yaml')
     with open(load_config_file, 'w') as open_file:
         yaml.safe_dump(load_config, open_file)
     variant_load_script = os.path.join(NEXTFLOW_DIR, 'variant_load.nf')
     try:
         command_utils.run_command_with_output(
             'Nextflow Variant Load process',
             ' '.join((
                 'export NXF_OPTS="-Xms1g -Xmx8g"; ',
                 cfg['executable']['nextflow'], variant_load_script,
                 '-params-file', load_config_file,
                 '-work-dir', output_dir
             ))
         )
     except subprocess.CalledProcessError as e:
         self.error('Nextflow variant load pipeline failed: results might not be complete')
         self.error(f"See Nextflow logs in {self.eload_dir}/.nextflow.log or pipeline logs "
                    f"in {self.project_dir.joinpath(project_dirs['logs'])} for more details.")
         raise e
     return output_dir
def run_count_script(script_name, species_dir, metric_id):
    log_file = f'{os.path.basename(species_dir)}_count_{metric_id}_rsid.log'
    if not os.path.exists(log_file):
        run_command_with_output(
            f'Run {script_name}',
            f'{os.path.join(shell_script_dir, script_name)} {species_dir} {metric_id}'
        )
    return log_file
def recreate_public_release_assembly_folder(assembly_accession,
                                            public_release_assembly_folder):
    run_command_with_output(
        "Removing release folder if it exists for {0}...".format(
            assembly_accession), "rm -rf " + public_release_assembly_folder)
    run_command_with_output(
        "Creating release folder for {0}...".format(assembly_accession),
        "mkdir -p " + public_release_assembly_folder)
 def run_variant_load_workflow(self):
     output_dir = self.create_nextflow_temp_output_directory(
         base=self.project_dir)
     job_props = variant_load_props_template(
         project_accession=self.project_accession,
         analysis_accession=self.eload_cfg.query('brokering', 'ena',
                                                 'ANALYSIS'),
         aggregation=self.eload_cfg.query(self.config_section,
                                          'aggregation'),
         study_name=self.get_study_name(),
         fasta=self.eload_cfg.query('submission', 'assembly_fasta'),
         output_dir=self.project_dir.joinpath(project_dirs['transformed']),
         annotation_dir=self.project_dir.joinpath(
             project_dirs['annotation']),
         stats_dir=self.project_dir.joinpath(project_dirs['stats']),
         db_name=self.eload_cfg.query(self.config_section, 'database',
                                      'db_name'),
         vep_species=self.get_vep_species(),
         vep_version=self.eload_cfg.query(self.config_section,
                                          'variant_load', 'vep', 'version'),
         vep_cache_version=self.eload_cfg.query(self.config_section,
                                                'variant_load', 'vep',
                                                'cache_version'))
     load_config = {
         'valid_vcfs': [str(f) for f in self.valid_vcf_filenames],
         # TODO implement proper merge check or get from validation
         'needs_merge': self.needs_merge,
         'load_job_props': job_props,
         'project_accession': self.project_accession,
         'project_dir': str(self.project_dir),
         'logs_dir': os.path.join(self.project_dir, project_dirs['logs']),
         'eva_pipeline_props': cfg['eva_pipeline_props'],
         'executable': cfg['executable'],
         'jar': cfg['jar'],
     }
     load_config_file = os.path.join(self.project_dir,
                                     'load_config_file.yaml')
     with open(load_config_file, 'w') as open_file:
         yaml.safe_dump(load_config, open_file)
     variant_load_script = os.path.join(ROOT_DIR, 'nextflow',
                                        'variant_load.nf')
     try:
         command_utils.run_command_with_output(
             'Nextflow Variant Load process', ' '.join(
                 ('export NXF_OPTS="-Xms1g -Xmx8g"; ',
                  cfg['executable']['nextflow'], variant_load_script,
                  '-params-file', load_config_file, '-work-dir',
                  output_dir)))
     except subprocess.CalledProcessError as e:
         self.error(
             'Nextflow variant load pipeline failed: results might not be complete'
         )
         self.error(
             f"See Nextflow logs in {self.eload_dir}/.nextflow.log or pipeline logs "
             f"in {self.project_dir.joinpath(project_dirs['logs'])} for more details."
         )
         raise e
     return output_dir
 def setUp(self) -> None:
     self.test_mongo_db = MongoDatabase(uri=self.uri, db_name=self.db_name)
     self.dump_dir = os.path.join(self.resources_folder, self.db_name)
     run_command_with_output(
         "Drop target test database if it already exists...",
         f"mongo {self.db_name} "
         f"--eval 'db.dropDatabase()'")
     run_command_with_output("Import test database...",
                             f"mongorestore --dir {self.dump_dir}")
def create_requisite_folders(release_properties):
    run_command_with_output(
        "Creating by_species folder for the current release...", "mkdir -p " +
        os.path.join(release_properties.public_ftp_current_release_folder,
                     by_species_folder_name))
    run_command_with_output(
        "Creating by_assembly folder for the current release...", "mkdir -p " +
        os.path.join(release_properties.public_ftp_current_release_folder,
                     by_assembly_folder_name))
def create_species_folder(release_properties,
                          species_current_release_folder_name):
    species_current_release_folder_path = \
        get_folder_path_for_species(release_properties.public_ftp_current_release_folder,
                                    species_current_release_folder_name)

    run_command_with_output(
        "Creating species release folder {0}...".format(
            species_current_release_folder_path),
        "rm -rf {0} && mkdir {0}".format(species_current_release_folder_path))
def get_residual_missing_rs_ids_file(rs_still_missing_file, attributed_rs_ids_file):
    import tempfile
    run_command_with_output("Sorting residual file {0}".format(rs_still_missing_file),
                            "sort -o {0} {0}".format(rs_still_missing_file))
    run_command_with_output("Sorting attributed RS ID file {0}".format(attributed_rs_ids_file),
                            "sort -o {0} {0}".format(attributed_rs_ids_file))
    fd, temp_residual_file = tempfile.mkstemp(dir=os.path.dirname(rs_still_missing_file))
    os.close(fd)
    file_diff(rs_still_missing_file, attributed_rs_ids_file, FileDiffOption.NOT_IN, output_file_path=temp_residual_file)
    shutil.move(temp_residual_file, rs_still_missing_file)
    return rs_still_missing_file
    def run_nextflow(self, workflow_name, params, resume):
        """
        Runs a Nextflow workflow using the provided parameters.
        This will create a Nextflow work directory and delete it if the process completes successfully.
        If the process fails, the work directory is preserved and the process can be resumed.
        """
        work_dir = None
        if resume:
            work_dir = self.eload_cfg.query(self.config_section, workflow_name,
                                            'nextflow_dir')
            if work_dir == self.nextflow_complete_value:
                self.info(
                    f'Nextflow {workflow_name} pipeline already completed, skipping.'
                )
                return
            if not work_dir or not os.path.exists(work_dir):
                self.warning(
                    f'Work directory for {workflow_name} not found, will start from scratch.'
                )
                work_dir = None
        if not resume or not work_dir:
            work_dir = self.create_nextflow_temp_output_directory(
                base=self.project_dir)
            self.eload_cfg.set(self.config_section,
                               workflow_name,
                               'nextflow_dir',
                               value=work_dir)

        params_file = os.path.join(self.project_dir,
                                   f'{workflow_name}_params.yaml')
        with open(params_file, 'w') as open_file:
            yaml.safe_dump(params, open_file)
        nextflow_script = os.path.join(NEXTFLOW_DIR, f'{workflow_name}.nf')

        try:
            command_utils.run_command_with_output(
                f'Nextflow {workflow_name} process', ' '.join(
                    ('export NXF_OPTS="-Xms1g -Xmx8g"; ',
                     cfg['executable']['nextflow'], nextflow_script,
                     '-params-file', params_file, '-work-dir', work_dir,
                     '-resume' if resume else '')))
            shutil.rmtree(work_dir)
            self.eload_cfg.set(self.config_section,
                               str(workflow_name),
                               'nextflow_dir',
                               value=self.nextflow_complete_value)
        except subprocess.CalledProcessError as e:
            error_msg = f'Nextflow {workflow_name} pipeline failed: results might not be complete. '
            error_msg += (
                f"See Nextflow logs in {self.eload_dir}/.nextflow.log or pipeline logs "
                f"in {self.project_dir.joinpath(project_dirs['logs'])} for more details."
            )
            self.error(error_msg)
            raise e
def file_diff(file1_path: str, file2_path: str, diff_option: FileDiffOption,
              output_file_path: str):
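    # Note: comm requires both input files to be lexically sorted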
    if diff_option == FileDiffOption.NOT_IN:
        run_command_with_output(
            "Finding entries in {0} not in {1}".format(file1_path, file2_path),
            "comm -23 {0} {1} > {2}".format(file1_path, file2_path,
                                            output_file_path))
    elif diff_option == FileDiffOption.COMMON:
        run_command_with_output(
            "Finding entries common to {0} and {1}".format(
                file1_path, file2_path),
            "comm -12 {0} {1} > {2}".format(file1_path, file2_path,
                                            output_file_path))