def run_nextflow_copy(self, project_accession=None):
    migrate_params = {
        'eload': self.eload,
        'old_eloads_dir': cfg['noah']['eloads_new_mnt'],
        'new_eloads_dir': cfg['eloads_dir'],
        'old_projects_dir': cfg['noah']['projects_new_mnt'],
        'new_projects_dir': cfg['projects_dir'],
    }
    if project_accession:
        migrate_params['project_accession'] = project_accession
    work_dir = self.create_nextflow_temp_output_directory()
    params_file = os.path.join(self.eload_dir, 'migrate_params.yaml')
    # Use a specific log file so we don't overwrite when we sync
    log_file = os.path.join(self.eload_dir, 'migrate_nextflow.log')
    with open(params_file, 'w') as open_file:
        yaml.safe_dump(migrate_params, open_file)
    nextflow_script = os.path.join(NEXTFLOW_DIR, 'migrate.nf')
    # The work directory is only removed on success; on failure the exception
    # propagates and the work directory is left behind for inspection
    command_utils.run_command_with_output(
        'Nextflow migrate process',
        ' '.join((
            'export NXF_OPTS="-Xms1g -Xmx8g"; ',
            cfg['executable']['nextflow'],
            '-log', log_file,
            'run', nextflow_script,
            '-params-file', params_file,
            '-work-dir', work_dir
        ))
    )
    shutil.rmtree(work_dir)

def get_rs_with_non_nucleotide_letters(missing_rs_ids_file, assembly_accession, mongo_database_handle):
    results_from_sve_file = get_ids_from_mongo_for_category(
        missing_rs_ids_file, assembly_accession, mongo_database_handle,
        aggregate_query_to_use=get_rs_with_non_nucleotide_letters_query_SVE,
        rs_id_attribute_path="rs",
        collections_to_query=[dbsnp_sve_collection_name, sve_collection_name],
        attribution_category="rs_with_non_nucleotide_letters_SVE")
    results_from_svoe_file = get_ids_from_mongo_for_category(
        missing_rs_ids_file, assembly_accession, mongo_database_handle,
        aggregate_query_to_use=get_rs_with_non_nucleotide_letters_query_SVOE,
        rs_id_attribute_path="inactiveObjects.rs",
        collections_to_query=[dbsnp_svoe_collection_name, svoe_collection_name],
        attribution_category="rs_with_non_nucleotide_letters_SVOE")
    final_result_file = results_from_sve_file.replace("_SVE", "")
    run_command_with_output(
        "Concatenate SVE and SVOE results for RS IDs with non-nucleotide letters",
        "(cat {0} {1} | sort | uniq > {2})".format(results_from_sve_file, results_from_svoe_file,
                                                   final_result_file))
    return final_result_file

def export_all_multimap_snps_from_dbsnp_dumps(private_config_xml_file):
    result_file = "all_multimap_snp_ids_from_dbsnp_dumps.txt"
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("production", private_config_xml_file),
                          user="******") as metadata_connection_handle:
        assembly_GCA_accession_map = get_assemblies_with_multimap_snps_for_species(metadata_connection_handle)
        for species_info in get_species_info(metadata_connection_handle):
            species_name = species_info["database_name"]
            logger.info("Processing species {0}...".format(species_name))
            if species_name in assembly_GCA_accession_map:
                with get_db_conn_for_species(species_info) as species_connection_handle:
                    export_query = "select snp_id, assembly from dbsnp_{0}.multimap_snps " \
                                   "where assembly in ({1})" \
                        .format(species_name,
                                ",".join(["'{0}'".format(assembly)
                                          for assembly in assembly_GCA_accession_map[species_name].keys()]))
                    logger.info("Running export query: " + export_query)
                    with open(result_file, 'a') as result_file_handle:
                        for snp_id, assembly in get_result_cursor(species_connection_handle, export_query):
                            result_file_handle.write(
                                "{0},{1}\n".format(snp_id, assembly_GCA_accession_map[species_name][assembly]))
    run_command_with_output("Sorting multimap SNP IDs from dbSNP source dumps...",
                            "sort -u {0} -o {0}".format(result_file))

def run_release_for_assembly(private_config_xml_file, taxonomy_id, assembly_accession,
                             release_species_inventory_table, release_version, species_release_folder,
                             release_jar_path, job_repo_url, memory):
    exit_code = 0
    port_forwarding_process_id = None
    try:
        port_forwarding_process_id, mongo_port = open_mongo_port_to_tempmongo(
            private_config_xml_file, taxonomy_id, release_species_inventory_table, release_version)
        release_properties_file = create_release_properties_file_for_assembly(
            private_config_xml_file, taxonomy_id, assembly_accession, release_species_inventory_table,
            release_version, species_release_folder, job_repo_url)
        release_command = 'java -Xmx{0}g -jar {1} --spring.config.location="{2}" -Dspring.data.mongodb.port={3}' \
            .format(memory, release_jar_path, release_properties_file, mongo_port)
        run_command_with_output("Running release pipeline for assembly: " + assembly_accession, release_command)
    except Exception:
        logger.error("Encountered an error while running release for assembly: " + assembly_accession
                     + "\n" + traceback.format_exc())
        exit_code = -1
    finally:
        # Guard against open_mongo_port_to_tempmongo failing before the forwarding process was started
        if port_forwarding_process_id is not None:
            close_mongo_port_to_tempmongo(port_forwarding_process_id)
        logger.info("Java release pipeline run completed with exit_code: " + str(exit_code))
        sys.exit(exit_code)

def check_submitted_variant_flanks(mongo_client, ssid):
    samtools = cfg.query('executable', 'samtools', ret_default='samtools')
    sve_collection = mongo_client['eva_accession_sharded']['dbsnpSubmittedVariantEntity']
    cursor = sve_collection.find({'accession': int(ssid), 'remappedFrom': {'$exists': False}})
    flank_size = 50
    variant_records = list(cursor)
    id_2_info = {}
    for variant_rec in variant_records:
        flank_up_coord = f"{variant_rec['contig']}:{variant_rec['start'] - flank_size}-{variant_rec['start'] - 1}"
        flank_down_coord = f"{variant_rec['contig']}:{variant_rec['start'] + 1}-{variant_rec['start'] + flank_size}"
        genome_assembly_fasta = get_genome(assembly_accession=variant_rec['seq'], taxonomy=variant_rec['tax'])
        # tr -d '\n' joins sequence lines wrapped by samtools faidx into a single string
        command = f"{samtools} faidx {genome_assembly_fasta} {flank_up_coord} | grep -v '^>' | tr -d '\\n'"
        flank_up = run_command_with_output(f'Extract upstream sequence using {flank_up_coord}', command,
                                           return_process_output=True).strip().upper()
        command = f"{samtools} faidx {genome_assembly_fasta} {flank_down_coord} | grep -v '^>' | tr -d '\\n'"
        flank_down = run_command_with_output(f'Extract downstream sequence using {flank_down_coord}', command,
                                             return_process_output=True).strip().upper()
        id_2_info[variant_rec['_id']] = {'variant_rec': variant_rec, 'flank_up': flank_up, 'flank_down': flank_down}
    # Compare the flanks of every pair of submitted variants sharing this SS ID
    for variant_id1, variant_id2 in itertools.combinations(id_2_info, 2):
        alignment, strand = compare_variant_flanks(
            id_2_info[variant_id1]['flank_up'] + id_2_info[variant_id1]['variant_rec']['ref']
            + id_2_info[variant_id1]['flank_down'],
            id_2_info[variant_id2]['flank_up'] + id_2_info[variant_id2]['variant_rec']['ref']
            + id_2_info[variant_id2]['flank_down']
        )
        output = format_output(
            ssid, id_2_info[variant_id1]['variant_rec'], id_2_info[variant_id2]['variant_rec'], alignment, strand,
            id_2_info[variant_id1]['flank_up'], id_2_info[variant_id1]['flank_down'],
            id_2_info[variant_id2]['flank_up'], id_2_info[variant_id2]['flank_down']
        )
        print(output)

def get_multimap_snps_from_mongo(private_config_xml_file, collection_to_validate):
    # Dirty hack: since mongoexport does not allow switching databases,
    # replace admin in the URI with the database name and relegate admin to authSource
    # ex: mongodb://host:27017/admin -> mongodb://host:27017/eva_accession_sharded?authSource=admin
    production_mongo_uri = get_mongo_uri_for_eva_profile("production", private_config_xml_file) \
        .replace("/admin", "/eva_accession_sharded?authSource=admin")
    output_file = collection_to_validate + "_multimap_snp_ids.txt"
    accession_attribute = collection_attribute_paths[collection_to_validate][
        "rs_accession_attribute_name"].replace("inactiveObjects.", "inactiveObjects.0.")
    assembly_attribute = collection_attribute_paths[collection_to_validate][
        "assembly_attribute_name"].replace("inactiveObjects.", "inactiveObjects.0.")
    export_command = 'mongoexport --uri "{0}" --collection {1} --type=csv --fields \'{2},{3}\' ' \
                     '--query \'{{"{4}": {{$exists: true}}}}\' --noHeaderLine --out {5}' \
        .format(production_mongo_uri, collection_to_validate, accession_attribute, assembly_attribute,
                collection_attribute_paths[collection_to_validate]["mapping_weight_attribute_path"]
                .replace("$.", ""),
                output_file)
    # mongoexport reports progress on stderr, so log the error stream to output
    run_command_with_output("Export multimap SNP IDs in collection: " + collection_to_validate,
                            export_command, log_error_stream_to_output=True)
    run_command_with_output("Sorting multimap SNP IDs from collection: " + collection_to_validate,
                            "sort -u {0} -o {0}".format(output_file))
    return output_file

def _run_validation_workflow(self):
    output_dir = self.create_nextflow_temp_output_directory()
    validation_config = {
        'metadata_file': self.eload_cfg.query('submission', 'metadata_spreadsheet'),
        'vcf_files': self.eload_cfg.query('submission', 'vcf_files'),
        'reference_fasta': self.eload_cfg.query('submission', 'assembly_fasta'),
        'reference_report': self.eload_cfg.query('submission', 'assembly_report'),
        'output_dir': output_dir,
        'executable': cfg['executable']
    }
    # run the validation
    validation_config_file = os.path.join(self.eload_dir, 'validation_config_file.yaml')
    with open(validation_config_file, 'w') as open_file:
        yaml.safe_dump(validation_config, open_file)
    validation_script = os.path.join(ROOT_DIR, 'nextflow', 'validation.nf')
    try:
        command_utils.run_command_with_output(
            'Nextflow Validation process',
            ' '.join((
                'export NXF_OPTS="-Xms1g -Xmx8g"; ',
                cfg['executable']['nextflow'], validation_script,
                '-params-file', validation_config_file,
                '-work-dir', output_dir
            ))
        )
    except subprocess.CalledProcessError:
        self.error('Nextflow pipeline failed: results might not be complete')
    return output_dir

def hardlink_to_previous_release_assembly_files_in_ftp(current_release_assembly_info, release_properties):
    assembly_accession = current_release_assembly_info["assembly_accession"]
    public_current_release_assembly_folder = \
        get_folder_path_for_assembly(release_properties.public_ftp_current_release_folder, assembly_accession)
    public_previous_release_assembly_folder = \
        get_folder_path_for_assembly(release_properties.public_ftp_previous_release_folder, assembly_accession)
    if os.path.exists(public_previous_release_assembly_folder):
        recreate_public_release_assembly_folder(assembly_accession, public_current_release_assembly_folder)
        for filename in get_release_file_list_for_assembly(current_release_assembly_info) + ["md5checksums.txt"]:
            file_to_hardlink = "{0}/{1}".format(public_previous_release_assembly_folder, filename)
            if os.path.exists(file_to_hardlink):
                run_command_with_output(
                    "Creating hardlink from previous release assembly folder {0} "
                    "to current release assembly folder {1}".format(public_previous_release_assembly_folder,
                                                                    public_current_release_assembly_folder),
                    'ln -f {0} {1}'.format(file_to_hardlink, public_current_release_assembly_folder))
    else:
        raise Exception("Previous release folder {0} does not exist for assembly!"
                        .format(public_previous_release_assembly_folder))

def publish_assembly_release_files_to_ftp(current_release_assembly_info, release_properties):
    assembly_accession = current_release_assembly_info["assembly_accession"]
    public_release_assembly_folder = \
        get_folder_path_for_assembly(release_properties.public_ftp_current_release_folder, assembly_accession)
    # If a species was processed during this release, copy current release data to FTP
    if current_release_assembly_info["should_be_released"] and \
            current_release_assembly_info["num_rs_to_release"] > 0:
        copy_current_assembly_data_to_ftp(current_release_assembly_info, release_properties,
                                          public_release_assembly_folder)
    else:
        # Since the assembly data is unchanged from the last release,
        # hard-link instead of symlink to older release data
        # so that deleting data in older releases does not impact the newer releases
        # (hard-linking preserves the underlying data for a link until all links to that data are deleted)
        hardlink_to_previous_release_assembly_files_in_ftp(current_release_assembly_info, release_properties)
    # Symlink to release README_general_info file - See layout in the link below:
    # https://docs.google.com/presentation/d/1cishRa6P6beIBTP8l1SgJfz71vQcCm5XLmSA8Hmf8rw/edit#slide=id.g63fd5cd489_0_0
    run_command_with_output(
        "Symlinking to release level {0} and {1} files for assembly {2}"
        .format(readme_general_info_file, readme_known_issues_file, assembly_accession),
        'bash -c "cd {1} && ln -sfT {0}/{2} {1}/{2} && ln -sfT {0}/{3} {1}/{3}"'
        .format(os.path.relpath(release_properties.public_ftp_current_release_folder,
                                public_release_assembly_folder),
                public_release_assembly_folder, readme_general_info_file, readme_known_issues_file))
    # Create a link from species folder ex: by_species/ovis_aries to point to this assembly folder
    create_symlink_to_assembly_folder_from_species_folder(current_release_assembly_info, release_properties,
                                                          public_release_assembly_folder)

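# Minimal standalone sketch (not part of the release pipeline) illustrating why the
# hard-link approach above is safe: after the original file is deleted, a hard link
# still resolves to the same data, whereas a symlink would dangle. All paths below
# are throwaway examples; POSIX link semantics are assumed.
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    original = os.path.join(tmp_dir, "release_1.txt")
    hard_link = os.path.join(tmp_dir, "release_2.txt")
    sym_link = os.path.join(tmp_dir, "release_2_sym.txt")
    with open(original, "w") as f:
        f.write("rs12345\n")
    os.link(original, hard_link)     # equivalent of 'ln -f'
    os.symlink(original, sym_link)   # equivalent of 'ln -s'
    os.remove(original)              # simulate deleting the older release copy
    assert open(hard_link).read() == "rs12345\n"  # data preserved by the hard link
    assert not os.path.exists(sym_link)           # the symlink now dangles
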
def copy_current_assembly_data_to_ftp(current_release_assembly_info, release_properties,
                                      public_release_assembly_folder):
    assembly_accession = current_release_assembly_info["assembly_accession"]
    species_release_folder_name = current_release_assembly_info["release_folder_name"]
    md5sum_output_file = os.path.join(public_release_assembly_folder, "md5checksums.txt")
    run_command_with_output(
        "Removing md5 checksum file {0} for assembly if it exists...".format(md5sum_output_file),
        "rm -f " + md5sum_output_file)
    recreate_public_release_assembly_folder(assembly_accession, public_release_assembly_folder)
    for filename in get_release_file_list_for_assembly(current_release_assembly_info):
        source_file_path = os.path.join(release_properties.staging_release_folder, species_release_folder_name,
                                        assembly_accession, filename)
        run_command_with_output("Copying {0} to {1}...".format(filename, public_release_assembly_folder),
                                "cp {0} {1}".format(source_file_path, public_release_assembly_folder))
        if filename.endswith(release_file_types_to_be_checksummed):
            md5sum_output = run_command_with_output(
                "Checksumming file {0}...".format(filename),
                "(md5sum {0} | awk '{{ print $1 }}')".format(source_file_path),
                return_process_output=True)
            with open(md5sum_output_file, "a") as md5sum_file_handle:
                md5sum_file_handle.write(md5sum_output.strip() + "\t"
                                         + os.path.basename(source_file_path) + "\n")

def count_rs_ids_in_release_files(count_ids_script_path, assembly_accession, species_release_folder):
    release_count_filename = os.path.join(species_release_folder, assembly_accession, "README_rs_ids_counts.txt")
    with open(release_count_filename, "w") as release_count_file_handle:
        release_count_file_handle.write("# Unique RS ID counts\n")
        for vcf_file_category in release_vcf_file_categories:
            release_vcf_file_name = get_release_vcf_file_name(species_release_folder, assembly_accession,
                                                              vcf_file_category)
            num_ids_in_file = run_command_with_output(
                "Counting RS IDs in file: " + release_vcf_file_name,
                "{0} {1}.gz".format(count_ids_script_path, release_vcf_file_name),
                return_process_output=True)
            release_count_file_handle.write(num_ids_in_file)
        for text_release_file_category in release_text_file_categories:
            text_release_file_name = get_release_text_file_name(species_release_folder, assembly_accession,
                                                                text_release_file_category)
            # The text release files are already sorted (see sort_bgzip_tabix_release_files),
            # so uniq without a preceding sort is sufficient here
            num_ids_in_file = run_command_with_output(
                "Counting RS IDs in file: " + text_release_file_name,
                "zcat {0}.gz | cut -f1 | uniq | wc -l".format(text_release_file_name),
                return_process_output=True)
            release_count_file_handle.write("{0}.gz\t{1}".format(os.path.basename(text_release_file_name),
                                                                 str(num_ids_in_file)))

def cluster_one(source, vcf_file, project_accession, assembly_accession, private_config_xml_file, profile,
                output_directory, clustering_artifact, only_printing, memory, dependency):
    properties_path = create_properties_file(source, vcf_file, project_accession, assembly_accession,
                                             private_config_xml_file, profile, output_directory)
    command = generate_bsub_command(assembly_accession, properties_path, clustering_artifact, memory, dependency)
    if not only_printing:
        run_command_with_output('Run clustering command', command, return_process_output=True)

def migrate_artifacts(python_path, cloudsmith_path, artifact_source_dir):
    # Only consider directories with actual artifacts in them i.e., directories with version number names
    artifact_dirname_pattern = re.compile(r'[0-9]+\.[0-9]+.*')
    for dir_path, _, file_names in os.walk(artifact_source_dir):
        if artifact_dirname_pattern.match(os.path.basename(dir_path)):
            # Snapshot JARs and POMs are named in a sorted fashion but we only need the latest snapshot
            jar_file_list, pom_file_list = glob.glob(dir_path + "/*.jar"), glob.glob(dir_path + "/*.pom")
            if len(pom_file_list) > 0:
                pom_file_to_upload = sorted(pom_file_list)[-1]
                # If a JAR is available for an artifact, upload the JAR with the POM as reference
                # (ex: component libraries like accession-commons-mongodb)
                if len(jar_file_list) > 0:
                    jar_file_to_upload = sorted(jar_file_list)[-1]
                    try:
                        run_command_with_output(
                            "Migrating files {0} and {1}...".format(jar_file_to_upload, pom_file_to_upload),
                            "{0} {1} push maven ebivariation/packages {2} --pom-file={3}"
                            .format(python_path, cloudsmith_path, jar_file_to_upload, pom_file_to_upload))
                    except subprocess.CalledProcessError as ex:
                        logger.error(ex)
                # If only a POM is available, upload just the POM file
                # (ex: top-level libraries like accession-commons)
                else:
                    try:
                        run_command_with_output(
                            "Migrating file {0}...".format(pom_file_to_upload),
                            "{0} {1} push maven ebivariation/packages {2}"
                            .format(python_path, cloudsmith_path, pom_file_to_upload))
                    except subprocess.CalledProcessError as ex:
                        logger.error(ex)

def _run_brokering_prep_workflow(self):
    output_dir = self.create_nextflow_temp_output_directory()
    brokering_config = {
        'vcf_files': self._get_valid_vcf_files(),
        'output_dir': output_dir,
        'executable': cfg['executable']
    }
    # run the brokering preparation
    brokering_config_file = os.path.join(self.eload_dir, 'brokering_config_file.yaml')
    with open(brokering_config_file, 'w') as open_file:
        yaml.safe_dump(brokering_config, open_file)
    brokering_script = os.path.join(NEXTFLOW_DIR, 'prepare_brokering.nf')
    try:
        command_utils.run_command_with_output(
            'Nextflow brokering preparation process',
            ' '.join((
                cfg['executable']['nextflow'], brokering_script,
                '-params-file', brokering_config_file,
                '-work-dir', output_dir
            ))
        )
    except subprocess.CalledProcessError as e:
        self.error('Nextflow pipeline failed: aborting brokering')
        raise e
    return output_dir

def sort_bgzip_tabix_release_files(bgzip_path, tabix_path, vcf_sort_script_path, assembly_accession,
                                   species_release_folder):
    commands = []
    # These files are left behind by the sort_vcf_sorted_chromosomes.sh script.
    # To be idempotent, remove such files
    commands.append("rm -f {0}/{1}/*.chromosomes".format(species_release_folder, assembly_accession))
    for vcf_file_category in release_vcf_file_categories:
        unsorted_release_file_name = get_unsorted_release_vcf_file_name(species_release_folder, assembly_accession,
                                                                        vcf_file_category)
        sorted_release_file_name = get_release_vcf_file_name(species_release_folder, assembly_accession,
                                                             vcf_file_category)
        commands.append("rm -f {2} && {0} -f {1} {2}".format(vcf_sort_script_path, unsorted_release_file_name,
                                                             sorted_release_file_name))
        commands.extend(get_bgzip_tabix_commands_for_file(bgzip_path, tabix_path, sorted_release_file_name))
    for text_release_file_category in release_text_file_categories:
        unsorted_release_file_name = get_unsorted_release_text_file_name(species_release_folder, assembly_accession,
                                                                         text_release_file_category)
        sorted_release_file_name = get_release_text_file_name(species_release_folder, assembly_accession,
                                                              text_release_file_category)
        commands.append("(sort -V {0} | uniq > {1})".format(unsorted_release_file_name, sorted_release_file_name))
        commands.append("(gzip < {0} > {0}.gz)".format(sorted_release_file_name))
    command = " && ".join(commands)
    run_command_with_output("Sort, bgzip and tabix release files for assembly: " + assembly_accession, command)

def download_assembly_fasta(self, overwrite=False):
    if not os.path.isfile(self.assembly_fasta_path) or overwrite:
        self._download_file(self.assembly_compressed_fasta_path, self.assembly_fasta_url)
        run_command_with_output('Uncompress {}'.format(self.assembly_compressed_fasta_path),
                                'gunzip -f {}'.format(self.assembly_compressed_fasta_path))

def merge_dbsnp_eva_vcf_headers(file1, file2, output_file):
    import tempfile
    run_command_with_output("Removing output file {0} if it already exists...".format(output_file),
                            "rm -f " + output_file)
    working_folder = os.path.dirname(file1)
    # Write content for each meta-info category in the header to a specific temp file.
    # Note: this assumes the headers only contain the categories below;
    # any other ##-prefixed category would raise a KeyError.
    metainfo_category_tempfile_map = collections.OrderedDict(
        [("fileformat", None), ("info", None), ("contig", None), ("reference", None)])
    for category in metainfo_category_tempfile_map.keys():
        metainfo_category_tempfile_map[category] = open(tempfile.mktemp(prefix=category, dir=working_folder), "a+")
    with open(file1) as file1_handle, open(file2) as file2_handle:
        for file_handle in [file1_handle, file2_handle]:
            for line in file_handle:
                if line.startswith("##"):
                    metainfo_category = line.split("=")[0].split("##")[-1].lower()
                    metainfo_category_tempfile_map[metainfo_category].write(line)
                else:
                    break
    for metainfo_category, tempfile_handle in metainfo_category_tempfile_map.items():
        tempfile_handle.flush()
        # Sorting needs to happen by ID field for the headers
        # ex: ##contig=<ID=1,accession="CM000994.2">
        run_command_with_output("Merging header section ##{0} ...".format(metainfo_category),
                                "sort -t ',' -k1 -V {0} | uniq >> {1}".format(tempfile_handle.name, output_file))
        tempfile_handle.close()
        os.remove(tempfile_handle.name)

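# Hedged, standalone illustration (not from the pipeline) of the merge behaviour above:
# ##-lines from both headers are grouped per category, de-duplicated and sorted, with
# categories emitted in a fixed order. Plain sorted() stands in for the shell's
# version sort (`sort -V`); the header lines are made-up examples.
dbsnp_header = ['##fileformat=VCFv4.2\n', '##contig=<ID=2,accession="CM000995.2">\n']
eva_header = ['##fileformat=VCFv4.2\n', '##contig=<ID=1,accession="CM000994.2">\n']
category_order = ["fileformat", "info", "contig", "reference"]
merged = []
for category in category_order:
    merged.extend(sorted({line for line in dbsnp_header + eva_header
                          if line.startswith("##" + category)}))
# merged == ['##fileformat=VCFv4.2\n',
#            '##contig=<ID=1,accession="CM000994.2">\n',
#            '##contig=<ID=2,accession="CM000995.2">\n']
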
def _run_validation_workflow(self):
    output_dir = self.create_nextflow_temp_output_directory()
    vcf_files_mapping_csv = self._generate_csv_mappings()
    validation_config = {
        'vcf_files_mapping': vcf_files_mapping_csv,
        'output_dir': output_dir,
        'executable': cfg['executable']
    }
    # run the validation
    validation_config_file = os.path.join(self.eload_dir, 'validation_config_file.yaml')
    with open(validation_config_file, 'w') as open_file:
        yaml.safe_dump(validation_config, open_file)
    validation_script = os.path.join(NEXTFLOW_DIR, 'validation.nf')
    try:
        command_utils.run_command_with_output(
            'Nextflow Validation process',
            ' '.join((
                'export NXF_OPTS="-Xms1g -Xmx8g"; ',
                cfg['executable']['nextflow'], validation_script,
                '-params-file', validation_config_file,
                '-work-dir', output_dir
            ))
        )
    except subprocess.CalledProcessError:
        self.error('Nextflow pipeline failed: results might not be complete')
    return output_dir

def merge_dbsnp_eva_release_files(private_config_xml_file, profile, bgzip_path, bcftools_path, vcf_sort_script_path,
                                  taxonomy_id, assembly_accession, release_species_inventory_table, release_version,
                                  species_release_folder):
    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile(profile, private_config_xml_file),
                          user="******") as metadata_connection_handle:
        release_info = get_release_inventory_info_for_assembly(taxonomy_id, assembly_accession,
                                                               release_species_inventory_table, release_version,
                                                               metadata_connection_handle)
        merge_commands = []
        for vcf_file_category in release_vcf_file_categories:
            merge_commands.extend(merge_dbsnp_eva_vcf_files(bgzip_path, bcftools_path, vcf_sort_script_path,
                                                            assembly_accession, species_release_folder,
                                                            vcf_file_category, release_info["sources"]))
        for text_release_file_category in release_text_file_categories:
            merge_commands.extend(merge_dbsnp_eva_text_files(assembly_accession, species_release_folder,
                                                             text_release_file_category, release_info["sources"]))
        final_merge_command = " && ".join(merge_commands)
        run_command_with_output("Merging dbSNP and EVA release files for assembly: " + assembly_accession,
                                final_merge_command)

def load_from_ena(self):
    """
    Loads project metadata from ENA into EVADEV.
    """
    try:
        command_utils.run_command_with_output(
            'Load metadata from ENA to EVADEV',
            ' '.join((
                'perl', cfg['executable']['load_from_ena'],
                '-p', self.project_accession,
                # Current submission process never changes -c or -v
                '-c', 'submitted',
                '-v', '1',
                # -l is only checked for when -c=eva_value_added, so in reality never used
                '-l', self._get_dir('scratch'),
                '-e', str(self.eload_num)
            ))
        )
        self.eload_cfg.set(self.config_section, 'ena_load', value='success')
    except subprocess.CalledProcessError as e:
        self.error('ENA metadata load failed: aborting ingestion.')
        self.eload_cfg.set(self.config_section, 'ena_load', value='failure')
        raise e

def run_variant_load_workflow(self, vep_version, vep_cache_version, skip_annotation, vcf_files_to_ingest):
    output_dir = self.create_nextflow_temp_output_directory(base=self.project_dir)
    job_props = variant_load_props_template(
        project_accession=self.project_accession,
        aggregation=self.eload_cfg.query(self.config_section, 'aggregation'),
        study_name=self.get_study_name(),
        output_dir=self.project_dir.joinpath(project_dirs['transformed']),
        annotation_dir=self.project_dir.joinpath(project_dirs['annotation']),
        stats_dir=self.project_dir.joinpath(project_dirs['stats']),
        vep_species=self.get_vep_species(),
        vep_version=vep_version,
        vep_cache_version=vep_cache_version,
        annotation_skip=skip_annotation
    )
    # If annotation is wanted but no VEP version was supplied, try to reuse the
    # versions previously recorded in the variant warehouse for this database
    if skip_annotation is False and vep_version is None:
        coll_name = job_props['db.collections.annotations.name']
        vep = get_vep_and_vep_cache_version_from_db(
            self.mongo_uri,
            self.eload_cfg.query(self.config_section, 'database', 'db_name'),
            coll_name)
        vep_version = vep['vep_version']
        vep_cache_version = vep['vep_cache_version']
        if not vep_version or not vep_cache_version:
            raise Exception('No vep_version and vep_cache_version provided by user and none could be found in DB. '
                            'In case you want to process without annotation, please use --skip_annotation parameter.')
    self.eload_cfg.set(self.config_section, 'variant_load', 'vep', 'version', value=vep_version)
    self.eload_cfg.set(self.config_section, 'variant_load', 'vep', 'cache_version', value=vep_cache_version)
    job_props.update({
        'app.vep.version': vep_version,
        'app.vep.cache.version': vep_cache_version,
        'annotation.skip': vep_cache_version is None
    })
    load_config = {
        'valid_vcfs': vcf_files_to_ingest,
        'aggregation_type': self.eload_cfg.query(self.config_section, 'aggregation'),
        'load_job_props': job_props,
        'project_accession': self.project_accession,
        'project_dir': str(self.project_dir),
        'logs_dir': os.path.join(self.project_dir, project_dirs['logs']),
        'eva_pipeline_props': cfg['eva_pipeline_props'],
        'executable': cfg['executable'],
        'jar': cfg['jar'],
    }
    load_config_file = os.path.join(self.project_dir, 'load_config_file.yaml')
    with open(load_config_file, 'w') as open_file:
        yaml.safe_dump(load_config, open_file)
    variant_load_script = os.path.join(NEXTFLOW_DIR, 'variant_load.nf')
    try:
        command_utils.run_command_with_output(
            'Nextflow Variant Load process',
            ' '.join((
                'export NXF_OPTS="-Xms1g -Xmx8g"; ',
                cfg['executable']['nextflow'], variant_load_script,
                '-params-file', load_config_file,
                '-work-dir', output_dir
            ))
        )
    except subprocess.CalledProcessError as e:
        self.error('Nextflow variant load pipeline failed: results might not be complete')
        self.error(f"See Nextflow logs in {self.eload_dir}/.nextflow.log or pipeline logs "
                   f"in {self.project_dir.joinpath(project_dirs['logs'])} for more details.")
        raise e
    return output_dir

def run_count_script(script_name, species_dir, metric_id):
    log_file = f'{os.path.basename(species_dir)}_count_{metric_id}_rsid.log'
    # Skip the count if a log file from a previous run already exists
    if not os.path.exists(log_file):
        run_command_with_output(
            f'Run {script_name}',
            f'{os.path.join(shell_script_dir, script_name)} {species_dir} {metric_id}'
        )
    return log_file

def recreate_public_release_assembly_folder(assembly_accession, public_release_assembly_folder):
    run_command_with_output("Removing release folder if it exists for {0}...".format(assembly_accession),
                            "rm -rf " + public_release_assembly_folder)
    run_command_with_output("Creating release folder for {0}...".format(assembly_accession),
                            "mkdir -p " + public_release_assembly_folder)

def run_variant_load_workflow(self):
    output_dir = self.create_nextflow_temp_output_directory(base=self.project_dir)
    job_props = variant_load_props_template(
        project_accession=self.project_accession,
        analysis_accession=self.eload_cfg.query('brokering', 'ena', 'ANALYSIS'),
        aggregation=self.eload_cfg.query(self.config_section, 'aggregation'),
        study_name=self.get_study_name(),
        fasta=self.eload_cfg.query('submission', 'assembly_fasta'),
        output_dir=self.project_dir.joinpath(project_dirs['transformed']),
        annotation_dir=self.project_dir.joinpath(project_dirs['annotation']),
        stats_dir=self.project_dir.joinpath(project_dirs['stats']),
        db_name=self.eload_cfg.query(self.config_section, 'database', 'db_name'),
        vep_species=self.get_vep_species(),
        vep_version=self.eload_cfg.query(self.config_section, 'variant_load', 'vep', 'version'),
        vep_cache_version=self.eload_cfg.query(self.config_section, 'variant_load', 'vep', 'cache_version')
    )
    load_config = {
        'valid_vcfs': [str(f) for f in self.valid_vcf_filenames],
        # TODO implement proper merge check or get from validation
        'needs_merge': self.needs_merge,
        'load_job_props': job_props,
        'project_accession': self.project_accession,
        'project_dir': str(self.project_dir),
        'logs_dir': os.path.join(self.project_dir, project_dirs['logs']),
        'eva_pipeline_props': cfg['eva_pipeline_props'],
        'executable': cfg['executable'],
        'jar': cfg['jar'],
    }
    load_config_file = os.path.join(self.project_dir, 'load_config_file.yaml')
    with open(load_config_file, 'w') as open_file:
        yaml.safe_dump(load_config, open_file)
    variant_load_script = os.path.join(ROOT_DIR, 'nextflow', 'variant_load.nf')
    try:
        command_utils.run_command_with_output(
            'Nextflow Variant Load process',
            ' '.join((
                'export NXF_OPTS="-Xms1g -Xmx8g"; ',
                cfg['executable']['nextflow'], variant_load_script,
                '-params-file', load_config_file,
                '-work-dir', output_dir
            ))
        )
    except subprocess.CalledProcessError as e:
        self.error('Nextflow variant load pipeline failed: results might not be complete')
        self.error(f"See Nextflow logs in {self.eload_dir}/.nextflow.log or pipeline logs "
                   f"in {self.project_dir.joinpath(project_dirs['logs'])} for more details.")
        raise e
    return output_dir

def setUp(self) -> None:
    self.test_mongo_db = MongoDatabase(uri=self.uri, db_name=self.db_name)
    self.dump_dir = os.path.join(self.resources_folder, self.db_name)
    run_command_with_output("Drop target test database if it already exists...",
                            f"mongo {self.db_name} --eval 'db.dropDatabase()'")
    run_command_with_output("Import test database...", f"mongorestore --dir {self.dump_dir}")

def create_requisite_folders(release_properties):
    run_command_with_output("Creating by_species folder for the current release...",
                            "mkdir -p " + os.path.join(release_properties.public_ftp_current_release_folder,
                                                       by_species_folder_name))
    run_command_with_output("Creating by_assembly folder for the current release...",
                            "mkdir -p " + os.path.join(release_properties.public_ftp_current_release_folder,
                                                       by_assembly_folder_name))

def create_species_folder(release_properties, species_current_release_folder_name):
    species_current_release_folder_path = \
        get_folder_path_for_species(release_properties.public_ftp_current_release_folder,
                                    species_current_release_folder_name)
    run_command_with_output("Creating species release folder {0}...".format(species_current_release_folder_path),
                            "rm -rf {0} && mkdir {0}".format(species_current_release_folder_path))

def get_residual_missing_rs_ids_file(rs_still_missing_file, attributed_rs_ids_file):
    import tempfile
    # comm (used by file_diff below) requires lexicographically sorted inputs
    run_command_with_output("Sorting residual file {0}".format(rs_still_missing_file),
                            "sort -o {0} {0}".format(rs_still_missing_file))
    run_command_with_output("Sorting attributed RS ID file {0}".format(attributed_rs_ids_file),
                            "sort -o {0} {0}".format(attributed_rs_ids_file))
    _, temp_residual_file = tempfile.mkstemp(dir=os.path.dirname(rs_still_missing_file))
    file_diff(rs_still_missing_file, attributed_rs_ids_file, FileDiffOption.NOT_IN,
              output_file_path=temp_residual_file)
    shutil.move(temp_residual_file, rs_still_missing_file)
    return rs_still_missing_file

def run_nextflow(self, workflow_name, params, resume):
    """
    Runs a Nextflow workflow using the provided parameters.
    This will create a Nextflow work directory and delete it if the process completes successfully.
    If the process fails, the work directory is preserved and the process can be resumed.
    """
    work_dir = None
    if resume:
        work_dir = self.eload_cfg.query(self.config_section, workflow_name, 'nextflow_dir')
        if work_dir == self.nextflow_complete_value:
            self.info(f'Nextflow {workflow_name} pipeline already completed, skipping.')
            return
        if not work_dir or not os.path.exists(work_dir):
            self.warning(f'Work directory for {workflow_name} not found, will start from scratch.')
            work_dir = None
    if not resume or not work_dir:
        work_dir = self.create_nextflow_temp_output_directory(base=self.project_dir)
        self.eload_cfg.set(self.config_section, workflow_name, 'nextflow_dir', value=work_dir)
    params_file = os.path.join(self.project_dir, f'{workflow_name}_params.yaml')
    with open(params_file, 'w') as open_file:
        yaml.safe_dump(params, open_file)
    nextflow_script = os.path.join(NEXTFLOW_DIR, f'{workflow_name}.nf')
    try:
        command_utils.run_command_with_output(
            f'Nextflow {workflow_name} process',
            ' '.join((
                'export NXF_OPTS="-Xms1g -Xmx8g"; ',
                cfg['executable']['nextflow'], nextflow_script,
                '-params-file', params_file,
                '-work-dir', work_dir,
                '-resume' if resume else ''
            ))
        )
        shutil.rmtree(work_dir)
        self.eload_cfg.set(self.config_section, workflow_name, 'nextflow_dir',
                           value=self.nextflow_complete_value)
    except subprocess.CalledProcessError as e:
        error_msg = f'Nextflow {workflow_name} pipeline failed: results might not be complete. '
        error_msg += (f"See Nextflow logs in {self.eload_dir}/.nextflow.log or pipeline logs "
                      f"in {self.project_dir.joinpath(project_dirs['logs'])} for more details.")
        self.error(error_msg)
        raise e

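# Hedged usage sketch for run_nextflow: the workflow name must match a script under
# NEXTFLOW_DIR (e.g. 'variant_load' -> variant_load.nf) and params must be
# YAML-serialisable. The params values and accession below are hypothetical.
#
#     self.run_nextflow('variant_load',
#                       params={'valid_vcfs': ['/path/to/file.vcf.gz'],
#                               'project_accession': 'PRJEB00000'},
#                       resume=True)  # reuses the recorded work_dir after a failed run
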
def file_diff(file1_path: str, file2_path: str, diff_option: FileDiffOption, output_file_path: str):
    # comm requires both input files to be lexicographically sorted (callers sort beforehand)
    if diff_option == FileDiffOption.NOT_IN:
        run_command_with_output("Finding entries in {0} not in {1}".format(file1_path, file2_path),
                                "comm -23 {0} {1} > {2}".format(file1_path, file2_path, output_file_path))
    elif diff_option == FileDiffOption.COMMON:
        run_command_with_output("Finding entries common to {0} and {1}".format(file1_path, file2_path),
                                "comm -12 {0} {1} > {2}".format(file1_path, file2_path, output_file_path))

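# Hedged usage sketch for file_diff, runnable in the context of this module.
# The file names and contents are throwaway examples; note the inputs are
# written pre-sorted, as comm requires.
with open('set_a.txt', 'w') as f:
    f.write('rs1\nrs2\nrs3\n')
with open('set_b.txt', 'w') as f:
    f.write('rs2\n')
file_diff('set_a.txt', 'set_b.txt', FileDiffOption.NOT_IN, output_file_path='a_not_in_b.txt')
# a_not_in_b.txt now contains rs1 and rs3
file_diff('set_a.txt', 'set_b.txt', FileDiffOption.COMMON, output_file_path='common.txt')
# common.txt now contains rs2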