def test_do_links(self):
    """Check that do_hardlink and do_symlink expose a source file in a destination dir.

    An empty file is created in a scratch directory, linked into a ``dst``
    sub-directory first with ``do_hardlink`` and then (after removing the
    hard link) with ``do_symlink``; after each step the linked file must
    compare equal to the source according to ``filecmp.cmp``.
    """
    # Local import so this block does not depend on the file-level imports.
    import shutil

    src_tmp_dir = tempfile.mkdtemp()
    try:
        dst_tmp_dir = os.path.join(src_tmp_dir, 'dst')
        safe_makedir(dst_tmp_dir)
        src_file_path = os.path.join(src_tmp_dir, 'file1.txt')
        dst_file_path = os.path.join(dst_tmp_dir, 'file1.txt')
        # Create an empty source file to link against.
        open(src_file_path, 'w').close()

        do_hardlink([src_file_path], dst_tmp_dir)
        assert filecmp.cmp(src_file_path, dst_file_path)

        # Remove the hard link so the symlink can be created at the same path.
        os.remove(dst_file_path)
        do_symlink([src_file_path], dst_tmp_dir)
        assert filecmp.cmp(src_file_path, dst_file_path)
    finally:
        # The scratch tree was previously leaked on every run; always remove
        # it, even when an assertion above fails.
        shutil.rmtree(src_tmp_dir, ignore_errors=True)
def test_do_links(self):
    """Exercise do_hardlink and do_symlink against a single empty file.

    The file is created in a temporary directory and linked into a ``dst``
    sub-directory, once as a hard link and once as a symlink. Each link is
    verified with ``filecmp.cmp`` against the source file.
    """
    # Imported here so the block is self-contained regardless of the
    # module's top-level imports.
    import shutil

    src_tmp_dir = tempfile.mkdtemp()
    try:
        dst_tmp_dir = os.path.join(src_tmp_dir, "dst")
        safe_makedir(dst_tmp_dir)
        src_file_path = os.path.join(src_tmp_dir, "file1.txt")
        dst_file_path = os.path.join(dst_tmp_dir, "file1.txt")
        # An empty file is enough: only the link itself is under test.
        open(src_file_path, "w").close()

        do_hardlink([src_file_path], dst_tmp_dir)
        assert filecmp.cmp(src_file_path, dst_file_path)

        # Clear the destination path before re-linking it as a symlink.
        os.remove(dst_file_path)
        do_symlink([src_file_path], dst_tmp_dir)
        assert filecmp.cmp(src_file_path, dst_file_path)
    finally:
        # Clean up the temporary tree whether or not the assertions passed;
        # without this every test run leaves a directory behind.
        shutil.rmtree(src_tmp_dir, ignore_errors=True)
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None,
                                       restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None, config_file_path=None):
    """Sort the fastq files of a demultiplexed flowcell into the analysis tree.

    The flowcell directory is parsed with ``parse_flowcell`` and, for every
    project/sample/fastq file found, project, sample, libprep and seqrun
    objects are created (or reused) under
    ``<analysis_top_dir>/DATA/<project_id>/<sample>/<libprep>/<flowcell id>``.
    When ``create_files`` is true the directories are created and the fastq
    files are symlinked into the seqrun directories. Calling this repeatedly
    with the same ``projects_to_analyze`` collects samples split across
    multiple flowcells.

    :param str fc_dir: The demultiplexed flowcell directory. It must contain
        one of the configured ``sthlm_root`` / ``upps_root`` path components,
        which is used to locate the analysis top directory.
    :param dict projects_to_analyze: Project objects keyed by project
        directory (may be empty); updated in place.
    :param list restrict_to_projects: Specific projects (names or ids) within
        the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to
        process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing
        flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this
        value if supplied (default None)
    :param bool quiet: Stored in ``config["quiet"]``; when true no error
        e-mails are sent via ``mail_analysis``.
    :param dict config: The parsed configuration file. NOTE(review): defaults
        to None but is indexed unconditionally -- presumably populated by a
        decorator outside this block; confirm. The dict is mutated
        (``config["quiet"]`` is set).
    :param str config_file_path: Not used inside this function.

    :returns: ``projects_to_analyze`` (updated in place), or an empty list if
        the flowcell directory does not exist or cannot be parsed.
    :rtype: dict or list

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If the owning root cannot be derived from ``fc_dir``.
    """
    LOG.info('Setting up analysis for demultiplexed data in source folder "{}"'.format(fc_dir))
    restrict_to_projects = restrict_to_projects or []
    restrict_to_samples = restrict_to_samples or []
    config["quiet"] = quiet  # Hack because I enter here from a script sometimes

    # Everything in fc_dir up to and including the sthlm/upps root component
    # is taken as the root under which the analysis top directory lives.
    # (Raw string: "\/" is not a valid escape in a normal string literal.)
    root_pattern = r"(.+(?:{}|{}))\/.+".format(config["analysis"]["sthlm_root"],
                                               config["analysis"]["upps_root"])
    root_match = re.match(root_pattern, fc_dir)
    if not root_match:
        error_text = "cannot guess which project the flowcell {} belongs to".format(fc_dir)
        LOG.error(error_text)
        raise RuntimeError(error_text)
    flowcell_root = root_match.group(1)

    analysis_top_dir = os.path.abspath(os.path.join(flowcell_root,
                                                    config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        # Deliberately best-effort: the failure is logged and processing
        # continues, as before.
        LOG.error('Error: Analysis top directory {} does not exist and could not '
                  'be created.'.format(analysis_top_dir))
    if not os.path.isabs(fc_dir):
        fc_dir = os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []

    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error('Error when processing flowcell dir "{}": {}'.format(fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning('No projects found in specified flowcell directory "{}"'.format(fc_dir))

    # Loop-invariant values, computed once.
    samplesheet_path = fc_dir_structure.get("samplesheet_path")
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
    lane_pattern = re.compile(r'[\w-]+_L\d{2}(\d)_\w+')

    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning('Could not retrieve project id from Charon (record missing?). '
                        'Using project name ("{}") as project id '
                        '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                project_id not in restrict_to_projects:
            LOG.debug("Skipping project {} (not in restrict_to_projects)".format(project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Name-based symlinks next to the id-based directories.
            if not project_dir == project_sl_dir and \
                    not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
                    not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name, dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj

        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name, ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files:
                safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

            # Materialise the matches: on Python 3 filter() is a lazy iterator
            # that is always truthy, which defeated the emptiness check below.
            fastq_files = list(filter(fastq_pattern.match, sample.get('files', [])))

            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to parse from SampleSheet
                try:
                    if not samplesheet_path:
                        raise ValueError()
                    lane_match = lane_pattern.match(fq_file)
                    if lane_match is None:
                        # Previously an uncaught AttributeError; treat it like
                        # any other samplesheet failure and fall back to Charon.
                        raise ValueError('cannot parse lane number from '
                                         '"{}"'.format(fq_file))
                    lane_num = lane_match.group(1)
                    libprep_name = determine_library_prep_from_samplesheet(samplesheet_path,
                                                                           project_original_name,
                                                                           sample_name,
                                                                           lane_num)
                except (IndexError, ValueError) as e:
                    LOG.debug('Unable to determine library prep from sample sheet file '
                              '("{}"); try to determine from Charon'.format(e))
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(project_id,
                                                                        sample_name,
                                                                        fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        # A missing 'libpreps' key is treated as "no libpreps".
                        libpreps = charon_session.sample_get_libpreps(
                            project_id, sample_name).get('libpreps') or []
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but only one '
                                        'library prep is present in Charon ("{}"). Using '
                                        'this as the library prep.'.format(project_name,
                                                                           sample_name,
                                                                           fc_full_id,
                                                                           fq_file,
                                                                           libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but a fallback '
                                        'libprep value of "{}" was supplied -- using this '
                                        'value.'.format(project_name,
                                                        sample_name,
                                                        fc_full_id,
                                                        fq_file,
                                                        libprep_name))
                        else:
                            error_text = ('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                          'has no libprep information in Charon. Skipping '
                                          'analysis.'.format(project_name, sample_name,
                                                             fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files:
                    safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files:
                    safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)

            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [os.path.join(src_sample_dir, fastq_file)
                                           for fastq_file in seqrun_obj.fastq_files]
                        # Destination derived from the objects being iterated,
                        # including the "DATA" level used for project_dir above.
                        # The previous code linked into the stale loop variable
                        # seqrun_dir left over from the fastq loop.
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info("Symlinking fastq files from {} to {}...".format(
                            src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = ('Could not symlink files for project/sample/'
                                          'libprep/seqrun {}/{}/{}/{}'.format(project_obj,
                                                                              sample_obj,
                                                                              libprep_obj,
                                                                              seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
    return projects_to_analyze
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None,
                                       restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None, config_file_path=None):
    """Sort the fastq files of a demultiplexed flowcell into the analysis tree.

    The flowcell directory is parsed with ``parse_flowcell`` and, for every
    project/sample/fastq file found, project, sample, libprep and seqrun
    objects are created (or reused) under
    ``<analysis_top_dir>/DATA/<project_id>/<sample>/<libprep>/<flowcell id>``.
    When ``create_files`` is true the directories are created and the fastq
    files are symlinked into the seqrun directories. Calling this repeatedly
    with the same ``projects_to_analyze`` collects samples split across
    multiple flowcells.

    :param str fc_dir: The demultiplexed flowcell directory. It must contain
        one of the configured ``sthlm_root`` / ``upps_root`` path components,
        which selects the group directory under ``base_root``.
    :param dict projects_to_analyze: Project objects keyed by project
        directory (may be empty); updated in place.
    :param list restrict_to_projects: Specific projects (names or ids) within
        the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to
        process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing
        flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this
        value if supplied (default None)
    :param bool quiet: Stored in ``config["quiet"]``; when true no error
        e-mails are sent via ``mail_analysis``.
    :param dict config: The parsed configuration file. NOTE(review): defaults
        to None but is indexed unconditionally -- presumably populated by a
        decorator outside this block; confirm. The dict is mutated
        (``config["quiet"]`` is set).
    :param str config_file_path: Not used inside this function.

    :returns: ``projects_to_analyze`` (updated in place), or an empty list if
        the flowcell directory does not exist or cannot be parsed.
    :rtype: dict or list

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If the owning group cannot be derived from ``fc_dir``.
    """
    LOG.info(
        'Setting up analysis for demultiplexed data in source folder "{}"'.
        format(fc_dir))
    restrict_to_projects = restrict_to_projects or []
    restrict_to_samples = restrict_to_samples or []
    config["quiet"] = quiet  # Hack because I enter here from a script sometimes

    # Checks flowcell path to establish which group owns it.
    # (Raw string: "\/" is not a valid escape in a normal string literal.)
    owner_pattern = r".+({}|{})\/.+".format(config["analysis"]["sthlm_root"],
                                            config["analysis"]["upps_root"])
    owner_match = re.match(owner_pattern, fc_dir)
    if not owner_match:
        error_text = (
            "cannot guess which project (sthlm/uppsala) the flowcell {} belongs to"
            .format(fc_dir))
        LOG.error(error_text)
        raise RuntimeError(error_text)
    flowcell_uppnexid = owner_match.group(1)

    analysis_top_dir = os.path.abspath(
        os.path.join(config["analysis"]["base_root"], flowcell_uppnexid,
                     config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        # Deliberately best-effort: the failure is logged and processing
        # continues, as before.
        LOG.error(
            'Error: Analysis top directory {} does not exist and could not '
            'be created.'.format(analysis_top_dir))
    if not os.path.isabs(fc_dir):
        fc_dir = os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []

    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error('Error when processing flowcell dir "{}": {}'.format(
            fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning(
            'No projects found in specified flowcell directory "{}"'.format(
                fc_dir))

    # Compiled once instead of once per sample.
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")

    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")
        # parse the samplesheet and get the expected sample numbers assigned by bcl2fastq
        if samplesheet_path:
            samplesheet_sample_numbers = get_sample_numbers_from_samplesheet(
                samplesheet_path)
        else:
            samplesheet_sample_numbers = None
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning(
                'Could not retrieve project id from Charon (record missing?). '
                'Using project name ("{}") as project id '
                '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                project_id not in restrict_to_projects:
            LOG.debug(
                "Skipping project {} (not in restrict_to_projects)".format(
                    project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                            project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                               project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Name-based symlinks next to the id-based directories.
            if not project_dir == project_sl_dir and \
                    not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
                    not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name,
                                     dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj

        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name,
                                      ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files:
                safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name,
                                                dirname=sample_name)

            # Keep only the fastq files; a real list so the emptiness check
            # further down is meaningful.
            fastq_files = list(
                filter(fastq_pattern.match, sample.get('files', [])))

            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to use assignment from SampleSheet
                samplesheet_sample = match_fastq_sample_number_to_samplesheet(
                    fq_file, samplesheet_sample_numbers, project_id)
                # NOTE(review): index 6 is presumably the libprep field of the
                # samplesheet record -- confirm against
                # get_sample_numbers_from_samplesheet.
                if samplesheet_sample is not None and \
                        samplesheet_sample[6] is not None:
                    libprep_name = samplesheet_sample[6]
                else:
                    LOG.debug(
                        'Unable to determine library prep from sample sheet file; try to determine from Charon'
                    )
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(
                            project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(
                            libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        # A missing 'libpreps' key is treated as "no libpreps".
                        libpreps = charon_session.sample_get_libpreps(
                            project_id, sample_name).get('libpreps') or []
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but only one '
                                'library prep is present in Charon ("{}"). Using '
                                'this as the library prep.'.format(
                                    project_name, sample_name, fc_full_id,
                                    fq_file, libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but a fallback '
                                'libprep value of "{}" was supplied -- using this '
                                'value.'.format(project_name, sample_name,
                                                fc_full_id, fq_file,
                                                libprep_name))
                        else:
                            error_text = (
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon. Skipping '
                                'analysis.'.format(project_name, sample_name,
                                                   fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files:
                    safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files:
                    safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)

            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [
                            os.path.join(src_sample_dir, fastq_file)
                            for fastq_file in seqrun_obj.fastq_files
                        ]
                        # Mirrors the directory layout created above:
                        # <base>/DATA/<project>/<sample>/<libprep>/<seqrun>
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info(
                            "Symlinking fastq files from {} to {}...".format(
                                src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = (
                                'Could not symlink files for project/sample/'
                                'libprep/seqrun {}/{}/{}/{}'.format(
                                    project_obj, sample_obj, libprep_obj,
                                    seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
    return projects_to_analyze