Example #1
0
def write_to_charon_NGI_results(job_id, return_code, run_dir):
    """Update the status of a sample in Charon from a workflow return code.

    The project name and sample id are parsed out of ``job_id`` with
    STHLM_UUSNP_SAMPLE_RE; the project name is then translated to a project id
    via get_project_id_from_name.

    :param str job_id: Job identifier encoding project name and sample id
    :param int return_code: The return code of the workflow process
                            (None -> "RUNNING", 0 -> "DONE", anything else -> "COMPUTATION_FAILED")
    :param string run_dir: the directory where results are stored (not used by this function)

    :raises RuntimeError: If the job id could not be parsed or the Charon database could not be updated
    """
    charon_session = CharonSession()
    # Consider moving this mapping to the CharonSession object or something
    if return_code is None:
        status = "RUNNING"
    elif return_code == 0:
        status = "DONE"
    else:
        ## TODO we need to differentiate between COMPUTATION_FAILED and DATA_FAILED
        ##      also there is IGNORE?
        status = "COMPUTATION_FAILED"
    try:
        # match() returns None for an unparseable job id, which surfaces here
        # as an AttributeError on .groupdict()
        m_dict = STHLM_UUSNP_SAMPLE_RE.match(job_id).groupdict()
        project_id = get_project_id_from_name(m_dict['project_name'])
        sample_id = m_dict['sample_id']
    except (TypeError, AttributeError):
        error_msg = "Could not parse project/sample ids from job id \"{}\"; cannot update Charon with results!".format(
            job_id)
        raise RuntimeError(error_msg)
    try:
        charon_session.sample_update(project_id, sample_id, status=status)
    except CharonError as e:
        # One argument per placeholder: status, sample, underlying error.
        # (Previously project_id was printed as the sample and the error
        # itself was silently dropped from the message.)
        error_msg = ('Failed to update sample status to "{}" for sample "{}" '
                     'in Charon database: {}'.format(status, sample_id, e))
        raise RuntimeError(error_msg)
def write_to_charon_NGI_results(job_id, return_code, run_dir):
    """Update the status of a sample in Charon from a workflow return code.

    The project name and sample id are parsed out of ``job_id`` with
    STHLM_UUSNP_SAMPLE_RE; the project name is then translated to a project id
    via get_project_id_from_name.

    :param str job_id: Job identifier encoding project name and sample id
    :param int return_code: The return code of the workflow process
                            (None -> "RUNNING", 0 -> "DONE", anything else -> "COMPUTATION_FAILED")
    :param string run_dir: the directory where results are stored (not used by this function)

    :raises RuntimeError: If the job id could not be parsed or the Charon database could not be updated
    """
    charon_session = CharonSession()
    # Consider moving this mapping to the CharonSession object or something
    if return_code is None:
        status = "RUNNING"
    elif return_code == 0:
        status = "DONE"
    else:
        ## TODO we need to differentiate between COMPUTATION_FAILED and DATA_FAILED
        ##      also there is IGNORE?
        status = "COMPUTATION_FAILED"
    try:
        # match() returns None for an unparseable job id, which surfaces here
        # as an AttributeError on .groupdict()
        m_dict = STHLM_UUSNP_SAMPLE_RE.match(job_id).groupdict()
        project_id = get_project_id_from_name(m_dict['project_name'])
        sample_id = m_dict['sample_id']
    except (TypeError, AttributeError):
        error_msg = "Could not parse project/sample ids from job id \"{}\"; cannot update Charon with results!".format(job_id)
        raise RuntimeError(error_msg)
    try:
        charon_session.sample_update(project_id, sample_id, status=status)
    except CharonError as e:
        # One argument per placeholder: status, sample, underlying error.
        # (Previously project_id was printed as the sample and the error
        # itself was silently dropped from the message.)
        error_msg = ('Failed to update sample status to "{}" for sample "{}" '
                     'in Charon database: {}'.format(status, sample_id, e))
        raise RuntimeError(error_msg)
Example #3
0
def setup_analysis_directory_structure(fc_dir, projects_to_analyze,
                                       restrict_to_projects=None, restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None, config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict projects_to_analyze: Mapping of project directory -> NGIProject (may be empty); updated in place
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param bool quiet: Stored in config["quiet"]; when true no error mails are sent
    :param dict config: The parsed configuration file (must be a dict; it is indexed unconditionally)
    :param str config_file_path: Not used by this function

    :returns: projects_to_analyze (updated), or an empty list if the flowcell
              directory does not exist or cannot be parsed

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If fc_dir does not contain either configured root (sthlm_root / upps_root)
    """
    LOG.info("Setting up analysis for demultiplexed data in source folder \"{}\"".format(fc_dir))
    if not restrict_to_projects: restrict_to_projects = []
    if not restrict_to_samples: restrict_to_samples = []
    config["quiet"] = quiet # Hack because I enter here from a script sometimes
    # Everything up to and including the configured root is the flowcell root
    root_pattern = r"(.+(?:{}|{}))\/.+".format(config["analysis"]["sthlm_root"],
                                               config["analysis"]["upps_root"])
    root_match = re.match(root_pattern, fc_dir)
    if not root_match:
        error_text = "cannot guess which project the flowcell {} belongs to".format(fc_dir)
        LOG.error(error_text)
        raise RuntimeError(error_text)
    flowcell_root = root_match.group(1)

    analysis_top_dir = os.path.abspath(os.path.join(flowcell_root, config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        # Best effort: the failure is only logged and processing continues
        LOG.error('Error: Analysis top directory {} does not exist and could not '
                  'be created.'.format(analysis_top_dir))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning("No projects found in specified flowcell directory \"{}\"".format(fc_dir))
    samplesheet_path = fc_dir_structure.get("samplesheet_path")
    # Compiled once instead of once per sample
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")
    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning('Could not retrieve project id from Charon (record missing?). '
                        'Using project name ("{}") as project id '
                        '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                                    project_id not in restrict_to_projects:
            LOG.debug("Skipping project {} (not in restrict_to_projects)".format(project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS", project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Name-based symlinks pointing at the id-based directories
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name, dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name, ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files: safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)
            # Materialise as a list: it is iterated below and then tested for
            # truthiness, which a lazy filter object (Python 3) cannot support
            fastq_files = list(filter(fastq_pattern.match, sample.get('files', [])))
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to parse from SampleSheet
                try:
                    if not samplesheet_path: raise ValueError()
                    lane_match = re.match(r'[\w-]+_L\d{2}(\d)_\w+', fq_file)
                    if not lane_match:
                        # Fall back to Charon rather than crashing on None.groups()
                        raise ValueError('cannot parse lane number from "{}"'.format(fq_file))
                    lane_num = lane_match.group(1)
                    libprep_name = determine_library_prep_from_samplesheet(samplesheet_path,
                                                                           project_original_name,
                                                                           sample_name,
                                                                           lane_num)
                except (IndexError, ValueError) as e:
                    LOG.debug('Unable to determine library prep from sample sheet file '
                              '("{}"); try to determine from Charon'.format(e))
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(project_id, sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but only one '
                                        'library prep is present in Charon ("{}"). Using '
                                        'this as the library prep.'.format(project_name,
                                                                           sample_name,
                                                                           fc_full_id,
                                                                           fq_file,
                                                                           libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                        'has no libprep information in Charon, but a fallback '
                                        'libprep value of "{}" was supplied -- using this '
                                        'value.'.format(project_name,
                                                        sample_name,
                                                        fc_full_id,
                                                        fq_file,
                                                        libprep_name))
                        else:
                            error_text = ('Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                          'has no libprep information in Charon. Skipping '
                                          'analysis.'.format(project_name, sample_name,
                                                             fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files: safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files: safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [os.path.join(src_sample_dir, fastq_file) for
                                           fastq_file in seqrun_obj.fastq_files]
                        # Destination is derived from the objects being iterated,
                        # mirroring how sample/libprep/seqrun directories were
                        # created under project_dir above. (Previously the stale
                        # seqrun_dir left over from the fastq loop was used, so
                        # every seqrun was linked into the last one's directory.)
                        seqrun_dst_dir = os.path.join(project_dir,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info("Symlinking fastq files from {} to {}...".format(src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = ('Could not symlink files for project/sample/'
                                          'libprep/seqrun {}/{}/{}/{}'.format(project_obj,
                                                                              sample_obj,
                                                                              libprep_obj,
                                                                              seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
Example #4
0
def setup_analysis_directory_structure(fc_dir,
                                       projects_to_analyze,
                                       restrict_to_projects=None,
                                       restrict_to_samples=None,
                                       create_files=True,
                                       fallback_libprep=None,
                                       quiet=False,
                                       config=None,
                                       config_file_path=None):
    """
    Copy and sort files from their CASAVA-demultiplexed flowcell structure
    into their respective project/sample/libPrep/FCIDs. This collects samples
    split across multiple flowcells.

    :param str fc_dir: The directory created by CASAVA for this flowcell.
    :param dict projects_to_analyze: Mapping of project directory -> NGIProject (may be empty); updated in place
    :param list restrict_to_projects: Specific projects within the flowcell to process exclusively
    :param list restrict_to_samples: Specific samples within the flowcell to process exclusively
    :param bool create_files: Alter the filesystem (as opposed to just parsing flowcells) (default True)
    :param str fallback_libprep: If libprep cannot be determined, use this value if supplied (default None)
    :param bool quiet: Stored in config["quiet"]; when true no error mails are sent
    :param dict config: The parsed configuration file (must be a dict; it is indexed unconditionally)
    :param str config_file_path: Not used by this function

    :returns: projects_to_analyze (updated), or an empty list if the flowcell
              directory does not exist or cannot be parsed

    :raises KeyError: If a required configuration key is not available.
    :raises RuntimeError: If fc_dir does not contain either configured root (sthlm_root / upps_root)
    """
    LOG.info(
        "Setting up analysis for demultiplexed data in source folder \"{}\"".
        format(fc_dir))
    if not restrict_to_projects: restrict_to_projects = []
    if not restrict_to_samples: restrict_to_samples = []
    config[
        "quiet"] = quiet  # Hack because I enter here from a script sometimes
    #Checks flowcell path to establish which group owns it
    # Raw string: "\/" is not a valid escape in a normal string literal
    owner_pattern = r".+({}|{})\/.+".format(config["analysis"]["sthlm_root"],
                                            config["analysis"]["upps_root"])
    owner_match = re.match(owner_pattern, fc_dir)
    if not owner_match:
        error_text = (
            "cannot guess which project (sthlm/uppsala) the flowcell {} belongs to"
            .format(fc_dir))
        LOG.error(error_text)
        raise RuntimeError(error_text)
    flowcell_uppnexid = owner_match.group(1)

    analysis_top_dir = os.path.abspath(
        os.path.join(config["analysis"]["base_root"], flowcell_uppnexid,
                     config["analysis"]["top_dir"]))
    try:
        safe_makedir(analysis_top_dir)
    except OSError:
        # Best effort: the failure is only logged and processing continues
        LOG.error(
            'Error: Analysis top directory {} does not exist and could not '
            'be created.'.format(analysis_top_dir))
    fc_dir = fc_dir if os.path.isabs(fc_dir) else os.path.join(
        analysis_top_dir, fc_dir)
    if not os.path.exists(fc_dir):
        LOG.error("Error: Flowcell directory {} does not exist".format(fc_dir))
        return []
    # Map the directory structure for this flowcell
    try:
        fc_dir_structure = parse_flowcell(fc_dir)
    except (OSError, ValueError) as e:
        LOG.error("Error when processing flowcell dir \"{}\": {}".format(
            fc_dir, e))
        return []
    fc_full_id = fc_dir_structure['fc_full_id']
    if not fc_dir_structure.get('projects'):
        LOG.warning(
            "No projects found in specified flowcell directory \"{}\"".format(
                fc_dir))

    # Compiled once instead of once per sample
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")

    # Iterate over the projects in the flowcell directory
    for project in fc_dir_structure.get('projects', []):
        project_name = project['project_name']
        project_original_name = project['project_original_name']
        samplesheet_path = fc_dir_structure.get("samplesheet_path")

        # parse the samplesheet and get the expected sample numbers assigned by bcl2fastq
        samplesheet_sample_numbers = get_sample_numbers_from_samplesheet(
            samplesheet_path) if samplesheet_path else None

        try:
            # Maps e.g. "Y.Mom_14_01" to "P123"
            project_id = get_project_id_from_name(project_name)
        except (CharonError, RuntimeError, ValueError) as e:
            LOG.warning(
                'Could not retrieve project id from Charon (record missing?). '
                'Using project name ("{}") as project id '
                '(error: {})'.format(project_name, e))
            project_id = project_name
        # If specific projects are specified, skip those that do not match
        if restrict_to_projects and project_name not in restrict_to_projects and \
                                    project_id not in restrict_to_projects:
            LOG.debug(
                "Skipping project {} (not in restrict_to_projects)".format(
                    project_name))
            continue
        LOG.info("Setting up project {}".format(project.get("project_name")))
        # Create a project directory if it doesn't already exist, including
        # intervening "DATA" directory
        project_dir = os.path.join(analysis_top_dir, "DATA", project_id)
        project_sl_dir = os.path.join(analysis_top_dir, "DATA", project_name)
        project_analysis_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                            project_id)
        project_analysis_sl_dir = os.path.join(analysis_top_dir, "ANALYSIS",
                                               project_name)
        if create_files:
            safe_makedir(project_dir, 0o2770)
            safe_makedir(project_analysis_dir, 0o2770)
            # Name-based symlinks pointing at the id-based directories
            if not project_dir == project_sl_dir and \
               not os.path.exists(project_sl_dir):
                os.symlink(project_dir, project_sl_dir)
            if not project_analysis_dir == project_analysis_sl_dir and \
               not os.path.exists(project_analysis_sl_dir):
                os.symlink(project_analysis_dir, project_analysis_sl_dir)
        try:
            project_obj = projects_to_analyze[project_dir]
        except KeyError:
            project_obj = NGIProject(name=project_name,
                                     dirname=project_id,
                                     project_id=project_id,
                                     base_path=analysis_top_dir)
            projects_to_analyze[project_dir] = project_obj
        # Iterate over the samples in the project
        for sample in project.get('samples', []):
            sample_name = sample['sample_name']
            # If specific samples are specified, skip those that do not match
            if restrict_to_samples and sample_name not in restrict_to_samples:
                LOG.debug("Skipping sample {}: not in specified samples "
                          "{}".format(sample_name,
                                      ", ".join(restrict_to_samples)))
                continue
            LOG.info("Setting up sample {}".format(sample_name))
            # Create a directory for the sample if it doesn't already exist
            sample_dir = os.path.join(project_dir, sample_name)
            if create_files: safe_makedir(sample_dir, 0o2770)
            # This will only create a new sample object if it doesn't already exist in the project
            sample_obj = project_obj.add_sample(name=sample_name,
                                                dirname=sample_name)
            # Keep only fastq files; a list because it is iterated and then
            # tested for truthiness below
            fastq_files = list(
                filter(fastq_pattern.match, sample.get('files', [])))
            # For each fastq file, create the libprep and seqrun objects
            # and add the fastq file to the seqprep object
            # Note again that these objects only get created if they don't yet exist;
            # if they do exist, the existing object is returned
            for fq_file in fastq_files:
                # Try to use assignment from SampleSheet
                samplesheet_sample = match_fastq_sample_number_to_samplesheet(
                    fq_file, samplesheet_sample_numbers, project_id)
                # NOTE(review): element 6 of the matched entry is taken as the
                # libprep name -- presumably the library-prep column; confirm
                # against match_fastq_sample_number_to_samplesheet
                if samplesheet_sample is not None and \
                        samplesheet_sample[6] is not None:
                    libprep_name = samplesheet_sample[6]
                else:
                    LOG.debug(
                        'Unable to determine library prep from sample sheet file; try to determine from Charon'
                    )
                    try:
                        # Requires Charon access
                        libprep_name = determine_library_prep_from_fcid(
                            project_id, sample_name, fc_full_id)
                        LOG.debug('Found libprep name "{}" in Charon'.format(
                            libprep_name))
                    except ValueError:
                        charon_session = CharonSession()
                        libpreps = charon_session.sample_get_libpreps(
                            project_id, sample_name).get('libpreps')
                        if len(libpreps) == 1:
                            libprep_name = libpreps[0].get('libprepid')
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but only one '
                                'library prep is present in Charon ("{}"). Using '
                                'this as the library prep.'.format(
                                    project_name, sample_name, fc_full_id,
                                    fq_file, libprep_name))
                        elif fallback_libprep:
                            libprep_name = fallback_libprep
                            LOG.warning(
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon, but a fallback '
                                'libprep value of "{}" was supplied -- using this '
                                'value.'.format(project_name, sample_name,
                                                fc_full_id, fq_file,
                                                libprep_name))
                        else:
                            error_text = (
                                'Project "{}" / sample "{}" / seqrun "{}" / fastq "{}" '
                                'has no libprep information in Charon. Skipping '
                                'analysis.'.format(project_name, sample_name,
                                                   fc_full_id, fq_file))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
                libprep_object = sample_obj.add_libprep(name=libprep_name,
                                                        dirname=libprep_name)
                libprep_dir = os.path.join(sample_dir, libprep_name)
                if create_files: safe_makedir(libprep_dir, 0o2770)
                seqrun_object = libprep_object.add_seqrun(name=fc_full_id,
                                                          dirname=fc_full_id)
                seqrun_dir = os.path.join(libprep_dir, fc_full_id)
                if create_files: safe_makedir(seqrun_dir, 0o2770)
                seqrun_object.add_fastq_files(fq_file)
            if fastq_files and create_files:
                src_sample_dir = os.path.join(fc_dir_structure['fc_dir'],
                                              project['data_dir'],
                                              project['project_dir'],
                                              sample['sample_dir'])
                for libprep_obj in sample_obj:
                    for seqrun_obj in libprep_obj:
                        src_fastq_files = [
                            os.path.join(src_sample_dir, fastq_file)
                            for fastq_file in seqrun_obj.fastq_files
                        ]
                        seqrun_dst_dir = os.path.join(project_obj.base_path,
                                                      "DATA",
                                                      project_obj.dirname,
                                                      sample_obj.dirname,
                                                      libprep_obj.dirname,
                                                      seqrun_obj.dirname)
                        LOG.info(
                            "Symlinking fastq files from {} to {}...".format(
                                src_sample_dir, seqrun_dst_dir))
                        try:
                            do_symlink(src_fastq_files, seqrun_dst_dir)
                        except OSError:
                            error_text = (
                                'Could not symlink files for project/sample/'
                                'libprep/seqrun {}/{}/{}/{}'.format(
                                    project_obj, sample_obj, libprep_obj,
                                    seqrun_obj))
                            LOG.error(error_text)
                            if not config.get('quiet'):
                                mail_analysis(project_name=project_name,
                                              sample_name=sample_name,
                                              level="ERROR",
                                              info_text=error_text)
                            continue
    return projects_to_analyze
 def test_get_project_id_from_name(self):
     """The id resolved from self.project_name must equal self.project_id."""
     expected_id = self.project_id
     resolved_id = get_project_id_from_name(self.project_name)
     self.assertEqual(expected_id, resolved_id)
Example #6
0
 def test_get_project_id_from_name_missing_id(self, mock_get):
     """A mocked response without 'projectid' must make the lookup raise ValueError."""
     # The mock hands back an empty dict, i.e. no 'projectid' key
     mock_get.return_value = dict()
     self.assertRaises(ValueError,
                       get_project_id_from_name,
                       self.project_name)
Example #7
0
 def test_get_project_id_from_name_missing_proj(self, mock_get):
     """A CharonError (status 404) from the mock must make the lookup raise ValueError."""
     not_found = CharonError('Error', status_code=404)
     mock_get.side_effect = not_found
     self.assertRaises(ValueError,
                       get_project_id_from_name,
                       self.project_name)
Example #8
0
 def test_get_project_id_from_name(self, mock_get):
     """The lookup returns a project id when the mock supplies a 'projectid' entry."""
     mock_get.return_value = dict(projectid='P100000')
     expected_id = self.project_id
     resolved_id = get_project_id_from_name(self.project_name)
     self.assertEqual(expected_id, resolved_id)
def main(args):
    """Clone a hard-coded reference flowcell under a random name and register it in Charon.

    A new flowcell directory with a randomly generated name is created under
    the externally defined ``data_folder``.  ``parse_sample_sheet`` is called
    for both sample sheets, the Unaligned/Unaligned_16bp directory skeleton
    is created, and for every sample a sample, libprep and seqrun entry is
    posted to Charon.  The reference fastq files are hard-linked into the new
    tree when ``args.sample_cov`` is 0 and passed to ``downsample`` otherwise.

    :param args: parsed options; the attributes read are ``rnd_project_name``,
        ``restrict_to_sample`` and ``sample_cov``
    :returns: 1 when aborting early (flowcell directory already exists or no
        project name was given), otherwise None
    """
    originalProject = {
        "fc_dir": "/proj/a2010002/INBOX/140702_D00415_0052_AC41A2ANXX/",
        "fc_name": "140702_D00415_0052_AC41A2ANXX",
        "fc_id": "C41A2ANXX",
        "project_name": "M.Kaller_14_06",
        "project_name_ill": "M__Kaller_14_06",
        "project_id": "P1171",
        "samples_id": ["102", "104", "106", "108"],
    }

    # Random identifiers for the new flowcell (generation order kept as-is).
    rnd_fc_id_noplate = id_generator_digits_chars(9)
    rnd_fc_id = "A{}".format(rnd_fc_id_noplate)
    rnd_instrument = id_generator_digits_chars(6)
    rnd_date = id_generator_digits(6)
    rnd_fc_name = "{}_{}_{}_{}".format(rnd_date, rnd_instrument, id_generator_digits(4), rnd_fc_id)

    rnd_fc_path = os.path.join(data_folder, rnd_fc_name)
    if os.path.isdir(rnd_fc_path):
        print("flowcell name already exists: bad luck!!!! Abort")
        return 1
    rnd_project_name = args.rnd_project_name
    # Value check instead of `is ""`: identity of string literals is an
    # interpreter detail and must not decide whether a name was supplied.
    if not rnd_project_name:
        print("error project-name must be specified (something like M.Kaller_14_06)")
        return 1

    charon_session = CharonSession()

    try:
        rnd_project_id = get_project_id_from_name(rnd_project_name)
    except (RuntimeError, ValueError):
        print(" project does not exits on Charon, creating it")
        rnd_project_id = "P{}".format(id_generator_digits(4))
        base_url = charon_session.construct_charon_url("project")
        project_dict = {
            "projectid": rnd_project_id,
            "name": rnd_project_name,
            "status": "SEQUENCED",
            "pipeline": "NGI",
            "best_practice_analysis": "IGN",
            "sequencing_facility": "NGI-S",
        }
        # create the project on charon
        charon_session.post(base_url, json.dumps(project_dict))

    rndProject = {
        "project_id": rnd_project_id,
        "project_name": rnd_project_name,
        "fc_dir": rnd_fc_path,
        "fc_name": rnd_fc_name,
        "fc_id": rnd_fc_id,
        "project_name_ill": rnd_project_name.replace(".", "__"),
        "samples_id": ["{}".format(id_generator_digits(3)) for _ in range(4)],
    }
    if args.restrict_to_sample:
        # Only one reference sample is cloned, so only one random ID is needed.
        originalProject["samples_id"] = [args.restrict_to_sample]
        rndProject["samples_id"] = ["{}".format(id_generator_digits(3))]

    # TODO: check that this project does not already exists on charon
    os.mkdir(rnd_fc_path)
    parse_sample_sheet("SampleSheet_16bp.csv", originalProject, rndProject)
    parse_sample_sheet("SampleSheet.csv", originalProject, rndProject)

    createDir(rndProject["fc_dir"], "Data")
    createDir(rndProject["fc_dir"], "InterOp")

    BaseCall_stats_dir = "Basecall_Stats_{}".format(rndProject["fc_id"])

    # Unaligned (the stats files themselves are deliberately not copied)
    createDir(rndProject["fc_dir"], "Unaligned")
    Unaligned_dir = os.path.join(rndProject["fc_dir"], "Unaligned")
    createDir(Unaligned_dir, BaseCall_stats_dir)

    # Unaligned_16bp
    createDir(rndProject["fc_dir"], "Unaligned_16bp")
    Unaligned_path = os.path.join(rndProject["fc_dir"], "Unaligned_16bp")
    createDir(Unaligned_path, BaseCall_stats_dir)
    Project_dir = "Project_{}".format(rndProject["project_name_ill"])
    createDir(Unaligned_path, Project_dir)
    Project_path = os.path.join(Unaligned_path, Project_dir)

    originalProject_path = os.path.join(
        originalProject["fc_dir"],
        "Unaligned_16bp",
        "Project_{}".format(originalProject["project_name_ill"]),
    )

    # Per-lane read-pair budget; the same for every sample, so computed once.
    # NOTE(review): the 3200000000 / 125 / 8 constants look like genome size,
    # read length and lane count -- confirm.  `/` is kept as written, so the
    # result is a float on Python 3.
    pairs_to_extract_per_lane = 0
    if args.sample_cov > 0:
        reads_to_extract = (args.sample_cov * 3200000000) / 125
        pairs_to_extract = reads_to_extract / 2
        pairs_to_extract_per_lane = pairs_to_extract / 8

    for originalSample, rndSample in zip(originalProject["samples_id"], rndProject["samples_id"]):
        original_sample_name = "{}_{}".format(originalProject["project_id"], originalSample)
        rnd_sample_name = "{}_{}".format(rndProject["project_id"], rndSample)

        sample_dir = "Sample_{}".format(rnd_sample_name)
        createDir(Project_path, sample_dir)
        Sample_path = os.path.join(Project_path, sample_dir)
        originalSamplePath = os.path.join(
            originalProject_path, "Sample_{}".format(original_sample_name)
        )

        # create new sample
        sample_url = charon_session.construct_charon_url("sample", rndProject["project_id"])
        sample_dict = {
            "sampleid": rnd_sample_name,
            "status": "NEW",
            "received": "2014-04-17",
            "qc_status": "NEW",
            "genotyping_status": None,
            "genotyping_concordance": None,
            "lims_initial_qc": "Passed",
            "total_autosomal_coverage": 0,
            "total_sequenced_reads": 0,
        }
        charon_session.post(sample_url, json.dumps(sample_dict))

        # create new library prep
        libprep_url = charon_session.construct_charon_url(
            "libprep", rndProject["project_id"], rnd_sample_name
        )
        libprep_dict = {"libprepid": "A", "limsid": "24-44506", "status": "NEW"}
        charon_session.post(libprep_url, json.dumps(libprep_dict))

        # create seq run
        seqrun_url = charon_session.construct_charon_url(
            "seqrun", rndProject["project_id"], rnd_sample_name, "A"
        )
        seqrun_dict = {
            "seqrunid": rnd_fc_name,
            "sequencing_status": "DONE",
        }
        charon_session.post(seqrun_url, json.dumps(seqrun_dict))

        # now hard link or sub-sample the fastq files
        for fastq in listdir(originalSamplePath):
            originalFastq = join(originalSamplePath, fastq)
            if not (isfile(originalFastq) and fastq.endswith("fastq.gz")):
                continue
            rndFastq = os.path.join(
                Sample_path, fastq.replace(original_sample_name, rnd_sample_name)
            )
            if args.sample_cov == 0:
                os.link(originalFastq, rndFastq)
            else:
                downsample(originalFastq, rndFastq, pairs_to_extract_per_lane)

    # Created empty; their contents are not reproduced.
    createDir(Unaligned_dir, "Temp")
    createDir(Unaligned_dir, "Undetermined_indices")

    produceRunInfo(rndProject["fc_dir"], rnd_fc_name, rnd_fc_id_noplate, rnd_instrument, rnd_date)
    os.link(
        os.path.join(originalProject["fc_dir"], "runParameters.xml"),
        os.path.join(rnd_fc_path, "runParameters.xml"),
    )
from ngi_pipeline.database.communicate import get_project_id_from_name

if __name__ == "__main__":
    # Exit with status 0 when the coverage Charon reports for a sample is at
    # least the required coverage, and with status 1 otherwise.
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--project", required=True)
    parser.add_argument("-s", "--sample", required=True)
    parser.add_argument("-c", "--coverage", type=int, required=True, dest="required_coverage")

    args = parser.parse_args()

    project = args.project
    sample = args.sample
    required_coverage = args.required_coverage

    charon_session = CharonSession()
    try:
        reported_coverage = charon_session.sample_get(project, sample).get("total_autosomal_coverage")
    except CharonError:
        # The first lookup failed: retry treating the --project value as a
        # project name to be translated into a project ID.  The retry query
        # sits inside the same try so a second Charon failure is reported
        # like a failed name lookup instead of escaping as a traceback.
        try:
            project = get_project_id_from_name(project)
            reported_coverage = charon_session.sample_get(project, sample).get("total_autosomal_coverage")
        except (CharonError, RuntimeError, ValueError) as e:
            print(('ERROR: Could not determine coverage for project {} / sample '
                   '{}: {}'.format(project, sample, e)), file=sys.stderr)
            reported_coverage = 0
    if reported_coverage is None:
        # The sample record carries no coverage value; int(None) would raise
        # TypeError, so count it as zero coverage.
        reported_coverage = 0
    if int(reported_coverage) >= int(required_coverage):
        sys.exit(0)
    else:
        sys.exit(1)
                        type=int,
                        required=True,
                        dest="required_coverage")

    args = parser.parse_args()

    project = args.project
    sample = args.sample
    required_coverage = args.required_coverage

    charon_session = CharonSession()
    try:
        reported_coverage = charon_session.sample_get(
            project, sample).get("total_autosomal_coverage")
    except CharonError as e:
        try:
            project = get_project_id_from_name(project)
        except (CharonError, RuntimeError, ValueError) as e:
            print(
                ('ERROR: Could not determine coverage for project {} / sample '
                 '{}: {}'.format(project, sample, e)),
                file=sys.stderr)
            reported_coverage = 0
        else:
            reported_coverage = charon_session.sample_get(
                project, sample).get("total_autosomal_coverage")
    if int(reported_coverage) >= int(required_coverage):
        sys.exit(0)
    else:
        sys.exit(1)
def main(args):
    """Build a randomly named copy of the reference flowcell and register it in Charon.

    Steps, in order: generate random flowcell identifiers; abort if the
    target directory under the externally defined ``data_folder`` exists or
    no project name was given; look the project up in Charon and create it
    there when the lookup raises; create the flowcell directory skeleton;
    then, per sample, post sample/libprep/seqrun entries to Charon and
    hard-link (``args.sample_cov == 0``) or ``downsample`` the reference
    fastq files.

    :param args: parsed options; the attributes read are ``rnd_project_name``,
        ``restrict_to_sample`` and ``sample_cov``
    :returns: 1 on early abort (existing flowcell directory or missing
        project name), otherwise None
    """
    originalProject = {}
    originalProject["fc_dir"] = "/proj/a2010002/INBOX/140702_D00415_0052_AC41A2ANXX/"
    originalProject["fc_name"] = "140702_D00415_0052_AC41A2ANXX"
    originalProject["fc_id"] = "C41A2ANXX"
    originalProject["project_name"] = "M.Kaller_14_06"
    originalProject["project_name_ill"] = "M__Kaller_14_06"
    originalProject["project_id"] = "P1171"
    originalProject["samples_id"] = ["102", "104", "106", "108"]

    # create random flowcell identifiers
    rnd_fc_id_noplate = id_generator_digits_chars(9)
    rnd_fc_id = "A{}".format(rnd_fc_id_noplate)
    rnd_instrument = id_generator_digits_chars(6)
    rnd_date = id_generator_digits(6)
    rnd_fc_name = "{}_{}_{}_{}".format(rnd_date,
                                       rnd_instrument,
                                       id_generator_digits(4),
                                       rnd_fc_id)

    rnd_fc_path = os.path.join(data_folder, rnd_fc_name)
    if os.path.isdir(rnd_fc_path):
        print("flowcell name already exists: bad luck!!!! Abort")
        return 1
    rnd_project_name = args.rnd_project_name
    # Compare by value: `is ""` tests object identity, which only happens to
    # work when the interpreter shares the empty-string object.
    if not rnd_project_name:
        print("error project-name must be specified (something like M.Kaller_14_06)")
        return 1

    charon_session = CharonSession()

    rndProject = {}
    try:
        rnd_project_id = get_project_id_from_name(rnd_project_name)
    except (RuntimeError, ValueError):
        print(" project does not exits on Charon, creating it")
        rnd_project_id = "P{}".format(id_generator_digits(4))
        project_dict = {'projectid': rnd_project_id,
                        'name': rnd_project_name,
                        'status': 'SEQUENCED',
                        'pipeline': 'NGI',
                        'best_practice_analysis': 'IGN',
                        'sequencing_facility': 'NGI-S'
                        }
        # create the project on charon
        charon_session.post(charon_session.construct_charon_url("project"),
                            json.dumps(project_dict))
    rndProject["project_id"] = rnd_project_id
    rndProject["project_name"] = rnd_project_name

    rndProject["fc_dir"] = rnd_fc_path
    rndProject["fc_name"] = rnd_fc_name
    rndProject["fc_id"] = rnd_fc_id
    rndProject["project_name_ill"] = rnd_project_name.replace(".", "__")
    rndProject["samples_id"] = ["{}".format(id_generator_digits(3)),
                                "{}".format(id_generator_digits(3)),
                                "{}".format(id_generator_digits(3)),
                                "{}".format(id_generator_digits(3))]
    if args.restrict_to_sample:
        # Clone a single reference sample under a single fresh random ID.
        originalProject["samples_id"] = [args.restrict_to_sample]
        rndProject["samples_id"] = ["{}".format(id_generator_digits(3))]

    # TODO: check that this project does not already exists on charon
    os.mkdir(rnd_fc_path)
    for sample_sheet in ("SampleSheet_16bp.csv", "SampleSheet.csv"):
        parse_sample_sheet(sample_sheet, originalProject, rndProject)

    createDir(rndProject["fc_dir"], "Data")
    createDir(rndProject["fc_dir"], "InterOp")

    # Unaligned
    createDir(rndProject["fc_dir"], "Unaligned")
    Unaligned_dir = os.path.join(rndProject["fc_dir"], "Unaligned")
    BaseCall_stats_dir = "Basecall_Stats_{}".format(rndProject["fc_id"])
    createDir(Unaligned_dir, BaseCall_stats_dir)
    # The stats files themselves are deliberately not copied.

    # Unaligned_16bp
    createDir(rndProject["fc_dir"], "Unaligned_16bp")
    Unaligned_path = os.path.join(rndProject["fc_dir"], "Unaligned_16bp")
    createDir(Unaligned_path, BaseCall_stats_dir)
    Project_dir = "Project_{}".format(rndProject["project_name_ill"])
    createDir(Unaligned_path, Project_dir)
    Project_path = os.path.join(Unaligned_path, Project_dir)

    # Read-pair budget per lane is independent of the sample: compute it once.
    # NOTE(review): constants presumably encode genome size (3200000000),
    # read length (125) and lane count (8) -- confirm.  `/` is left as
    # written, which yields a float under Python 3.
    pairs_to_extract_per_lane = 0
    if args.sample_cov > 0:
        reads_to_extract = (args.sample_cov * 3200000000) / 125
        pairs_to_extract = reads_to_extract / 2
        pairs_to_extract_per_lane = pairs_to_extract / 8

    originalProject_dir = "Project_{}".format(originalProject["project_name_ill"])

    # Reference and random sample IDs are paired positionally.
    sample_pairs = zip(originalProject["samples_id"], rndProject["samples_id"])
    for originalSample, rndSample in sample_pairs:
        originalSampleId = "{}_{}".format(originalProject["project_id"], originalSample)
        rndSampleId = "{}_{}".format(rndProject["project_id"], rndSample)

        sample_dir = "Sample_{}".format(rndSampleId)
        createDir(Project_path, sample_dir)
        Sample_path = os.path.join(Project_path, sample_dir)
        originalSamplePath = os.path.join(originalProject["fc_dir"],
                                          "Unaligned_16bp",
                                          originalProject_dir,
                                          "Sample_{}".format(originalSampleId))

        # create new sample
        sample_url = charon_session.construct_charon_url("sample", rndProject["project_id"])
        sample_dict = {'sampleid': rndSampleId,
                       'status': 'NEW',
                       'received': '2014-04-17',
                       'qc_status': 'NEW',
                       'genotyping_status': None,
                       'genotyping_concordance': None,
                       'lims_initial_qc': 'Passed',
                       'total_autosomal_coverage': 0,
                       'total_sequenced_reads': 0
                       }
        charon_session.post(sample_url, json.dumps(sample_dict))

        # create new library prep
        libprep_url = charon_session.construct_charon_url("libprep",
                                                          rndProject["project_id"],
                                                          rndSampleId)
        libprep_dict = {'libprepid': "A",
                        'limsid': '24-44506',
                        'status': 'NEW'
                        }
        charon_session.post(libprep_url, json.dumps(libprep_dict))

        # create seq run
        seqrun_url = charon_session.construct_charon_url("seqrun",
                                                         rndProject["project_id"],
                                                         rndSampleId,
                                                         "A")
        seqrun_dict = {'seqrunid': rnd_fc_name,
                       'sequencing_status': 'DONE',
                       }
        charon_session.post(seqrun_url, json.dumps(seqrun_dict))

        # now hard link or sub-sample the fastq files
        fastq_names = [name for name in listdir(originalSamplePath)
                       if isfile(join(originalSamplePath, name))
                       and name.endswith("fastq.gz")]
        for fastq in fastq_names:
            originalFastq = os.path.join(originalSamplePath, fastq)
            rndFastq = os.path.join(Sample_path,
                                    fastq.replace(originalSampleId, rndSampleId))
            if args.sample_cov == 0:
                os.link(originalFastq, rndFastq)
            else:
                downsample(originalFastq, rndFastq, pairs_to_extract_per_lane)

    # Created empty; their contents are not reproduced.
    createDir(Unaligned_dir, "Temp")
    createDir(Unaligned_dir, "Undetermined_indices")

    produceRunInfo(rndProject["fc_dir"], rnd_fc_name, rnd_fc_id_noplate, rnd_instrument, rnd_date)
    os.link(os.path.join(originalProject["fc_dir"], "runParameters.xml"),
            os.path.join(rnd_fc_path, "runParameters.xml"))
Example #13
0
 def test_get_project_id_from_name(self):
     """Looking up the fixture's project name must yield the fixture's project ID."""
     expected_id = self.project_id
     actual_id = get_project_id_from_name(self.project_name)
     self.assertEqual(expected_id, actual_id)
Example #14
0
def recreate_project_from_filesystem(project_dir,
                                     restrict_to_samples=None,
                                     restrict_to_libpreps=None,
                                     restrict_to_seqruns=None):
    """Recreate the full project/sample/libprep/seqrun set of NGIObjects
    from the directory tree structure.

    The last path component of ``project_dir`` is taken as the project name
    and mapped to a project ID via ``get_project_id_from_name``.  Every
    subdirectory of the project becomes a sample, every subdirectory of a
    sample a libprep, and every libprep subdirectory matching ``*_*_*_*`` a
    seqrun.  Regular files in a seqrun directory whose names end in
    ``.fastq``/``.fq`` (optionally followed by ``.gz``, ``.gzip`` or
    ``.bz2``) are added to the seqrun by basename.

    :param str project_dir: path to the project directory (a trailing
        separator is tolerated)
    :param list restrict_to_samples: if non-empty, only these sample names are used
    :param list restrict_to_libpreps: if non-empty, only these libprep names are used
    :param list restrict_to_seqruns: if non-empty, only these seqrun names are used

    :returns: the populated project object
    :rtype: NGIProject

    :raises CharonError: if the project ID lookup fails with a CharonError,
        ValueError or Timeout
    """
    restrict_to_samples = restrict_to_samples or []
    restrict_to_libpreps = restrict_to_libpreps or []
    restrict_to_seqruns = restrict_to_seqruns or []

    def _subdirs(parent, pattern="*"):
        # A real list rather than filter(): on Python 3 a filter object is
        # always truthy, which would silence the "nothing found" warnings.
        return [path for path in glob.glob(os.path.join(parent, pattern))
                if os.path.isdir(path)]

    # Compiled once; raw string so "\." is not an invalid string escape.
    fastq_pattern = re.compile(r".*\.(fastq|fq)(\.gz|\.gzip|\.bz2)?$")

    base_path, project_name = os.path.split(project_dir)
    if not project_name:
        # project_dir ended with a path separator; split once more.
        base_path, project_name = os.path.split(base_path)
    LOG.info('Setting up project "{}"'.format(project_name))
    try:
        # This requires Charon access -- maps e.g. "Y.Mom_14_01" to "P123"
        project_id = get_project_id_from_name(project_name)
    # Should handle requests.exceptions.Timeout in Charon classes
    except (CharonError, ValueError, Timeout) as e:
        error_msg = ('Cannot proceed with project "{}" due to '
                     'Charon-related error: {}'.format(project_name, e))
        raise CharonError(error_msg)
    project_obj = NGIProject(name=project_name,
                             dirname=project_name,
                             project_id=project_id,
                             base_path=base_path)

    samples = _subdirs(project_dir)
    if not samples:
        LOG.warning('No samples found for project "{}"'.format(project_obj))
    for sample_dir in samples:
        sample_name = os.path.basename(sample_dir)
        if restrict_to_samples and sample_name not in restrict_to_samples:
            LOG.debug('Skipping sample "{}": not in specified samples "{}"'.format(sample_name, ', '.join(restrict_to_samples)))
            continue
        LOG.info('Setting up sample "{}"'.format(sample_name))
        sample_obj = project_obj.add_sample(name=sample_name, dirname=sample_name)

        libpreps = _subdirs(sample_dir)
        if not libpreps:
            LOG.warning('No libpreps found for sample "{}"'.format(sample_obj))
        for libprep_dir in libpreps:
            libprep_name = os.path.basename(libprep_dir)
            if restrict_to_libpreps and libprep_name not in restrict_to_libpreps:
                LOG.debug('Skipping libprep "{}": not in specified libpreps "{}"'.format(libprep_name, ', '.join(restrict_to_libpreps)))
                continue
            LOG.info('Setting up libprep "{}"'.format(libprep_name))
            libprep_obj = sample_obj.add_libprep(name=libprep_name,
                                                 dirname=libprep_name)

            seqruns = _subdirs(libprep_dir, "*_*_*_*")
            if not seqruns:
                LOG.warning('No seqruns found for libprep "{}"'.format(libprep_obj))
            for seqrun_dir in seqruns:
                seqrun_name = os.path.basename(seqrun_dir)
                if restrict_to_seqruns and seqrun_name not in restrict_to_seqruns:
                    LOG.debug('Skipping seqrun "{}": not in specified seqruns "{}"'.format(seqrun_name, ', '.join(restrict_to_seqruns)))
                    continue
                LOG.info('Setting up seqrun "{}"'.format(seqrun_name))
                seqrun_obj = libprep_obj.add_seqrun(name=seqrun_name,
                                                    dirname=seqrun_name)
                fastq_files = [path for path in glob.glob(os.path.join(seqrun_dir, "*"))
                               if fastq_pattern.match(path) and os.path.isfile(path)]
                for fq_file in fastq_files:
                    fq_name = os.path.basename(fq_file)
                    LOG.info('Adding fastq file "{}" to seqrun "{}"'.format(fq_name, seqrun_obj))
                    seqrun_obj.add_fastq_files([fq_name])
    return project_obj