Example #1
0
def collect_files_for_sample_analysis(project_obj, sample_obj, 
                                      restart_finished_jobs=False):
    """This function finds all data files relating to a sample and 
    follows a preset decision path to decide which of them to include in
    a sample-level analysis. This can include fastq files, bam files, and
    alignment-qc-level files.
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available
    # For each file, validate it.

    # This funtion goes into Charon and finds all valid libpreps and seqruns,
    # dvs libpreps for which               'qc' != "FAILED"
    # and seqruns  for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = get_valid_seqruns_for_sample(project_id=project_obj.project_id,
                                                         sample_id=sample_obj.name,
                                                         include_failed_libpreps=False,
                                                         include_done_seqruns=restart_finished_jobs)
    if not valid_libprep_seqruns: LOG.error("Notify user or whatever. I don't know.")

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample)
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname, sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_files_on_filesystem: LOG.error("TODO raise an error or something")

    fastq_files_to_analyze = []
    # Create a new NGIProject object (the old one could still be in use elsewhere)
    # Fix this later I've been coding for too long
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(base_path) # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        if fs_libprep_name not in valid_libprep_seqruns.keys():
            # Invalid library prep, skip this fastq file
            continue
        elif fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        else:
            libprep_obj = sample_obj.add_libprep(name=fs_libprep_name, dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name, dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### BAM / ALIGNMENT QC
    # Access the filesystem to determine which alignment (bam) files are available.
    # If there are any, add them to the list of files to include in the new analysis.
    # Include alignment qc files.
    project_analysis_dir = os.path.join(project_obj.base_path, "ANALYSIS", project_obj.dirname)
    project_aln_dir = os.path.join(project_analysis_dir, "01_raw_alignments")
    project_alnqc_dir = os.path.join(project_analysis_dir, "02_preliminary_alignment_qc")
    sample_analysis_file_pattern = "{sample_name}.*.{sample_name}.*".format(sample_name=sample_obj.name)
    aln_files_to_copy = glob.glob(os.path.join(project_aln_dir, sample_analysis_file_pattern))
    qc_files_to_copy = glob.glob(os.path.join(project_alnqc_dir, sample_analysis_file_pattern))

    return (proj_obj, aln_files_to_copy, qc_files_to_copy)
def collect_files_for_sample_analysis(project_obj, sample_obj, 
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """This function finds all data files relating to a sample and
    follows a preset decision path to decide which of them to include in
    a sample-level analysis. This can include fastq files, bam files, and
    alignment-qc-level files.
    Doesn't modify existing project or sample objects; returns new copies.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Include jobs marked as "DONE" (default False)
    :param str status_field: Which Charon status field to check (alignment, genotype)

    :returns: A new NGIProject object, a list of alignment and qc files
    :rtype: NGIProject, list, list

    :raises ValueError: If there are no valid libpreps, seqruns, or fastq files
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available
    # For each file, validate it.

    # This funtion goes into Charon and finds all valid libpreps and seqruns,
    # dvs libpreps for which               'qc' != "FAILED"
    # and seqruns  for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = \
            get_valid_seqruns_for_sample(project_id=project_obj.project_id,
                                         sample_id=sample_obj.name,
                                         include_failed_libpreps=False,
                                         include_done_seqruns=restart_finished_jobs,
                                         status_field=status_field)
    if not valid_libprep_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample)
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname, sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory, realpath=False)
    if not fastq_files_on_filesystem:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Create a new NGIProject object (the old one could still be in use elsewhere)
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(base_path) # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        if fs_libprep_name not in valid_libprep_seqruns.keys():
            # Invalid library prep, skip this fastq file
            continue
        elif fs_seqrun_name not in valid_libprep_seqruns.get(fs_libprep_name, []):
            continue
        else:
            libprep_obj = sample_obj.add_libprep(name=fs_libprep_name, dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name, dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### EXISTING DATA
    # If we still have data here at this point, we'll copy it over. If we had
    # decided to scrap it, it would have been deleted already.
    files_to_copy = find_previous_sample_analyses(proj_obj, sample_obj)

    return (proj_obj, files_to_copy)
Example #3
0
def collect_files_for_sample_analysis(project_obj,
                                      sample_obj,
                                      restart_finished_jobs=False,
                                      status_field="alignment_status"):
    """This function finds all data files relating to a sample and
    follows a preset decision path to decide which of them to include in
    a sample-level analysis. This can include fastq files, bam files, and
    alignment-qc-level files.
    Doesn't modify existing project or sample objects; returns new copies.

    :param NGIProject project_obj: The NGIProject object to process
    :param NGISample sample_obj: The NGISample object to process
    :param bool restart_finished_jobs: Include jobs marked as "DONE" (default False)
    :param str status_field: Which Charon status field to check (alignment, genotype)

    :returns: A new NGIProject object, a list of alignment and qc files
    :rtype: NGIProject, list, list

    :raises ValueError: If there are no valid libpreps, seqruns, or fastq files
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available
    # For each file, validate it.

    # This funtion goes into Charon and finds all valid libpreps and seqruns,
    # dvs libpreps for which               'qc' != "FAILED"
    # and seqruns  for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = \
            get_valid_seqruns_for_sample(project_id=project_obj.project_id,
                                         sample_id=sample_obj.name,
                                         include_failed_libpreps=False,
                                         include_done_seqruns=restart_finished_jobs,
                                         status_field=status_field)
    if not valid_libprep_seqruns:
        raise ValueError('No valid libpreps/seqruns found for project/sample '
                         '"{}/{}"'.format(project_obj, sample_obj))

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample)
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory,
                                                      realpath=False)
    if not fastq_files_on_filesystem:
        raise ValueError('No valid fastq files found for project/sample '
                         '{}/{}'.format(project_obj, sample_obj))

    # Create a new NGIProject object (the old one could still be in use elsewhere)
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(
                base_path)  # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        if fs_libprep_name not in valid_libprep_seqruns.keys():
            # Invalid library prep, skip this fastq file
            continue
        elif fs_seqrun_name not in valid_libprep_seqruns.get(
                fs_libprep_name, []):
            continue
        else:
            libprep_obj = sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                                dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### EXISTING DATA
    # If we still have data here at this point, we'll copy it over. If we had
    # decided to scrap it, it would have been deleted already.
    files_to_copy = find_previous_sample_analyses(proj_obj, sample_obj)

    return (proj_obj, files_to_copy)
Example #4
0
def collect_files_for_sample_analysis(project_obj,
                                      sample_obj,
                                      restart_finished_jobs=False):
    """This function finds all data files relating to a sample and 
    follows a preset decision path to decide which of them to include in
    a sample-level analysis. This can include fastq files, bam files, and
    alignment-qc-level files.
    """
    ### FASTQ
    # Access the filesystem to determine what fastq files are available
    # For each file, validate it.

    # This funtion goes into Charon and finds all valid libpreps and seqruns,
    # dvs libpreps for which               'qc' != "FAILED"
    # and seqruns  for which 'alignment_status' != "DONE"
    valid_libprep_seqruns = get_valid_seqruns_for_sample(
        project_id=project_obj.project_id,
        sample_id=sample_obj.name,
        include_failed_libpreps=False,
        include_done_seqruns=restart_finished_jobs)
    if not valid_libprep_seqruns:
        LOG.error("Notify user or whatever. I don't know.")

    # Now we find all fastq files that are available and validate them against
    # the group compiled in the previous step (get_valid_seqruns_for_sample)
    # We're going to recreate NGIProject/NGISample/NGILibraryPrep/NGISeqrun objects here
    sample_data_directory = os.path.join(project_obj.base_path, "DATA",
                                         project_obj.dirname,
                                         sample_obj.dirname)
    fastq_files_on_filesystem = fastq_files_under_dir(sample_data_directory,
                                                      realpath=False)
    if not fastq_files_on_filesystem:
        LOG.error("TODO raise an error or something")

    fastq_files_to_analyze = []
    # Create a new NGIProject object (the old one could still be in use elsewhere)
    # Fix this later I've been coding for too long
    proj_obj = NGIProject(project_obj.name, project_obj.dirname,
                          project_obj.project_id, project_obj.base_path)
    sample_obj = proj_obj.add_sample(sample_obj.name, sample_obj.dirname)
    for fastq_path in fastq_files_on_filesystem:
        base_path, fastq = os.path.split(fastq_path)
        if not fastq:
            base_path, fastq = os.path.split(
                base_path)  # Handles trailing slash
        base_path, fs_seqrun_name = os.path.split(base_path)
        base_path, fs_libprep_name = os.path.split(base_path)
        if fs_libprep_name not in valid_libprep_seqruns.keys():
            # Invalid library prep, skip this fastq file
            continue
        elif fs_seqrun_name not in valid_libprep_seqruns.get(
                fs_libprep_name, []):
            continue
        else:
            libprep_obj = sample_obj.add_libprep(name=fs_libprep_name,
                                                 dirname=fs_libprep_name)
            seqrun_obj = libprep_obj.add_seqrun(name=fs_seqrun_name,
                                                dirname=fs_seqrun_name)
            seqrun_obj.add_fastq_files(fastq)

    ### BAM / ALIGNMENT QC
    # Access the filesystem to determine which alignment (bam) files are available.
    # If there are any, add them to the list of files to include in the new analysis.
    # Include alignment qc files.
    project_analysis_dir = os.path.join(project_obj.base_path, "ANALYSIS",
                                        project_obj.dirname)
    project_aln_dir = os.path.join(project_analysis_dir, "01_raw_alignments")
    project_alnqc_dir = os.path.join(project_analysis_dir,
                                     "02_preliminary_alignment_qc")
    sample_analysis_file_pattern = "{sample_name}.*.{sample_name}.*".format(
        sample_name=sample_obj.name)
    aln_files_to_copy = glob.glob(
        os.path.join(project_aln_dir, sample_analysis_file_pattern))
    qc_files_to_copy = glob.glob(
        os.path.join(project_alnqc_dir, sample_analysis_file_pattern))

    return (proj_obj, aln_files_to_copy, qc_files_to_copy)