コード例 #1
0
def to_hdf5(fp, h5file, max_barcode_length=12):
    """Represent demux data in an h5file

    Parameters
    ----------
    fp : filepath
        The filepath containing either FASTA or FASTQ data.
    h5file : h5py.File
        The file to write into.
    max_barcode_length : int, optional
        Not read anywhere in this function; retained so existing callers that
        pass it keep working. Defaults to 12.

    Raises
    ------
    ValueError
        If the ID line of any record does not match the split libraries
        layout described in the notes.

    Notes
    -----
    A group, per sample, will be created and within that group, 5 datasets will
    be constructed that correspond to sequence, original_barcode,
    corrected_barcode, barcode_errors, and qual.

    The filepath is required as two passes over the file are essential.

    The expectation is that the filepath being operated on is the result of
    split_libraries.py or split_libraries_fastq.py from QIIME. This code makes
    assumptions about items in the comment line that are added by split
    libraries. Specifically, the code looks for an "orig_bc", a "new_bc" and a
    "bc_diffs" field (in that order), and additionally assumes the sample ID
    is encoded in the ID as "<sample>_<number>".
    """
    # walk over the file and collect summary stats
    sample_stats, full_stats = _summarize_lengths(_per_sample_lengths(fp))

    # construct the datasets, storing per sample stats and full file stats
    buffers = _construct_datasets(sample_stats, h5file)
    _set_attr_stats(h5file, full_stats)
    h5file.attrs['has-qual'] = _has_qual(fp)

    # The pattern does not depend on the record, so build it once. Both
    # fragments are raw strings so that "\d" reaches the regex engine without
    # relying on Python's handling of unknown string escapes.
    header_pattern = (r'^(?P<sample>.+?)_\d+? .*orig_bc=(?P<orig_bc>.+?) '
                      r'new_bc=(?P<corr_bc>.+?) bc_diffs=(?P<bc_diffs>\d+)')

    for rec in load(fp):
        result = search(header_pattern, rec['SequenceID'])

        if result is None:
            raise ValueError("%s doesn't appear to be split libraries "
                             "output!" % fp)

        sample, orig_bc, corr_bc, bc_diffs = result.group(
            'sample', 'orig_bc', 'corr_bc', 'bc_diffs')

        sequence = rec['Sequence']
        qual = rec['Qual']

        # buffers are keyed by "<sample>/<dataset path>"
        pjoin = partial(os.path.join, sample)
        buffers[pjoin(dset_paths['sequence'])].write(sequence)
        buffers[pjoin(dset_paths['barcode_original'])].write(orig_bc)
        buffers[pjoin(dset_paths['barcode_corrected'])].write(corr_bc)
        buffers[pjoin(dset_paths['barcode_error'])].write(bc_diffs)

        # qual is only written for records that carry it
        if qual is not None:
            buffers[pjoin(dset_paths['qual'])].write(qual)
コード例 #2
0
def _per_sample_lengths(fp):
    """Collect the sequence lengths observed for every sample

    Parameters
    ----------
    fp : filepath
        The sequence file to walk over

    Returns
    -------
    dict
        {sample_id: [sequence_length]}

    Notes
    -----
    The sample ID is taken from the first space-delimited token of the
    sequence ID, with everything after its last underscore dropped.
    """
    by_sample = defaultdict(list)
    for rec in load(fp):
        # the ID token is everything before the first space
        id_token = rec['SequenceID'].partition(' ')[0]
        # strip the trailing "_<suffix>" to recover the sample ID
        sample = id_token.rsplit('_', 1)[0]
        seq_len = len(rec['Sequence'])
        by_sample[sample].append(seq_len)

    return by_sample
コード例 #3
0
def validate(qclient, job_id, parameters, out_dir):
    """Validate and fix a new BIOM artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to validate and create the artifact. The keys
        read here are 'files' (a serialized mapping of filepath type to a
        list of filepaths, decoded with ``loads``), 'artifact_type', and
        optionally 'template' or 'analysis'.
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list of qiita_client.ArtifactInfo , str
        Whether the job is successful
        The artifact information, if successful
        The error message, if not successful
    """
    prep_id = parameters.get('template')
    analysis_id = parameters.get('analysis')
    files = loads(parameters['files'])
    a_type = parameters['artifact_type']

    if a_type != "BIOM":
        return (False, None, "Unknown artifact type %s. Supported types: BIOM"
                             % a_type)

    qclient.update_job_step(job_id, "Step 1: Collecting metadata")
    # The prep template takes precedence over the analysis when both are given
    if prep_id is not None:
        metadata = qclient.get("/qiita_db/prep_template/%s/data/" % prep_id)
        metadata = metadata['data']
    elif analysis_id is not None:
        metadata = qclient.get("/qiita_db/analysis/%s/metadata/" % analysis_id)
    else:
        return (False, None, "Missing metadata information")

    # Check if the biom table has the same sample ids as the prep info
    qclient.update_job_step(job_id, "Step 2: Validating BIOM file")
    new_biom_fp = biom_fp = files['biom'][0]
    table = load_table(biom_fp)
    metadata_ids = set(metadata)
    biom_sample_ids = set(table.ids())

    if not metadata_ids.issuperset(biom_sample_ids):
        # The BIOM sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing BIOM sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the BIOM file
        if 'run_prefix' in metadata[next(iter(metadata_ids))]:
            id_map = {v['run_prefix']: k for k, v in metadata.items()}
        else:
            # Attempt 2: the sample ids in the BIOM table are the same that in
            # the prep template but without the prefix
            prefix = next(iter(metadata_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in biom_sample_ids)
            if metadata_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in biom_sample_ids}
            else:
                # There is nothing we can do. The samples in the BIOM table do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the BIOM table do not match '
                             'the ones in the prep information. Please, '
                             'provide the column "run_prefix" in the prep '
                             'information to map the existing sample ids to '
                             'the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        try:
            table.update_ids(id_map, axis='sample')
        except TableException:
            # sorted so the message is deterministic (set order is arbitrary)
            missing = sorted(biom_sample_ids - set(id_map))
            error_msg = ('Your prep information is missing samples that are '
                         'present in your BIOM table: %s' % ', '.join(missing))
            return False, None, error_msg

        # The corrected table is written to the output directory; the input
        # file is left untouched
        new_biom_fp = join(out_dir, basename(biom_fp))
        with biom_open(new_biom_fp, 'w') as f:
            table.to_hdf5(f, "Qiita BIOM type plugin")

    filepaths = [(new_biom_fp, 'biom')]

    # Validate the representative set, if it exists
    if 'preprocessed_fasta' in files:
        repset_fp = files['preprocessed_fasta'][0]

        # The observations ids of the biom table should be the same
        # as the representative sequences ids found in the representative set
        observation_ids = table.ids(axis='observation').tolist()
        # Track the not-yet-seen ids in a set: membership test and removal
        # are O(1), whereas list.remove would rescan the list per record.
        # This relies on the table's observation ids being unique.
        unseen_ids = set(observation_ids)
        extra_ids = []
        for record in load([repset_fp], constructor=FastaIterator):
            rec_id = record['SequenceID'].split()[0]
            if rec_id in unseen_ids:
                unseen_ids.remove(rec_id)
            else:
                # either not in the table at all, or a repeated sequence id
                extra_ids.append(rec_id)

        # report the missing ids in the order in which the table lists them
        missing_ids = [o for o in observation_ids if o in unseen_ids]

        error_msg = []
        if extra_ids:
            error_msg.append("The representative set sequence file includes "
                             "observations not found in the BIOM table: %s"
                             % ', '.join(extra_ids))
        if missing_ids:
            error_msg.append("The representative set sequence file is missing "
                             "observation ids found in the BIOM table: %s" %
                             ', '.join(missing_ids))

        if error_msg:
            return False, None, '\n'.join(error_msg)

        filepaths.append((repset_fp, 'preprocessed_fasta'))

    # Any other filepath type is passed through unchanged
    for fp_type, fps in files.items():
        if fp_type not in ('biom', 'preprocessed_fasta'):
            for fp in fps:
                filepaths.append((fp, fp_type))

    return True, [ArtifactInfo(None, 'BIOM', filepaths)], ""
コード例 #4
0
def _has_qual(fp):
    """Check if it looks like we have qual

    Parameters
    ----------
    fp : filepath
        The sequence file to inspect

    Returns
    -------
    bool
        True if the first record of the file has a 'Qual' entry that is not
        None. False otherwise, including when the file yields no records.

    Notes
    -----
    Only the first record is examined.
    """
    # A default is passed to next() so that a file without records yields
    # False instead of leaking a bare StopIteration to the caller.
    first_record = next(iter(load(fp)), None)
    if first_record is None:
        return False
    return first_record['Qual'] is not None
コード例 #5
0
ファイル: validate.py プロジェクト: qiita-spots/qtp-biom
def validate(qclient, job_id, parameters, out_dir):
    """Validate and fix a new BIOM artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to validate and create the artifact. The keys
        read here are 'files' (a serialized mapping of filepath type to a
        list of filepaths, decoded with ``loads``), 'artifact_type', and
        optionally 'template' or 'analysis'.
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    bool, list of qiita_client.ArtifactInfo , str
        Whether the job is successful
        The artifact information, if successful
        The error message, if not successful
    """
    prep_id = parameters.get('template')
    analysis_id = parameters.get('analysis')
    files = loads(parameters['files'])
    a_type = parameters['artifact_type']

    if a_type != "BIOM":
        return (False, None, "Unknown artifact type %s. Supported types: BIOM"
                             % a_type)

    qclient.update_job_step(job_id, "Step 1: Collecting metadata")
    # The prep template takes precedence over the analysis when both are given
    if prep_id is not None:
        is_analysis = False
        metadata = qclient.get("/qiita_db/prep_template/%s/data/" % prep_id)
        metadata = metadata['data']

        # md is what gets handed to the HTML summary generation below
        qurl = ('/qiita_db/prep_template/%s/' % prep_id)
        md = qclient.get(qurl)['qiime-map']
    elif analysis_id is not None:
        is_analysis = True
        metadata = qclient.get("/qiita_db/analysis/%s/metadata/" % analysis_id)

        md = metadata
    else:
        return (False, None, "Missing metadata information")

    # Check if the biom table has the same sample ids as the prep info
    qclient.update_job_step(job_id, "Step 2: Validating BIOM file")
    new_biom_fp = biom_fp = files['biom'][0]
    table = load_table(biom_fp)
    metadata_ids = set(metadata)
    biom_sample_ids = set(table.ids())

    if not metadata_ids.issuperset(biom_sample_ids):
        # The BIOM sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing BIOM sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the BIOM file
        if 'run_prefix' in metadata[next(iter(metadata_ids))]:
            id_map = {v['run_prefix']: k for k, v in metadata.items()}
        else:
            # Attempt 2: the sample ids in the BIOM table are the same that in
            # the prep template but without the prefix
            prefix = next(iter(metadata_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in biom_sample_ids)
            if metadata_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in biom_sample_ids}
            else:
                # There is nothing we can do. The samples in the BIOM table do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the BIOM table do not match '
                             'the ones in the prep information. Please, '
                             'provide the column "run_prefix" in the prep '
                             'information to map the existing sample ids to '
                             'the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        try:
            table.update_ids(id_map, axis='sample')
        except TableException:
            # sorted so the message is deterministic (set order is arbitrary)
            missing = sorted(biom_sample_ids - set(id_map))
            error_msg = ('Your prep information is missing samples that are '
                         'present in your BIOM table: %s' % ', '.join(missing))
            return False, None, error_msg

        # The corrected table is written to the output directory; the input
        # file is left untouched
        new_biom_fp = join(out_dir, basename(biom_fp))
        with biom_open(new_biom_fp, 'w') as f:
            table.to_hdf5(f, "Qiita BIOM type plugin")

    filepaths = [(new_biom_fp, 'biom')]

    # Validate the representative set, if it exists
    if 'preprocessed_fasta' in files:
        repset_fp = files['preprocessed_fasta'][0]

        # The observations ids of the biom table should be the same
        # as the representative sequences ids found in the representative set
        observation_ids = table.ids(axis='observation').tolist()
        # Track the not-yet-seen ids in a set: membership test and removal
        # are O(1), whereas list.remove would rescan the list per record.
        # This relies on the table's observation ids being unique.
        unseen_ids = set(observation_ids)
        extra_ids = []
        for record in load([repset_fp], constructor=FastaIterator):
            rec_id = record['SequenceID'].split()[0]
            if rec_id in unseen_ids:
                unseen_ids.remove(rec_id)
            else:
                # either not in the table at all, or a repeated sequence id
                extra_ids.append(rec_id)

        # report the missing ids in the order in which the table lists them
        missing_ids = [o for o in observation_ids if o in unseen_ids]

        error_msg = []
        if extra_ids:
            error_msg.append("The representative set sequence file includes "
                             "observations not found in the BIOM table: %s"
                             % ', '.join(extra_ids))
        if missing_ids:
            error_msg.append("The representative set sequence file is missing "
                             "observation ids found in the BIOM table: %s" %
                             ', '.join(missing_ids))

        if error_msg:
            return False, None, '\n'.join(error_msg)

        filepaths.append((repset_fp, 'preprocessed_fasta'))

    # Validate the sequence specific phylogenetic tree (e.g. generated
    # by SEPP for Deblur), if it exists
    tree = None
    if 'plain_text' in files:
        phylogeny_fp = files['plain_text'][0]

        # Only the parse is guarded: any failure to read the tree is turned
        # into an error result for the job instead of an uncaught exception.
        # NOTE(review): the message says "scikit-biom"; presumably scikit-bio
        # is meant - confirm before changing the user-facing text.
        try:
            tree = TreeNode.read(phylogeny_fp)
        except Exception:
            return False, None, ("Phylogenetic tree cannot be parsed "
                                 "via scikit-biom")
        filepaths.append((phylogeny_fp, 'plain_text'))

    # Any other filepath type is passed through unchanged
    for fp_type, fps in files.items():
        if fp_type not in ('biom', 'preprocessed_fasta', 'plain_text'):
            for fp in fps:
                filepaths.append((fp, fp_type))

    index_fp, viz_fp, qza_fp = _generate_html_summary(
        new_biom_fp, md, join(out_dir), is_analysis, tree)

    filepaths.append((index_fp, 'html_summary'))
    filepaths.append((viz_fp, 'html_summary_dir'))
    # the generated qza is only attached when the caller did not supply one
    if 'qza' not in files:
        filepaths.append((qza_fp, 'qza'))

    return True, [ArtifactInfo(None, 'BIOM', filepaths)], ""