Example #1
0
 def test_force_constructor(self):
     """load should honour a constructor that is passed explicitly"""
     expected = [
         {
             'Sequence': 'AATTGG',
             'SequenceID': 'seq1',
             'Qual': None,
             'QualID': None,
         },
         {
             'Sequence': 'ATATA',
             'SequenceID': 'seq2',
             'Qual': None,
             'QualID': None,
         },
     ]

     observed = []
     for record in load([self.noext], constructor=FastaIterator):
         # snapshot each record as it is yielded
         observed.append(record.copy())

     self.assertEqual(observed, expected)
Example #2
0
def to_hdf5(fp, h5file, max_barcode_length=12):
    """Represent demux data in an h5file

    Parameters
    ----------
    fp : filepath
        The filepath containing either FASTA or FASTQ data.
    h5file : h5py.File
        The file to write into.
    max_barcode_length : int, optional
        Not used by this function; it is kept in the signature so that
        existing callers that pass it continue to work.

    Raises
    ------
    ValueError
        If a sequence ID does not match the expected split libraries layout
        (sample and read number followed by the "orig_bc", "new_bc" and
        "bc_diffs" fields).

    Notes
    -----
    A group, per sample, will be created and within that group, 5 datasets will
    be constructed that correspond to sequence, original_barcode,
    corrected_barcode, barcode_errors, and qual.

    The filepath is required as the file is read more than once: first to
    collect summary statistics, then to write the records.

    The expectation is that the filepath being operated on is the result of
    split_libraries.py or split_libraries_fastq.py from QIIME. This code makes
    assumptions about items in the comment line that are added by split
    libraries. Specifically, the code looks for a "new_bc", an "orig_bc" and a
    "bc_diffs" field, and additionally assumes the sample ID is encoded in the
    ID.
    """
    # Both fragments must be raw strings: the pattern contains regex escapes
    # such as \d, which are invalid escape sequences in plain string literals.
    id_pattern = (r'^(?P<sample>.+?)_\d+? .*orig_bc=(?P<orig_bc>.+?) '
                  r'new_bc=(?P<corr_bc>.+?) bc_diffs=(?P<bc_diffs>\d+)')

    # walk over the file and collect summary stats
    sample_stats, full_stats = _summarize_lengths(_per_sample_lengths(fp))

    # construct the datasets, storing per sample stats and full file stats
    buffers = _construct_datasets(sample_stats, h5file)
    _set_attr_stats(h5file, full_stats)
    h5file.attrs['has-qual'] = _has_qual(fp)

    for rec in load(fp):
        result = search(id_pattern, rec['SequenceID'])

        if result is None:
            raise ValueError("%s doesn't appear to be split libraries "
                             "output!" % fp)

        sample = result.group('sample')
        bc_diffs = result.group('bc_diffs')
        corr_bc = result.group('corr_bc')
        orig_bc = result.group('orig_bc')

        sequence = rec['Sequence']
        qual = rec['Qual']

        # buffers are keyed by "<sample>/<dataset path>"
        pjoin = partial(os.path.join, sample)
        buffers[pjoin(dset_paths['sequence'])].write(sequence)
        buffers[pjoin(dset_paths['barcode_original'])].write(orig_bc)
        buffers[pjoin(dset_paths['barcode_corrected'])].write(corr_bc)
        buffers[pjoin(dset_paths['barcode_error'])].write(bc_diffs)

        # quality scores are only written for records that carry them
        if qual is not None:
            buffers[pjoin(dset_paths['qual'])].write(qual)
Example #3
0
def to_hdf5(fp, h5file, max_barcode_length=12):
    """Represent demux data in an h5file

    Parameters
    ----------
    fp : filepath
        The filepath containing either FASTA or FASTQ data.
    h5file : h5py.File
        The file to write into.
    max_barcode_length : int, optional
        Not used by this function; it is kept in the signature so that
        existing callers that pass it continue to work.

    Raises
    ------
    ValueError
        If a sequence ID does not match the expected split libraries layout
        (sample and read number followed by the "orig_bc", "new_bc" and
        "bc_diffs" fields).

    Notes
    -----
    A group, per sample, will be created and within that group, 5 datasets will
    be constructed that correspond to sequence, original_barcode,
    corrected_barcode, barcode_errors, and qual.

    The filepath is required as the file is read more than once: first to
    collect summary statistics, then to write the records.

    The expectation is that the filepath being operated on is the result of
    split_libraries.py or split_libraries_fastq.py from QIIME. This code makes
    assumptions about items in the comment line that are added by split
    libraries. Specifically, the code looks for a "new_bc", an "orig_bc" and a
    "bc_diffs" field, and additionally assumes the sample ID is encoded in the
    ID.
    """
    # Both fragments must be raw strings: the pattern contains regex escapes
    # such as \d, which are invalid escape sequences in plain string literals.
    id_pattern = (r'^(?P<sample>.+?)_\d+? .*orig_bc=(?P<orig_bc>.+?) '
                  r'new_bc=(?P<corr_bc>.+?) bc_diffs=(?P<bc_diffs>\d+)')

    # walk over the file and collect summary stats
    sample_stats, full_stats = _summarize_lengths(_per_sample_lengths(fp))

    # construct the datasets, storing per sample stats and full file stats
    buffers = _construct_datasets(sample_stats, h5file)
    _set_attr_stats(h5file, full_stats)
    h5file.attrs['has-qual'] = _has_qual(fp)

    for rec in load(fp):
        result = search(id_pattern, rec['SequenceID'])

        if result is None:
            raise ValueError("%s doesn't appear to be split libraries "
                             "output!" % fp)

        sample = result.group('sample')
        bc_diffs = result.group('bc_diffs')
        corr_bc = result.group('corr_bc')
        orig_bc = result.group('orig_bc')

        sequence = rec['Sequence']
        qual = rec['Qual']

        # buffers are keyed by "<sample>/<dataset path>"
        pjoin = partial(os.path.join, sample)
        buffers[pjoin(dset_paths['sequence'])].write(sequence)
        buffers[pjoin(dset_paths['barcode_original'])].write(orig_bc)
        buffers[pjoin(dset_paths['barcode_corrected'])].write(corr_bc)
        buffers[pjoin(dset_paths['barcode_error'])].write(bc_diffs)

        # quality scores are only written for records that carry them
        if qual is not None:
            buffers[pjoin(dset_paths['qual'])].write(qual)
Example #4
0
    def test_transform(self):
        """load should pass transform methods to the iterators"""
        def reverse_record(record):
            # reverse the sequence, and the quality scores when present
            record['Sequence'] = record['Sequence'][::-1]
            quals = record['Qual']
            record['Qual'] = None if quals is None else quals[::-1]

        records = load([self.fq1gz, self.fna1], transform=reverse_record,
                       phred_offset=64)
        obs = [rec.copy() for rec in records]
        exp = [{'Sequence': 'CGTA', 'SequenceID': 's1',
                'QualID': 's1', 'Qual': array([40, 40, 40, 40])},
               {'Sequence': 'GGTTAA', 'SequenceID': 's2',
                'QualID': 's2', 'Qual': array([40, 40, 39, 39, 39, 39])},
               {'Sequence': 'CGTA', 'SequenceID': 's1',
                'QualID': None, 'Qual': None},
               {'Sequence': 'GGTTAA', 'SequenceID': 's2',
                'QualID': None, 'Qual': None}]

        scalar_keys = ('Sequence', 'SequenceID', 'QualID')

        # records 0 and 1 are expected to carry quality arrays, which are
        # compared elementwise
        for idx in (0, 1):
            o, e = obs[idx], exp[idx]
            for key in scalar_keys:
                self.assertEqual(o[key], e[key])
            self.assertTrue((o['Qual'] == e['Qual']).all())

        # records 2 and 3 are expected to have no quality information
        for idx in (2, 3):
            o, e = obs[idx], exp[idx]
            for key in scalar_keys:
                self.assertEqual(o[key], e[key])
            self.assertEqual(o['Qual'], e['Qual'])
Example #5
0
    def test_multiple_files(self):
        """load should handle multiple files of different types"""
        records = load([self.fq1, self.fna1], phred_offset=64)
        obs = [rec.copy() for rec in records]
        exp = [{'Sequence': 'ATGC', 'SequenceID': 's1',
                'QualID': 's1', 'Qual': array([40, 40, 40, 40])},
               {'Sequence': 'AATTGG', 'SequenceID': 's2',
                'QualID': 's2', 'Qual': array([39, 39, 39, 39, 40, 40])},
               {'Sequence': 'ATGC', 'SequenceID': 's1',
                'QualID': None, 'Qual': None},
               {'Sequence': 'AATTGG', 'SequenceID': 's2',
                'QualID': None, 'Qual': None}]

        scalar_keys = ('Sequence', 'SequenceID', 'QualID')

        # records 0 and 1 are expected to carry quality arrays, which are
        # compared elementwise
        for idx in (0, 1):
            o, e = obs[idx], exp[idx]
            for key in scalar_keys:
                self.assertEqual(o[key], e[key])
            self.assertTrue((o['Qual'] == e['Qual']).all())

        # records 2 and 3 are expected to have no quality information
        for idx in (2, 3):
            o, e = obs[idx], exp[idx]
            for key in scalar_keys:
                self.assertEqual(o[key], e[key])
            self.assertEqual(o['Qual'], e['Qual'])
Example #6
0
    def test_single_files(self):
        """load should handle a single file, and can be gzipped"""
        def read_all(*args, **kwargs):
            # materialise every record yielded by load as its own copy
            return [rec.copy() for rec in load(*args, **kwargs)]

        def check_with_qual(observed, expected):
            # quality scores are arrays, so they are compared elementwise
            for o, e in zip(observed, expected):
                self.assertEqual(o['Sequence'], e['Sequence'])
                self.assertEqual(o['SequenceID'], e['SequenceID'])
                self.assertEqual(o['QualID'], e['QualID'])
                self.assertTrue((o['Qual'] == e['Qual']).all())

        fasta_exp = [
            {
                'Sequence': 'ATGC',
                'SequenceID': 's1',
                'QualID': None,
                'Qual': None,
            },
            {
                'Sequence': 'AATTGG',
                'SequenceID': 's2',
                'QualID': None,
                'Qual': None,
            },
        ]
        fastq_exp = [
            {
                'Sequence': 'ATGC',
                'SequenceID': 's1',
                'QualID': 's1',
                'Qual': array([40, 40, 40, 40]),
            },
            {
                'Sequence': 'AATTGG',
                'SequenceID': 's2',
                'QualID': 's2',
                'Qual': array([39, 39, 39, 39, 40, 40]),
            },
        ]

        # plain files
        self.assertEqual(read_all(self.fna1), fasta_exp)
        check_with_qual(read_all(self.fq1, phred_offset=64), fastq_exp)

        # gzipped files
        self.assertEqual(read_all(self.fna1gz), fasta_exp)
        check_with_qual(read_all(self.fq1gz, phred_offset=64), fastq_exp)
Example #7
0
 def test_force_constructor(self):
     """load should honour a constructor that is passed explicitly"""
     expected = [{'Sequence': 'AATTGG', 'SequenceID': 'seq1',
                  'Qual': None, 'QualID': None},
                 {'Sequence': 'ATATA', 'SequenceID': 'seq2',
                  'Qual': None, 'QualID': None}]

     observed = []
     for record in load([self.noext], constructor=FastaIterator):
         # snapshot each record as it is yielded
         observed.append(record.copy())

     self.assertEqual(observed, expected)
Example #8
0
    def test_transform(self):
        """load should pass transform methods to the iterators"""
        def reverse_record(record):
            # reverse the sequence, and the quality scores when present
            record['Sequence'] = record['Sequence'][::-1]
            quals = record['Qual']
            record['Qual'] = None if quals is None else quals[::-1]

        records = load([self.fq1gz, self.fna1], transform=reverse_record,
                       phred_offset=64)
        obs = [rec.copy() for rec in records]
        exp = [
            {
                'Sequence': 'CGTA',
                'SequenceID': 's1',
                'QualID': 's1',
                'Qual': array([40, 40, 40, 40]),
            },
            {
                'Sequence': 'GGTTAA',
                'SequenceID': 's2',
                'QualID': 's2',
                'Qual': array([40, 40, 39, 39, 39, 39]),
            },
            {
                'Sequence': 'CGTA',
                'SequenceID': 's1',
                'QualID': None,
                'Qual': None,
            },
            {
                'Sequence': 'GGTTAA',
                'SequenceID': 's2',
                'QualID': None,
                'Qual': None,
            },
        ]

        scalar_keys = ('Sequence', 'SequenceID', 'QualID')

        # records 0 and 1 are expected to carry quality arrays, which are
        # compared elementwise
        for idx in (0, 1):
            o, e = obs[idx], exp[idx]
            for key in scalar_keys:
                self.assertEqual(o[key], e[key])
            self.assertTrue((o['Qual'] == e['Qual']).all())

        # records 2 and 3 are expected to have no quality information
        for idx in (2, 3):
            o, e = obs[idx], exp[idx]
            for key in scalar_keys:
                self.assertEqual(o[key], e[key])
            self.assertEqual(o['Qual'], e['Qual'])
Example #9
0
def _per_sample_lengths(fp):
    """Determine the lengths of all sequences per sample

    Parameters
    ----------
    fp : filepath
        The sequence file to walk over

    Returns
    -------
    dict
        {sample_id: [sequence_length]}

    Notes
    -----
    The sample ID is taken from the first space-delimited token of the
    sequence ID by dropping its last underscore and everything after it, so
    sample IDs that themselves contain underscores are kept whole.
    """
    lengths = defaultdict(list)
    for record in load(fp):
        # e.g. "a_b_12 orig_bc=..." -> "a_b_12" -> "a_b"; splitting on the
        # first underscore instead would truncate the sample ID to "a"
        read_name = record['SequenceID'].split(' ')[0]
        sample_id = read_name.rsplit('_', 1)[0]
        lengths[sample_id].append(len(record['Sequence']))

    return lengths
Example #10
0
def _per_sample_lengths(fp):
    """Collect the sequence lengths observed for each sample

    Parameters
    ----------
    fp : filepath
        The sequence file to walk over

    Returns
    -------
    dict
        {sample_id: [sequence_length]}
    """
    per_sample = defaultdict(list)
    for rec in load(fp):
        # keep only the text before the first space, then drop the last
        # underscore and whatever follows it
        read_name = rec['SequenceID'].split(' ')[0]
        sample = read_name.rsplit('_', 1)[0]
        seq_length = len(rec['Sequence'])
        per_sample[sample].append(seq_length)

    return per_sample
Example #11
0
    def test_multiple_files(self):
        """load should handle multiple files of different types"""
        records = load([self.fq1, self.fna1], phred_offset=64)
        obs = [rec.copy() for rec in records]
        exp = [
            {
                'Sequence': 'ATGC',
                'SequenceID': 's1',
                'QualID': 's1',
                'Qual': array([40, 40, 40, 40]),
            },
            {
                'Sequence': 'AATTGG',
                'SequenceID': 's2',
                'QualID': 's2',
                'Qual': array([39, 39, 39, 39, 40, 40]),
            },
            {
                'Sequence': 'ATGC',
                'SequenceID': 's1',
                'QualID': None,
                'Qual': None,
            },
            {
                'Sequence': 'AATTGG',
                'SequenceID': 's2',
                'QualID': None,
                'Qual': None,
            },
        ]

        scalar_keys = ('Sequence', 'SequenceID', 'QualID')

        # records 0 and 1 are expected to carry quality arrays, which are
        # compared elementwise
        for idx in (0, 1):
            o, e = obs[idx], exp[idx]
            for key in scalar_keys:
                self.assertEqual(o[key], e[key])
            self.assertTrue((o['Qual'] == e['Qual']).all())

        # records 2 and 3 are expected to have no quality information
        for idx in (2, 3):
            o, e = obs[idx], exp[idx]
            for key in scalar_keys:
                self.assertEqual(o[key], e[key])
            self.assertEqual(o['Qual'], e['Qual'])
Example #12
0
 def test_multiple_types_fasta_fastq_qual(self):
     """load should raise ValueError for fasta and fastq given with qual"""
     mixed_inputs = [self.fna1, self.fq1]
     self.assertRaises(ValueError, load, mixed_inputs, qual=self.qual1)
Example #13
0
 def test_file_path_does_not_exist(self):
     """load should raise IOError for a path that is expected not to exist"""
     missing_fp = ('this-seqs-file-had-better-not-exist-or-this-test-will-'
                   'fail.fna')
     self.assertRaises(IOError, load, missing_fp)
Example #14
0
 def test_unknown_filetype(self):
     """load should raise IOError when given 'seqs.mpeg'"""
     self.assertRaises(IOError, load, 'seqs.mpeg')
Example #15
0
 def test_no_seqs(self):
     """load should raise ValueError for each kind of empty input"""
     empty_inputs = ('', [], (), None)
     for candidate in empty_inputs:
         self.assertRaises(ValueError, load, candidate)
Example #16
0
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool, list or None, str
        Whether the validation succeeded, the artifacts information (None on
        failure) and the error message (empty string on success)
    """
    pt_sample_ids = set(prep_info)
    # Open read-only: this pass only lists the sample ids stored in the file
    with File(demux_fp, 'r') as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information to '
                             'map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing sample'
                             ' ids to the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        copy(demux_fp, new_demux_fp)
        # Need to catch an error
        with File(new_demux_fp, 'r+') as f:
            # Snapshot the names before renaming so that the loop does not
            # iterate over a group that it is modifying at the same time
            for old in list(f):
                f.move(old, id_map[old])

        # When we fix, we always generate the FASTQ and FASTA file
        # By setting them to None, below will be generated
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        with open(fastq_fp, 'w') as fq:
            with File(demux_fp, 'r') as dx:
                for record in to_ascii(dx):
                    fq.write(record)

    if not fasta_fp:
        # The fasta file is derived from the fastq file, not from the demux
        fasta_fp = join(out_dir, "%s.fasta" % name)
        with open(fasta_fp, 'w') as f:
            for r in load(fastq_fp):
                f.write(format_fasta_record(r['SequenceID'], r['Sequence'],
                                            r['Qual']))

    filepaths = [[[fastq_fp], 'preprocessed_fastq'],
                 [[fasta_fp], 'preprocessed_fasta'],
                 [[demux_fp], 'preprocessed_demux']]
    if log_fp:
        filepaths.append([[log_fp], 'log'])
    return True, [[None, 'Demultiplexed', filepaths]], ""
Example #17
0
 def test_multiple_types_fasta_fastq_qual(self):
     """load should raise ValueError for fasta and fastq given with qual"""
     mixed_inputs = [self.fna1, self.fq1]
     self.assertRaises(ValueError, load, mixed_inputs, qual=self.qual1)
Example #18
0
 def test_file_path_does_not_exist(self):
     """load should raise IOError for a path that is expected not to exist"""
     missing_fp = ('this-seqs-file-had-better-not-exist-or-this-test-will-'
                   'fail.fna')
     self.assertRaises(IOError, load, missing_fp)
Example #19
0
 def test_unknown_filetype(self):
     """load should raise IOError when given 'seqs.mpeg'"""
     self.assertRaises(IOError, load, 'seqs.mpeg')
Example #20
0
 def test_no_seqs(self):
     """load should raise ValueError for each kind of empty input"""
     empty_inputs = ('', [], (), None)
     for candidate in empty_inputs:
         self.assertRaises(ValueError, load, candidate)
Example #21
0
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    dict
        The results of the job, as built by ``format_payload``
        (NOTE(review): the return type is taken from the original docstring;
        confirm against ``format_payload``)
    """
    pt_sample_ids = set(prep_info)
    # Open read-only: this pass only lists the sample ids stored in the file
    with File(demux_fp, 'r') as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep template
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                return format_payload(
                    success=False,
                    error_msg='The sample ids in the "run_prefix" columns '
                              'from the prep information do not match the '
                              'ones in the demux file. Please, correct the '
                              'column "run_prefix" in the prep information to '
                              'map the existing sample ids to the prep '
                              'information sample ids.')
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                return format_payload(
                    success=False,
                    error_msg='The sample ids in the demultiplexed files do '
                              'not match the ones in the prep information. '
                              'Please, provide the column "run_prefix" in '
                              'the prep information to map the existing sample'
                              ' ids to the prep information sample ids.')
        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        copy(demux_fp, new_demux_fp)
        # Need to catch an error
        with File(new_demux_fp, 'r+') as f:
            # Snapshot the names before renaming so that the loop does not
            # iterate over a group that it is modifying at the same time
            for old in list(f):
                f.move(old, id_map[old])

        # When we fix, we always generate the FASTQ and FASTA file
        # By setting them to None, below will be generated
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        with open(fastq_fp, 'w') as fq:
            with File(demux_fp, 'r') as dx:
                for record in to_ascii(dx):
                    fq.write(record)

    if not fasta_fp:
        # The fasta file is derived from the fastq file, not from the demux
        fasta_fp = join(out_dir, "%s.fasta" % name)
        with open(fasta_fp, 'w') as f:
            for r in load(fastq_fp):
                f.write(format_fasta_record(r['SequenceID'], r['Sequence'],
                                            r['Qual']))

    filepaths = [[[fastq_fp], 'preprocessed_fastq'],
                 [[fasta_fp], 'preprocessed_fasta'],
                 [[demux_fp], 'preprocessed_demux']]
    if log_fp:
        filepaths.append([[log_fp], 'log'])
    return format_payload(
        success=True, artifacts_info=[[None, 'Demultiplexed', filepaths]])
Example #22
0
    def test_single_files(self):
        """load should handle a single file, and can be gzipped"""
        def read_all(*args, **kwargs):
            # materialise every record yielded by load as its own copy
            return [rec.copy() for rec in load(*args, **kwargs)]

        def check_with_qual(observed, expected):
            # quality scores are arrays, so they are compared elementwise
            for o, e in zip(observed, expected):
                self.assertEqual(o['Sequence'], e['Sequence'])
                self.assertEqual(o['SequenceID'], e['SequenceID'])
                self.assertEqual(o['QualID'], e['QualID'])
                self.assertTrue((o['Qual'] == e['Qual']).all())

        fasta_exp = [{'Sequence': 'ATGC', 'SequenceID': 's1',
                      'QualID': None, 'Qual': None},
                     {'Sequence': 'AATTGG', 'SequenceID': 's2',
                      'QualID': None, 'Qual': None}]
        fastq_exp = [{'Sequence': 'ATGC', 'SequenceID': 's1',
                      'QualID': 's1', 'Qual': array([40, 40, 40, 40])},
                     {'Sequence': 'AATTGG', 'SequenceID': 's2',
                      'QualID': 's2',
                      'Qual': array([39, 39, 39, 39, 40, 40])}]
        qseq_exp = [{'Qual': array([2, 27, 27, 27]),
                     'QualID': 'CRESSIA_242:1:2204:1453:1918#0/1',
                     'Sequence': 'TTAA',
                     'SequenceID': 'CRESSIA_242:1:2204:1453:1918#0/1'},
                    {'Qual': array([2, 2, 2, 2]),
                     'QualID': 'CRESSIA_242:1:2204:1491:1920#0/1',
                     'Sequence': 'AAAA',
                     'SequenceID': 'CRESSIA_242:1:2204:1491:1920#0/1'}]

        # plain files
        self.assertEqual(read_all(self.fna1), fasta_exp)
        check_with_qual(read_all(self.fq1, phred_offset=64), fastq_exp)
        check_with_qual(read_all(self.qs1, phred_offset=64), qseq_exp)

        # gzipped files
        self.assertEqual(read_all(self.fna1gz), fasta_exp)
        check_with_qual(read_all(self.fq1gz, phred_offset=64), fastq_exp)
        check_with_qual(read_all(self.qs1gz, phred_offset=64), qseq_exp)
Example #23
0
def _has_qual(fp):
    """Check if it looks like we have qual

    Parameters
    ----------
    fp : filepath
        The sequence file to inspect

    Returns
    -------
    bool
        True if the first record yielded by ``load`` has a 'Qual' entry that
        is not None. False otherwise, including when no record is yielded.
    """
    # Only the first record is inspected; the default keeps an input without
    # records from leaking StopIteration to the caller
    first = next(iter(load(fp)), None)
    if first is None:
        return False
    return first['Qual'] is not None
Example #24
0
def _has_qual(fp):
    """Check if it looks like we have qual

    Parameters
    ----------
    fp : filepath
        Passed straight through to ``load``.

    Returns
    -------
    bool
        Whether the first record produced by ``load(fp)`` has a ``'Qual'``
        value other than None. An input that produces no records is
        reported as False.

    Notes
    -----
    The decision is based on the first record only.
    """
    records = iter(load(fp))
    # next() with a default: an input with no records would otherwise
    # surface as an unexplained StopIteration in the caller.
    rec = next(records, None)
    return rec is not None and rec['Qual'] is not None
Example #25
0
    def test_single_files(self):
        """load should handle a single file, and can be gzipped

        The plain and gzipped variants of each fixture are expected to
        yield the same records, so each expectation is written once and
        checked against both.
        """
        def assert_records_match(observed, expected):
            # zip() stops at the shorter input, so without this length
            # check a loader yielding too few (or zero) records would
            # let the loop below pass vacuously.
            self.assertEqual(len(observed), len(expected))
            for o, e in zip(observed, expected):
                self.assertEqual(o['Sequence'], e['Sequence'])
                self.assertEqual(o['SequenceID'], e['SequenceID'])
                self.assertEqual(o['QualID'], e['QualID'])
                # the expected Qual values are array(...) objects, so
                # compare elementwise and reduce with .all()
                self.assertTrue((o['Qual'] == e['Qual']).all())

        exp_fasta = [{
            'Sequence': 'ATGC',
            'SequenceID': 's1',
            'QualID': None,
            'Qual': None
        }, {
            'Sequence': 'AATTGG',
            'SequenceID': 's2',
            'QualID': None,
            'Qual': None
        }]
        exp_fastq = [{
            'Sequence': 'ATGC',
            'SequenceID': 's1',
            'QualID': 's1',
            'Qual': array([40, 40, 40, 40])
        }, {
            'Sequence': 'AATTGG',
            'SequenceID': 's2',
            'QualID': 's2',
            'Qual': array([39, 39, 39, 39, 40, 40])
        }]

        # records are copied as they are yielded, as in the original test
        # uncompressed inputs
        obs = [rec.copy() for rec in load(self.fna1)]
        self.assertEqual(obs, exp_fasta)

        obs = [rec.copy() for rec in load(self.fq1, phred_offset=64)]
        assert_records_match(obs, exp_fastq)

        # gzipped inputs
        obs = [rec.copy() for rec in load(self.fna1gz)]
        self.assertEqual(obs, exp_fasta)

        obs = [rec.copy() for rec in load(self.fq1gz, phred_offset=64)]
        assert_records_match(obs, exp_fastq)
    def test_single_files(self):
        """load should handle a single file, and can be gzipped

        Covers the fna, fq and qs fixtures, each in plain and gzipped
        form; both forms are checked against the same expected records.
        """
        def assert_records_match(observed, expected):
            # Guard against zip() truncation: if load yields fewer
            # records than expected (even none), the per-record checks
            # below would otherwise be skipped without failing.
            self.assertEqual(len(observed), len(expected))
            for o, e in zip(observed, expected):
                self.assertEqual(o['Sequence'], e['Sequence'])
                self.assertEqual(o['SequenceID'], e['SequenceID'])
                self.assertEqual(o['QualID'], e['QualID'])
                # expected Qual values are array(...) objects: compare
                # elementwise, then require every position to agree
                self.assertTrue((o['Qual'] == e['Qual']).all())

        exp_fasta = [{'Sequence': 'ATGC', 'SequenceID': 's1',
                      'QualID': None, 'Qual': None},
                     {'Sequence': 'AATTGG', 'SequenceID': 's2',
                      'QualID': None, 'Qual': None}]
        exp_fastq = [{'Sequence': 'ATGC', 'SequenceID': 's1',
                      'QualID': 's1', 'Qual': array([40, 40, 40, 40])},
                     {'Sequence': 'AATTGG', 'SequenceID': 's2',
                      'QualID': 's2',
                      'Qual': array([39, 39, 39, 39, 40, 40])}]
        exp_qseq = [{'Qual': array([2, 27, 27, 27]),
                     'QualID': 'CRESSIA_242:1:2204:1453:1918#0/1',
                     'Sequence': 'TTAA',
                     'SequenceID': 'CRESSIA_242:1:2204:1453:1918#0/1'},
                    {'Qual': array([2, 2, 2, 2]),
                     'QualID': 'CRESSIA_242:1:2204:1491:1920#0/1',
                     'Sequence': 'AAAA',
                     'SequenceID': 'CRESSIA_242:1:2204:1491:1920#0/1'}]

        # records are copied as they are yielded, as in the original test
        # uncompressed inputs
        obs = [rec.copy() for rec in load(self.fna1)]
        self.assertEqual(obs, exp_fasta)

        obs = [rec.copy() for rec in load(self.fq1, phred_offset=64)]
        assert_records_match(obs, exp_fastq)

        obs = [rec.copy() for rec in load(self.qs1, phred_offset=64)]
        assert_records_match(obs, exp_qseq)

        # gzipped inputs
        obs = [rec.copy() for rec in load(self.fna1gz)]
        self.assertEqual(obs, exp_fasta)

        obs = [rec.copy() for rec in load(self.fq1gz, phred_offset=64)]
        assert_records_match(obs, exp_fastq)

        obs = [rec.copy() for rec in load(self.qs1gz, phred_offset=64)]
        assert_records_match(obs, exp_qseq)