Example #1
0
    def _generate_demultiplexed_fastq_demux(self, mtime):
        """Write one gzipped fastq per sample out of the artifact demux file

        Parameters
        ----------
        mtime : object
            Value forwarded as the ``mtime`` argument of ``GzipFile``

        Returns
        -------
        set
            The samples for which at least one record was written

        Raises
        ------
        EBISubmissionError
            If the object returned by ``open_file`` is not a ``File``

        Notes
        -----
        Samples that yield no records are dropped from ``self.samples``,
        ``self.samples_prep`` and ``self.sample_demux_fps``, and the gz file
        that was just created for them is removed.
        """
        # Only one `preprocessed_demux` filepath is expected per artifact,
        # so the first match is the one that gets used.
        demux_candidates = [fp for _, fp, fp_type in self.artifact.filepaths
                            if fp_type == 'preprocessed_demux']
        demux_fp = demux_candidates[0]

        written_samples = set()
        with open_file(demux_fp) as demux_fh:
            if not isinstance(demux_fh, File):
                error_msg = "'%s' doesn't look like a demux file" % demux_fp
                LogEntry.create('Runtime', error_msg)
                raise EBISubmissionError(error_msg)

            per_sample = to_per_sample_ascii(
                demux_fh, self.prep_template.keys())
            for sample, records in per_sample:
                gz_fp = self.sample_demux_fps[sample]

                # Stays at 0 when the sample has no records at all
                n_records = 0
                with GzipFile(gz_fp, mode='w', mtime=mtime) as gz_fh:
                    for n_records, record in enumerate(records, 1):
                        gz_fh.write(record)

                if n_records:
                    written_samples.add(sample)
                    continue

                # Nothing was written: forget the sample and its empty file
                del self.samples[sample]
                del self.samples_prep[sample]
                del self.sample_demux_fps[sample]
                remove(gz_fp)
        return written_samples
Example #2
0
    def _generate_demultiplexed_fastq_demux(self, mtime):
        """Split the artifact demux file into per-sample fastq.gz files

        Parameters
        ----------
        mtime : object
            Handed to ``GzipFile`` as its ``mtime`` argument

        Returns
        -------
        set
            Samples that had at least one record written to their gz file

        Raises
        ------
        EBISubmissionError
            If ``open_file`` does not hand back a ``File`` instance

        Notes
        -----
        A sample without records is deleted from ``self.samples``,
        ``self.samples_prep`` and ``self.sample_demux_fps``; the gz file
        opened for it is removed from disk.
        """
        # The artifact holds a single `preprocessed_demux` file, hence
        # picking the first (and only) match.
        matching_fps = [
            fp for _, fp, kind in self.artifact.filepaths
            if kind == 'preprocessed_demux'
        ]
        demux_fp = matching_fps[0]

        samples_with_reads = set()
        with open_file(demux_fp) as demux_fh:
            if not isinstance(demux_fh, File):
                error_msg = "'%s' doesn't look like a demux file" % demux_fp
                LogEntry.create('Runtime', error_msg)
                raise EBISubmissionError(error_msg)

            sample_names = self.prep_template.keys()
            for name, reads in to_per_sample_ascii(demux_fh, sample_names):
                out_fp = self.sample_demux_fps[name]

                has_reads = False
                with GzipFile(out_fp, mode='w', mtime=mtime) as out_fh:
                    for read in reads:
                        out_fh.write(read)
                        has_reads = True

                if not has_reads:
                    # Drop every trace of the sample, including the gz file
                    # that was created above
                    del self.samples[name]
                    del self.samples_prep[name]
                    del self.sample_demux_fps[name]
                    remove(out_fp)
                else:
                    samples_with_reads.add(name)
        return samples_with_reads
Example #3
0
    def test_to_per_sample_ascii(self):
        """to_per_sample_ascii yields each sample with its fastq records"""
        # Write the fixture to a file that outlives the `with` block so it
        # can be handed to to_hdf5 by name; it is queued for removal below.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as fastq_fh:
            fastq_fh.write(fqdata)

        fastq_fp = fastq_fh.name
        self.to_remove.append(fastq_fp)
        to_hdf5(fastq_fp, self.hdf5_file)

        record_a0 = (b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\n"
                     b"xyz\n+\nABC\n")
        record_b0 = (b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\n"
                     b"qwe\n+\nDFG\n")
        record_b1 = (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\n"
                     b"qwe\n+\nDEF\n")
        exp = [(b'a', [record_a0]),
               (b'b', [record_b0, record_b1])]

        # Materialise each per-sample iterator so it can be compared
        obs = []
        for sample, records in to_per_sample_ascii(self.hdf5_file):
            obs.append((sample, list(records)))
        self.assertEqual(obs, exp)
Example #4
0
    def test_fetch_qual_length_bug(self):
        """Quality strings must be as long as their sequence

        Regression test: fetch used to leave qual untrimmed, producing
        quality scores for positions past the end of the sequence.
        """
        # delete=False keeps the file around after the `with` block so that
        # to_hdf5 can open it by name; cleanup goes through self.to_remove.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as fastq_fh:
            fastq_fh.write(fqdata_variable_length)

        fastq_fp = fastq_fh.name
        self.to_remove.append(fastq_fp)
        to_hdf5(fastq_fp, self.hdf5_file)

        record_a0 = (b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\n"
                     b"xyz\n+\nABC\n")
        record_b0 = (b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\n"
                     b"qwe\n+\nDFG\n")
        # The longer read: five bases, five quality characters
        record_b1 = (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\n"
                     b"qwexx\n+\nDEF#G\n")
        exp = [(b'a', [record_a0]),
               (b'b', [record_b0, record_b1])]

        obs = []
        for sample, records in to_per_sample_ascii(self.hdf5_file):
            obs.append((sample, list(records)))
        self.assertEqual(obs, exp)
Example #5
0
    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Generates demultiplexed fastq

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, it forces the rewrite of the fastq files, even when
            self.full_ebi_dir already exists
        mtime : float, optional
            The time to use when creating the gz files. If None, the current
            time will be used by gzip.GzipFile. This is useful for testing.

        Returns
        -------
        set
            The names of the successfully demultiplexed samples

        Notes
        -----
        - As a performance feature, this method will check if
        self.full_ebi_dir already exists and, if it does (and rewrite_fastq is
        not set), it will assume that in a previous execution this step was
        performed correctly and will simply read the file names from
        self.full_ebi_dir
        - When the object is created (init), samples, samples_prep and
        sample_demux_fps hold values for all available samples in the database.
        Here some of those values will be deleted (del's, within the loops) for
        those cases where the fastq.gz files weren't written or don't exist.
        This is an indication that they had no sequences and this kind of
        files are not accepted in EBI

        Raises
        ------
        EBISubmissionError
            - The demux file couldn't be read
            - All samples are removed
        IndexError
            If the artifact has no filepath of type `preprocessed_demux`
        """
        ar = self.artifact

        dir_not_exists = not isdir(self.full_ebi_dir)
        if dir_not_exists or rewrite_fastq:
            # Create the directory only when it is missing. makedirs raises
            # if the path already exists, which made rewrite_fastq=True fail
            # on any directory left behind by a previous run.
            if dir_not_exists:
                makedirs(self.full_ebi_dir)

            # An artifact will hold only one file of type `preprocessed_demux`
            # Thus, we only use the first one (the only one present)
            demux = [
                path for _, path, ftype in ar.filepaths
                if ftype == 'preprocessed_demux'
            ][0]

            demux_samples = set()
            with open_file(demux) as demux_fh:
                if not isinstance(demux_fh, File):
                    error_msg = "'%s' doesn't look like a demux file" % demux
                    LogEntry.create('Runtime', error_msg)
                    raise EBISubmissionError(error_msg)
                for s, i in to_per_sample_ascii(demux_fh,
                                                self.prep_template.keys()):
                    sample_fp = self.sample_demux_fps[s]
                    wrote_sequences = False
                    # mode='w' truncates, so files from a previous run are
                    # overwritten in place
                    with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                        for record in i:
                            fh.write(record)
                            wrote_sequences = True

                    if wrote_sequences:
                        demux_samples.add(s)
                    else:
                        # No sequences: drop the sample and its empty gz file
                        del self.samples[s]
                        del self.samples_prep[s]
                        del self.sample_demux_fps[s]
                        remove(sample_fp)
        else:
            # Reuse a previous run: the sample names are recovered from the
            # `<sample>.fastq.gz` files found in the directory
            demux_samples = set()
            extension = '.fastq.gz'
            extension_len = len(extension)
            for f in listdir(self.full_ebi_dir):
                fpath = join(self.full_ebi_dir, f)
                if isfile(fpath) and f.endswith(extension):
                    demux_samples.add(f[:-extension_len])

            # Samples without a fastq.gz file are not part of the submission
            missing_samples = set(self.samples).difference(demux_samples)
            for ms in missing_samples:
                del self.samples[ms]
                del self.samples_prep[ms]
                del self.sample_demux_fps[ms]

        if not demux_samples:
            error_msg = ("All samples were removed from the submission "
                         "because the demux file is empty or the sample names "
                         "do not match.")
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)
        return demux_samples
Example #6
0
    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Generates demultiplexed fastq

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, it forces the rewrite of the fastq files, even when
            self.full_ebi_dir already exists
        mtime : float, optional
            The time to use when creating the gz files. If None, the current
            time will be used by gzip.GzipFile. This is useful for testing.

        Returns
        -------
        set
            The names of the successfully demultiplexed samples

        Notes
        -----
        - As a performance feature, this method will check if
        self.full_ebi_dir already exists and, if it does (and rewrite_fastq is
        not set), it will assume that in a previous execution this step was
        performed correctly and will simply read the file names from
        self.full_ebi_dir
        - When the object is created (init), samples, samples_prep and
        sample_demux_fps hold values for all available samples in the database.
        Here some of those values will be deleted (del's, within the loops) for
        those cases where the fastq.gz files weren't written or don't exist.
        This is an indication that they had no sequences and this kind of
        files are not accepted in EBI

        Raises
        ------
        EBISubmissionError
            - The demux file couldn't be read
            - All samples are removed
        IndexError
            If the artifact has no filepath of type `preprocessed_demux`
        """
        ar = self.artifact

        dir_not_exists = not isdir(self.full_ebi_dir)
        if dir_not_exists or rewrite_fastq:
            # makedirs raises when the path already exists, so it must only
            # run for a missing directory; otherwise rewrite_fastq=True would
            # fail on a directory produced by an earlier execution.
            if dir_not_exists:
                makedirs(self.full_ebi_dir)

            # An artifact will hold only one file of type `preprocessed_demux`
            # Thus, we only use the first one (the only one present)
            demux = [path for _, path, ftype in ar.filepaths
                     if ftype == 'preprocessed_demux'][0]

            demux_samples = set()
            with open_file(demux) as demux_fh:
                if not isinstance(demux_fh, File):
                    error_msg = "'%s' doesn't look like a demux file" % demux
                    LogEntry.create('Runtime', error_msg)
                    raise EBISubmissionError(error_msg)
                for s, i in to_per_sample_ascii(demux_fh,
                                                self.prep_template.keys()):
                    sample_fp = self.sample_demux_fps[s]
                    wrote_sequences = False
                    # Opening with mode='w' replaces any file left by a
                    # previous run
                    with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                        for record in i:
                            fh.write(record)
                            wrote_sequences = True

                    if wrote_sequences:
                        demux_samples.add(s)
                    else:
                        # The sample had no sequences: remove it and the
                        # empty gz file that was just created
                        del self.samples[s]
                        del self.samples_prep[s]
                        del self.sample_demux_fps[s]
                        remove(sample_fp)
        else:
            # The directory is trusted as the output of a previous run; each
            # `<sample>.fastq.gz` file name gives back one sample name
            demux_samples = set()
            extension = '.fastq.gz'
            extension_len = len(extension)
            for f in listdir(self.full_ebi_dir):
                fpath = join(self.full_ebi_dir, f)
                if isfile(fpath) and f.endswith(extension):
                    demux_samples.add(f[:-extension_len])

            # Anything known to the object but absent on disk is dropped
            missing_samples = set(self.samples).difference(demux_samples)
            for ms in missing_samples:
                del self.samples[ms]
                del self.samples_prep[ms]
                del self.sample_demux_fps[ms]

        if not demux_samples:
            error_msg = ("All samples were removed from the submission "
                         "because the demux file is empty or the sample names "
                         "do not match.")
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)
        return demux_samples