コード例 #1
0
ファイル: test_demux.py プロジェクト: mortonjt/qiita
    def test_to_per_sample_ascii(self):
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        exp = [('a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                       "ABC\n")]),
               ('b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       "DFG\n"),
                      (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       "DEF\n")])]

        obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(obs, exp)
コード例 #2
0
ファイル: test_demux.py プロジェクト: qiyunzhu/qiita
    def test_to_per_sample_ascii(self):
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        exp = [('a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                       "ABC\n")]),
               ('b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       "DFG\n"),
                      (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       "DEF\n")])]

        obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(obs, exp)
コード例 #3
0
ファイル: test_demux.py プロジェクト: qiyunzhu/qiita
    def test_fetch_qual_length_bug(self):
        # fetch was not trimming qual to the length of the sequence resulting
        # in qual scores for positions beyond the length of the sequence.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata_variable_length)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        exp = [('a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                       "ABC\n")]),
               ('b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       "DFG\n"),
                      (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\n"
                       "DEF#G\n")])]

        obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(obs, exp)
コード例 #4
0
ファイル: test_demux.py プロジェクト: adamrp/qiita
    def test_fetch_qual_length_bug(self):
        # fetch was not trimming qual to the length of the sequence resulting
        # in qual scores for positions beyond the length of the sequence.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata_variable_length)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        exp = [('a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                       "ABC\n")]),
               ('b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       "DFG\n"),
                      (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\n"
                       "DEF#G\n")])]

        obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(obs, exp)
コード例 #5
0
ファイル: commands.py プロジェクト: MarkBruns/qiita
def submit_EBI(preprocessed_data_id, action, send, fastq_dir_fp=None):
    """Submit a preprocessed data to EBI

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocesssed data id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    fastq_dir_fp : str, optional
        The fastq filepath

    Notes
    -----
    If fastq_dir_fp is passed, it must not contain any empty files, or
    gzipped empty files
    """
    preprocessed_data = PreprocessedData(preprocessed_data_id)
    preprocessed_data_id_str = str(preprocessed_data_id)
    study = Study(preprocessed_data.study)
    sample_template = SampleTemplate(study.sample_template)
    prep_template = PrepTemplate(preprocessed_data.prep_template)

    investigation_type = None
    new_investigation_type = None

    status = preprocessed_data.submitted_to_insdc_status()
    if status in ("submitting", "success"):
        raise ValueError("Cannot resubmit! Current status is: %s" % status)

    if send:
        # If we intend actually to send the files, then change the status in
        # the database
        preprocessed_data.update_insdc_status("submitting")

    # we need to figure out whether the investigation type is a known one
    # or if we have to submit a "new_investigation_type" to EBI
    current_type = prep_template.investigation_type
    ena_ontology = Ontology(convert_to_id("ENA", "ontology"))
    if current_type in ena_ontology.terms:
        investigation_type = current_type
    elif current_type in ena_ontology.user_defined_terms:
        investigation_type = "Other"
        new_investigation_type = current_type
    else:
        # This should never happen
        raise ValueError(
            "Unrecognized investigation type: '%s'. This term "
            "is neither one of the official terms nor one of the "
            "user-defined terms in the ENA ontology"
        )

    if fastq_dir_fp is not None:
        # If the user specifies a FASTQ directory, use it

        # Set demux_samples to None so that MetadataTemplate.to_file will put
        # all samples in the template files
        demux_samples = None
    else:
        # If the user does not specify a FASTQ directory, create one and
        # re-serialize the per-sample FASTQs from the demux file
        fastq_dir_fp = mkdtemp(prefix=qiita_config.working_dir)
        demux = [path for _, path, ftype in preprocessed_data.get_filepaths() if ftype == "preprocessed_demux"][0]

        # Keep track of which files were actually in the demux file so that we
        # can write those rows to the prep and samples templates
        demux_samples = set()

        with open_file(demux) as demux_fh:
            for samp, iterator in to_per_sample_ascii(demux_fh, list(sample_template)):
                demux_samples.add(samp)
                sample_fp = join(fastq_dir_fp, "%s.fastq.gz" % samp)
                wrote_sequences = False
                with gzopen(sample_fp, "w") as fh:
                    for record in iterator:
                        fh.write(record)
                        wrote_sequences = True

                if not wrote_sequences:
                    remove(sample_fp)

    output_dir = fastq_dir_fp + "_submission"

    samp_fp = join(fastq_dir_fp, "sample_metadata.txt")
    prep_fp = join(fastq_dir_fp, "prep_metadata.txt")

    sample_template.to_file(samp_fp, demux_samples)
    prep_template.to_file(prep_fp, demux_samples)

    # Get specific output directory and set filepaths
    get_output_fp = partial(join, output_dir)
    study_fp = get_output_fp("study.xml")
    sample_fp = get_output_fp("sample.xml")
    experiment_fp = get_output_fp("experiment.xml")
    run_fp = get_output_fp("run.xml")
    submission_fp = get_output_fp("submission.xml")

    if not isdir(output_dir):
        makedirs(output_dir)
    else:
        raise IOError("The output folder already exists: %s" % output_dir)

    with open(samp_fp, "U") as st, open(prep_fp, "U") as pt:
        submission = EBISubmission.from_templates_and_per_sample_fastqs(
            preprocessed_data_id_str,
            study.title,
            study.info["study_abstract"],
            investigation_type,
            st,
            pt,
            fastq_dir_fp,
            new_investigation_type=new_investigation_type,
            pmids=study.pmids,
        )

    submission.write_all_xml_files(study_fp, sample_fp, experiment_fp, run_fp, submission_fp, action)

    if send:
        submission.send_sequences()
        study_accession, submission_accession = submission.send_xml()

        if study_accession is None or submission_accession is None:
            preprocessed_data.update_insdc_status("failed")

            raise ComputeError("EBI Submission failed!")
        else:
            preprocessed_data.update_insdc_status("success", study_accession, submission_accession)
    else:
        study_accession, submission_accession = None, None

    return study_accession, submission_accession
コード例 #6
0
def submit_EBI(preprocessed_data_id, action, send, fastq_dir_fp=None):
    """Submit a preprocessed data to EBI

    Parameters
    ----------
    preprocessed_data_id : int
        The preprocesssed data id
    action : %s
        The action to perform with this data
    send : bool
        True to actually send the files
    fastq_dir_fp : str, optional
        The fastq filepath
    """
    preprocessed_data = PreprocessedData(preprocessed_data_id)
    preprocessed_data_id_str = str(preprocessed_data_id)
    study = Study(preprocessed_data.study)
    sample_template = SampleTemplate(study.sample_template)
    prep_template = PrepTemplate(preprocessed_data.prep_template)

    investigation_type = None
    new_investigation_type = None

    status = preprocessed_data.submitted_to_insdc_status()
    if status in ('submitting', 'success'):
        raise ValueError("Cannot resubmit! Current status is: %s" % status)

    if send:
        # If we intend actually to send the files, then change the status in
        # the database
        preprocessed_data.update_insdc_status('submitting')

    # we need to figure out whether the investigation type is a known one
    # or if we have to submit a "new_investigation_type" to EBI
    current_type = prep_template.investigation_type
    ena_ontology = Ontology(convert_to_id('ENA', 'ontology'))
    if current_type in ena_ontology.terms:
        investigation_type = current_type
    elif current_type in ena_ontology.user_defined_terms:
        investigation_type = 'Other'
        new_investigation_type = current_type
    else:
        # This should never happen
        raise ValueError("Unrecognized investigation type: '%s'. This term "
                         "is neither one of the official terms nor one of the "
                         "user-defined terms in the ENA ontology")

    if fastq_dir_fp is not None:
        # If the user specifies a FASTQ directory, use it

        # Set demux_samples to None so that MetadataTemplate.to_file will put
        # all samples in the template files
        demux_samples = None
    else:
        # If the user does not specify a FASTQ directory, create one and
        # re-serialize the per-sample FASTQs from the demux file
        fastq_dir_fp = mkdtemp(prefix=qiita_config.working_dir)
        demux = [
            path for _, path, ftype in preprocessed_data.get_filepaths()
            if ftype == 'preprocessed_demux'
        ][0]

        # Keep track of which files were actually in the demux file so that we
        # can write those rows to the prep and samples templates
        demux_samples = set()

        with open_file(demux) as demux_fh:
            for samp, iterator in to_per_sample_ascii(demux_fh,
                                                      list(sample_template)):
                demux_samples.add(samp)
                sample_fp = join(fastq_dir_fp, "%s.fastq.gz" % samp)
                with gzopen(sample_fp, 'w') as fh:
                    for record in iterator:
                        fh.write(record)

    output_dir = fastq_dir_fp + '_submission'

    samp_fp = join(fastq_dir_fp, 'sample_metadata.txt')
    prep_fp = join(fastq_dir_fp, 'prep_metadata.txt')

    sample_template.to_file(samp_fp, demux_samples)
    prep_template.to_file(prep_fp, demux_samples)

    # Get specific output directory and set filepaths
    get_output_fp = partial(join, output_dir)
    study_fp = get_output_fp('study.xml')
    sample_fp = get_output_fp('sample.xml')
    experiment_fp = get_output_fp('experiment.xml')
    run_fp = get_output_fp('run.xml')
    submission_fp = get_output_fp('submission.xml')

    if not isdir(output_dir):
        makedirs(output_dir)
    else:
        raise IOError('The output folder already exists: %s' % output_dir)

    with open(samp_fp, 'U') as st, open(prep_fp, 'U') as pt:
        submission = EBISubmission.from_templates_and_per_sample_fastqs(
            preprocessed_data_id_str,
            study.title,
            study.info['study_abstract'],
            investigation_type,
            st,
            pt,
            fastq_dir_fp,
            new_investigation_type=new_investigation_type,
            pmids=study.pmids)

    submission.write_all_xml_files(study_fp, sample_fp, experiment_fp, run_fp,
                                   submission_fp, action)

    if send:
        submission.send_sequences()
        study_accession, submission_accession = submission.send_xml()

        if study_accession is None or submission_accession is None:
            preprocessed_data.update_insdc_status('failed')

            raise ComputeError("EBI Submission failed!")
        else:
            preprocessed_data.update_insdc_status('success', study_accession,
                                                  submission_accession)
    else:
        study_accession, submission_accession = None, None

    return study_accession, submission_accession
コード例 #7
0
    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Generates demultiplexed fastq

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, it forces the rewrite of the fastq files
        mtime : float, optional
            The time to use when creating the gz files. If None, the current
            time will be used by gzip.GzipFile. This is useful for testing.

        Returns
        -------
        demux_samples
            List of successful demultiplexed samples

        Notes
        -----
        - As a performace feature, this method will check if self.full_ebi_dir
        already exists and, if it does, the script will assume that in a
        previous execution this step was performed correctly and will simply
        read the file names from self.full_ebi_dir
        - When the object is created (init), samples, samples_prep and
        sample_demux_fps hold values for all available samples in the database.
        Here some of those values will be deleted (del's, within the loops) for
        those cases where the fastq.gz files weren't written or exist. This is
        an indication that they had no sequences and this kind of files are not
        accepted in EBI

        Raises
        ------
        EBISubmissionError
            - The demux file couldn't be read
            - All samples are removed
        """
        ar = self.artifact

        dir_not_exists = not isdir(self.full_ebi_dir)
        if dir_not_exists or rewrite_fastq:
            makedirs(self.full_ebi_dir)

            # An artifact will hold only one file of type `preprocessed_demux`
            # Thus, we only use the first one (the only one present)
            demux = [
                path for _, path, ftype in ar.filepaths
                if ftype == 'preprocessed_demux'
            ][0]

            demux_samples = set()
            with open_file(demux) as demux_fh:
                if not isinstance(demux_fh, File):
                    error_msg = "'%s' doesn't look like a demux file" % demux
                    LogEntry.create('Runtime', error_msg)
                    raise EBISubmissionError(error_msg)
                for s, i in to_per_sample_ascii(demux_fh,
                                                self.prep_template.keys()):
                    sample_fp = self.sample_demux_fps[s]
                    wrote_sequences = False
                    with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                        for record in i:
                            fh.write(record)
                            wrote_sequences = True

                    if wrote_sequences:
                        demux_samples.add(s)
                    else:
                        del (self.samples[s])
                        del (self.samples_prep[s])
                        del (self.sample_demux_fps[s])
                        remove(sample_fp)
        else:
            demux_samples = set()
            extension = '.fastq.gz'
            extension_len = len(extension)
            for f in listdir(self.full_ebi_dir):
                fpath = join(self.full_ebi_dir, f)
                if isfile(fpath) and f.endswith(extension):
                    demux_samples.add(f[:-extension_len])

            missing_samples = set(self.samples.keys()).difference(
                set(demux_samples))
            for ms in missing_samples:
                del (self.samples[ms])
                del (self.samples_prep[ms])
                del (self.sample_demux_fps[ms])

        if not demux_samples:
            error_msg = ("All samples were removed from the submission "
                         "because the demux file is empty or the sample names "
                         "do not match.")
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)
        return demux_samples
コード例 #8
0
ファイル: ebi.py プロジェクト: elisemorrison/qiita
    def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
        """Generates demultiplexed fastq

        Parameters
        ----------
        rewrite_fastq : bool, optional
            If true, it forces the rewrite of the fastq files
        mtime : float, optional
            The time to use when creating the gz files. If None, the current
            time will be used by gzip.GzipFile. This is useful for testing.

        Returns
        -------
        demux_samples
            List of successful demultiplexed samples

        Notes
        -----
        - As a performace feature, this method will check if self.full_ebi_dir
        already exists and, if it does, the script will assume that in a
        previous execution this step was performed correctly and will simply
        read the file names from self.full_ebi_dir
        - When the object is created (init), samples, samples_prep and
        sample_demux_fps hold values for all available samples in the database.
        Here some of those values will be deleted (del's, within the loops) for
        those cases where the fastq.gz files weren't written or exist. This is
        an indication that they had no sequences and this kind of files are not
        accepted in EBI

        Raises
        ------
        EBISubmissionError
            - The demux file couldn't be read
            - All samples are removed
        """
        ppd = self.preprocessed_data

        dir_not_exists = not isdir(self.full_ebi_dir)
        if dir_not_exists or rewrite_fastq:
            makedirs(self.full_ebi_dir)

            demux = [path for _, path, ftype in ppd.get_filepaths()
                     if ftype == 'preprocessed_demux'][0]

            demux_samples = set()
            with open_file(demux) as demux_fh:
                if not isinstance(demux_fh, File):
                    error_msg = "'%s' doesn't look like a demux file" % demux
                    LogEntry.create('Runtime', error_msg)
                    raise EBISubmissionError(error_msg)
                for s, i in to_per_sample_ascii(demux_fh,
                                                self.prep_template.keys()):
                    sample_fp = self.sample_demux_fps[s]
                    wrote_sequences = False
                    with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                        for record in i:
                            fh.write(record)
                            wrote_sequences = True

                    if wrote_sequences:
                        demux_samples.add(s)
                    else:
                        del(self.samples[s])
                        del(self.samples_prep[s])
                        del(self.sample_demux_fps[s])
                        remove(sample_fp)
        else:
            demux_samples = set()
            extension = '.fastq.gz'
            extension_len = len(extension)
            for f in listdir(self.full_ebi_dir):
                fpath = join(self.full_ebi_dir, f)
                if isfile(fpath) and f.endswith(extension):
                    demux_samples.add(f[:-extension_len])

            missing_samples = set(self.samples.keys()).difference(
                set(demux_samples))
            for ms in missing_samples:
                del(self.samples[ms])
                del(self.samples_prep[ms])
                del(self.sample_demux_fps[ms])

        if not demux_samples:
            error_msg = ("All samples were removed from the submission "
                         "because the demux file is empty or the sample names "
                         "do not match.")
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)
        return demux_samples