Ejemplo n.º 1
0
Archivo: util.py Proyecto: yimsea/qiita
def generate_demux_file(sl_out):
    """Creates the HDF5 demultiplexed file

    Parameters
    ----------
    sl_out : str
        Path to the output directory of split libraries

    Returns
    -------
    str
        The path of the demux file

    Raises
    ------
    ValueError
        If the split libraries output does not contain the demultiplexed fastq
        file
    """
    # join() already returns a str; the redundant str() wrapper was dropped
    fastq_fp = join(sl_out, 'seqs.fastq')
    if not exists(fastq_fp):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file.")

    demux_fp = join(sl_out, 'seqs.demux')
    # File is an h5py.File; to_hdf5 populates it from the fastq records
    with File(demux_fp, "w") as f:
        to_hdf5(fastq_fp, f)
    return demux_fp
Ejemplo n.º 2
0
    def write_demux_files(self, prep_template, generate_hdf5=True):
        """Writes a demux test file to avoid duplication of code"""
        fasta_path = join(self.temp_dir, 'seqs.fna')
        demux_path = join(self.temp_dir, 'demux.seqs')
        if not generate_hdf5:
            # an empty placeholder is enough when HDF5 content is not needed
            with open(demux_path, 'w') as out:
                out.write('')
        else:
            with open(fasta_path, 'w') as out:
                out.write(FASTA_EXAMPLE)
            with File(demux_path, "w") as h5:
                to_hdf5(fasta_path, h5)

        if prep_template.artifact is not None:
            # derive the artifact from the existing one, recording parameters
            params = Parameters.from_default_params(
                DefaultParameters(1),
                {'input_data': prep_template.artifact.id})
            ppd = Artifact.create(
                [(demux_path, 6)], "Demultiplexed",
                parents=[prep_template.artifact], processing_parameters=params,
                can_be_submitted_to_ebi=True, can_be_submitted_to_vamps=True)
        else:
            # no parent artifact yet: attach directly to the prep template
            ppd = Artifact.create(
                [(demux_path, 6)], "Demultiplexed",
                prep_template=prep_template,
                can_be_submitted_to_ebi=True, can_be_submitted_to_vamps=True)
        return ppd
Ejemplo n.º 3
0
def generate_demux_file(sl_out, **kwargs):
    """Write the HDF5 demux file next to a split-libraries output.

    Parameters
    ----------
    sl_out : str
        The split-libraries output directory.
    kwargs: ignored
        Necessary to include to support execution via moi.

    Returns
    -------
    str
        Filepath of the demux file that was written.

    Raises
    ------
    ValueError
        When the demultiplexed ``seqs.fastq`` file is absent from ``sl_out``.
    """
    # deferred imports so the function body is self-contained for remote
    # execution via moi
    from os.path import join, exists
    from h5py import File
    from qiita_ware.demux import to_hdf5

    source_fp = join(sl_out, 'seqs.fastq')
    if not exists(source_fp):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file")

    target_fp = join(sl_out, 'seqs.demux')
    with File(target_fp, "w") as h5:
        to_hdf5(source_fp, h5)

    return target_fp
Ejemplo n.º 4
0
    def write_demux_files(self, prep_template, generate_hdf5=True):
        """Writes a demux test file to avoid duplication of code"""
        seqs_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        if generate_hdf5:
            with open(seqs_fp, 'w') as out:
                out.write(FASTA_EXAMPLE)
            with File(demux_fp, "w") as h5:
                to_hdf5(seqs_fp, h5)
        else:
            with open(demux_fp, 'w') as out:
                out.write('')

        # both branches share the submission flags; only the lineage-related
        # keyword arguments differ
        create_kwargs = {'can_be_submitted_to_ebi': True,
                         'can_be_submitted_to_vamps': True}
        if prep_template.artifact is None:
            create_kwargs['prep_template'] = prep_template
        else:
            create_kwargs['parents'] = [prep_template.artifact]
            create_kwargs['processing_parameters'] = \
                Parameters.from_default_params(
                    DefaultParameters(1),
                    {'input_data': prep_template.artifact.id})
        return Artifact.create([(demux_fp, 6)], "Demultiplexed",
                               **create_kwargs)
Ejemplo n.º 5
0
def generate_demux_file(sl_out):
    """Build the HDF5 demux file for a split-libraries output directory.

    Parameters
    ----------
    sl_out : str
        The split-libraries output directory.

    Returns
    -------
    str
        Filepath of the newly written demux file.

    Raises
    ------
    ValueError
        When ``sl_out`` is missing the demultiplexed ``seqs.fastq`` file.
    """
    fastq_path = str(join(sl_out, 'seqs.fastq'))
    if not exists(fastq_path):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file.")

    demux_path = join(sl_out, 'seqs.demux')
    with File(demux_path, "w") as h5file:
        to_hdf5(fastq_path, h5file)

    return demux_path
Ejemplo n.º 6
0
def generate_demux_file(sl_out, **kwargs):
    """Create the demultiplexed HDF5 file from split-libraries results.

    Parameters
    ----------
    sl_out : str
        Path to the output directory of split libraries.
    kwargs: ignored
        Necessary to include to support execution via moi.

    Returns
    -------
    str
        The demux filepath.

    Raises
    ------
    ValueError
        If ``seqs.fastq`` cannot be found inside ``sl_out``.
    """
    # local imports keep the function shippable as a standalone moi job
    from os.path import join, exists
    from h5py import File
    from qiita_ware.demux import to_hdf5

    seqs_fastq = join(sl_out, 'seqs.fastq')
    if not exists(seqs_fastq):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file")

    seqs_demux = join(sl_out, 'seqs.demux')
    with File(seqs_demux, "w") as out_h5:
        to_hdf5(seqs_fastq, out_h5)

    return seqs_demux
Ejemplo n.º 7
0
    def test_to_hdf5(self):
        """to_hdf5 splits a fasta file into per-sample HDF5 datasets."""
        # delete=False keeps the file on disk; it is flushed and closed
        # explicitly so to_hdf5 can re-open it by name.  Cleanup is deferred
        # to self.to_remove.
        with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                         delete=False) as f:
            f.write(seqdata)
            f.flush()
            f.close()

            to_hdf5(f.name, self.hdf5_file)
            self.to_remove.append(f.name)

        # sample 'a': sequences, zero-filled quals (no quality scores in the
        # .fna input) and original/corrected barcodes with error counts
        npt.assert_equal(self.hdf5_file['a/sequence'][:],
                         np.array(["x", "xy", "xyz"]))
        npt.assert_equal(self.hdf5_file['a/qual'][:],
                         np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]]))
        npt.assert_equal(self.hdf5_file['a/barcode/original'][:],
                         np.array(["abc", "aby", "abz"]))
        npt.assert_equal(self.hdf5_file['a/barcode/corrected'][:],
                         np.array(["abc", "ybc", "zbc"]))
        npt.assert_equal(self.hdf5_file['a/barcode/error'][:],
                         np.array([0, 2, 3]))

        # sample 'b': same dataset layout, two records
        npt.assert_equal(self.hdf5_file['b/sequence'][:],
                         np.array(["xyz", "abcd"]))
        npt.assert_equal(self.hdf5_file['b/qual'][:],
                         np.array([[0, 0, 0, 0], [0, 0, 0, 0]]))
        npt.assert_equal(self.hdf5_file['b/barcode/original'][:],
                         np.array(["abx", "abw"]))
        npt.assert_equal(self.hdf5_file['b/barcode/corrected'][:],
                         np.array(["xbc", "wbc"]))
        npt.assert_equal(self.hdf5_file['b/barcode/error'][:], np.array([1,
                                                                         4]))
Ejemplo n.º 8
0
    def test_to_hdf5(self):
        """to_hdf5 splits a fasta file into per-sample HDF5 datasets."""
        # delete=False keeps the file after the `with` block closes it, so
        # to_hdf5 can re-open it by name afterwards
        with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                         delete=False) as f:
            f.write(seqdata)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        # sample 'a': sequences, zero-filled quals (no quality scores in the
        # .fna input) and original/corrected barcodes with error counts
        npt.assert_equal(self.hdf5_file['a/sequence'][:], np.array(["x", "xy",
                                                                    "xyz"]))
        npt.assert_equal(self.hdf5_file['a/qual'][:],
                         np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]]))
        npt.assert_equal(self.hdf5_file['a/barcode/original'][:],
                         np.array(["abc", "aby", "abz"]))
        npt.assert_equal(self.hdf5_file['a/barcode/corrected'][:],
                         np.array(["abc", "ybc", "zbc"]))
        npt.assert_equal(self.hdf5_file['a/barcode/error'][:],
                         np.array([0, 2, 3]))

        # sample 'b': same dataset layout, two records
        npt.assert_equal(self.hdf5_file['b/sequence'][:],
                         np.array(["xyz", "abcd"]))
        npt.assert_equal(self.hdf5_file['b/qual'][:],
                         np.array([[0, 0, 0, 0], [0, 0, 0, 0]]))
        npt.assert_equal(self.hdf5_file['b/barcode/original'][:],
                         np.array(["abx", "abw"]))
        npt.assert_equal(self.hdf5_file['b/barcode/corrected'][:],
                         np.array(["xbc", "wbc"]))
        npt.assert_equal(self.hdf5_file['b/barcode/error'][:],
                         np.array([1, 4]))
Ejemplo n.º 9
0
 def test_get(self):
     """GET /ebi_submission/<id> succeeds once a valid demux file exists."""
     # locate the demux filepath registered for artifact 2
     demux_fp = [fp for _, fp, fp_type in Artifact(2).filepaths
                 if fp_type == 'preprocessed_demux'][0]
     fd, fna_fp = mkstemp(suffix='_seqs.fna')
     close(fd)
     self._clean_up_files.extend([fna_fp, demux_fp])
     # regenerate the demux HDF5 content from a minimal one-record fasta
     with open(fna_fp, 'w') as f:
         f.write('>a_1 X orig_bc=X new_bc=X bc_diffs=0\nCCC')
     with File(demux_fp, "w") as f:
         to_hdf5(fna_fp, f)
     # authenticate the request by stubbing the current user
     BaseHandler.get_current_user = Mock(return_value=User("*****@*****.**"))
     response = self.get("/ebi_submission/2")
     self.assertEqual(response.code, 200)
Ejemplo n.º 10
0
    def test_to_ascii(self):
        """to_ascii rebuilds fastq records for the requested samples."""
        # delete=False keeps the file after the `with` block closes it, so
        # to_hdf5 can re-open it by name below
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        exp = [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
               b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
               b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n"]

        # materialize the iterable before comparing against the fixed list
        obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(obs, exp)
Ejemplo n.º 11
0
    def test_to_ascii(self):
        """to_ascii rebuilds fastq records for the requested samples."""
        # delete=False keeps the file on disk; it is flushed and closed
        # explicitly so to_hdf5 can re-open it by name.  Cleanup happens via
        # self.to_remove.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata)
            f.flush()
            f.close()
            to_hdf5(f.name, self.hdf5_file)
            self.to_remove.append(f.name)

        exp = [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n"),
               (b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n"),
               (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n")]

        # materialize the iterable before comparing against the fixed list
        obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(obs, exp)
Ejemplo n.º 12
0
    def _generate_files(self):
        """Create a fastq fixture plus its demux companion for the tests.

        Returns
        -------
        tuple of (str, str, str)
            The demux filepath, the fastq filepath and a fresh output dir.
        """
        fd, fastq_fp = mkstemp(suffix=".fastq")
        close(fd)
        with open(fastq_fp, 'w') as f:
            f.write(FASTQ_SEQS)

        demux_fp = "%s.demux" % fastq_fp
        # open in write mode explicitly: h5py >= 3 defaults to read-only,
        # which would fail on this freshly created path (sibling code in this
        # project also passes "w")
        with File(demux_fp, "w") as f:
            to_hdf5(fastq_fp, f)

        out_dir = mkdtemp()

        self._clean_up_files.extend([fastq_fp, demux_fp, out_dir])

        return demux_fp, fastq_fp, out_dir
Ejemplo n.º 13
0
    def _generate_files(self):
        """Create a fastq fixture plus its demux companion for the tests.

        Returns
        -------
        tuple of (str, str, str)
            The demux filepath, the fastq filepath and a fresh output dir.
        """
        fd, fastq_fp = mkstemp(suffix=".fastq")
        close(fd)
        with open(fastq_fp, 'w') as f:
            f.write(FASTQ_SEQS)

        demux_fp = "%s.demux" % fastq_fp
        # open in write mode explicitly: h5py >= 3 defaults to read-only,
        # which would fail on this freshly created path (sibling code in this
        # project also passes "w")
        with File(demux_fp, "w") as f:
            to_hdf5(fastq_fp, f)

        out_dir = mkdtemp()

        self._clean_up_files.extend([fastq_fp, demux_fp, out_dir])

        return demux_fp, fastq_fp, out_dir
Ejemplo n.º 14
0
 def test_get(self):
     """GET /ebi_submission/<id> succeeds once a valid demux file exists."""
     # locate the demux filepath registered for artifact 2
     demux_fp = [
         fp for _, fp, fp_type in Artifact(2).filepaths
         if fp_type == 'preprocessed_demux'
     ][0]
     fd, fna_fp = mkstemp(suffix='_seqs.fna')
     close(fd)
     self._clean_up_files.extend([fna_fp, demux_fp])
     # regenerate the demux HDF5 content from a minimal one-record fasta
     with open(fna_fp, 'w') as f:
         f.write('>a_1 X orig_bc=X new_bc=X bc_diffs=0\nCCC')
     with File(demux_fp, "w") as f:
         to_hdf5(fna_fp, f)
     # authenticate the request by stubbing the current user
     BaseHandler.get_current_user = Mock(return_value=User("*****@*****.**"))
     response = self.get("/ebi_submission/2")
     self.assertEqual(response.code, 200)
Ejemplo n.º 15
0
    def test_to_ascii_fasta(self):
        """to_ascii emits fasta records when the source had no quals."""
        # delete=False keeps the file after the `with` block closes it, so
        # to_hdf5 can re-open it by name below
        with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                         delete=False) as f:
            f.write(seqdata)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        exp = [b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
               b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
               b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
               b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
               b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n"]

        # materialize the iterable before comparing against the fixed list
        obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(obs, exp)
Ejemplo n.º 16
0
    def test_to_per_sample_ascii(self):
        """to_per_sample_ascii yields (sample, records) pairs per sample."""
        # delete=False keeps the file after the `with` block closes it, so
        # to_hdf5 can re-open it by name below
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        # every continuation literal carries the b prefix: implicitly
        # concatenating a bytes literal with a str literal is a SyntaxError
        # on Python 3
        exp = [('a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                       b"ABC\n")]),
               ('b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       b"DFG\n"),
                      (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       b"DEF\n")])]

        obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(obs, exp)
Ejemplo n.º 17
0
    def write_demux_files(self, prep_template, generate_hdf5=True):
        """Writes a demux test file to avoid duplication of code"""
        fasta_path = join(self.temp_dir, 'seqs.fna')
        demux_path = join(self.temp_dir, 'demux.seqs')
        if not generate_hdf5:
            # an empty placeholder is enough when HDF5 content is not needed
            with open(demux_path, 'w') as out:
                out.write('')
        else:
            with open(fasta_path, 'w') as out:
                out.write(FASTA_EXAMPLE)
            with File(demux_path, "w") as h5:
                to_hdf5(fasta_path, h5)

        return PreprocessedData.create(
            Study(1), "preprocessed_sequence_illumina_params", 1,
            [(demux_path, 6)], prep_template)
Ejemplo n.º 18
0
    def test_to_per_sample_ascii(self):
        """to_per_sample_ascii yields (sample, records) pairs per sample."""
        # delete=False keeps the file after the `with` block closes it, so
        # to_hdf5 can re-open it by name below
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        # every continuation literal carries the b prefix: implicitly
        # concatenating a bytes literal with a str literal is a SyntaxError
        # on Python 3
        exp = [('a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                       b"ABC\n")]),
               ('b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       b"DFG\n"),
                      (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       b"DEF\n")])]

        obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(obs, exp)
Ejemplo n.º 19
0
    def test_to_ascii_fasta(self):
        """to_ascii emits fasta records when the source had no quals."""
        # delete=False keeps the file after the `with` block closes it, so
        # to_hdf5 can re-open it by name below
        with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                         delete=False) as f:
            f.write(seqdata)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        exp = [
            b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
            b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
            b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
            b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
            b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n"
        ]

        # materialize the iterable before comparing against the fixed list
        obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(obs, exp)
Ejemplo n.º 20
0
    def test_fetch_qual_length_bug(self):
        # fetch was not trimming qual to the length of the sequence resulting
        # in qual scores for positions beyond the length of the sequence.
        # delete=False keeps the file after the `with` block closes it, so
        # to_hdf5 can re-open it by name below
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata_variable_length)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        # every continuation literal carries the b prefix: implicitly
        # concatenating a bytes literal with a str literal is a SyntaxError
        # on Python 3
        exp = [('a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                       b"ABC\n")]),
               ('b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       b"DFG\n"),
                      (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\n"
                       b"DEF#G\n")])]

        obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(obs, exp)
Ejemplo n.º 21
0
    def test_fetch_qual_length_bug(self):
        # fetch was not trimming qual to the length of the sequence resulting
        # in qual scores for positions beyond the length of the sequence.
        # delete=False keeps the file after the `with` block closes it, so
        # to_hdf5 can re-open it by name below
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata_variable_length)

        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)

        # every continuation literal carries the b prefix: implicitly
        # concatenating a bytes literal with a str literal is a SyntaxError
        # on Python 3
        exp = [('a', [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                       b"ABC\n")]),
               ('b', [(b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                       b"DFG\n"),
                      (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\n"
                       b"DEF#G\n")])]

        obs = [(s[0], list(s[1])) for s in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(obs, exp)
Ejemplo n.º 22
0
    def test_to_ascii(self):
        """to_ascii output for NUL-padded fixed-width quality storage."""
        # delete=False keeps the file on disk; it is flushed and closed
        # explicitly so to_hdf5 can re-open it by name.  Cleanup happens via
        # self.to_remove.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata)
            f.flush()
            f.close()
            to_hdf5(f.name, self.hdf5_file)
            self.to_remove.append(f.name)

        # every continuation literal carries the b prefix: implicitly
        # concatenating a bytes literal with a str literal is a SyntaxError
        # on Python 3
        exp = [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
                b"A\x00\x00\x00\x00\x00\x00\x00"
                b"B\x00\x00\x00\x00\x00\x00\x00"
                b"C\x00\x00\x00\x00\x00\x00\x00\n"),
               (b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                b"D\x00\x00\x00\x00\x00\x00\x00"
                b"F\x00\x00\x00\x00\x00\x00\x00"
                b"G\x00\x00\x00\x00\x00\x00\x00\n"),
               (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
                b"D\x00\x00\x00\x00\x00\x00\x00"
                b"E\x00\x00\x00\x00\x00\x00\x00"
                b"F\x00\x00\x00\x00\x00\x00\x00\n")]

        obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
        self.assertEqual(obs, exp)
Ejemplo n.º 23
0
    def generate_new_study_with_preprocessed_data(self):
        """Creates a new study up to the processed data for testing

        Returns
        -------
        The "Demultiplexed" artifact produced by ``Artifact.create``.
        """
        # ignoring warnings generated when adding templates
        simplefilter("ignore")
        # minimal study metadata required by Study.create
        info = {
            "timeseries_type_id": 1,
            "metadata_complete": True,
            "mixs_compliant": True,
            "number_samples_collected": 3,
            "number_samples_promised": 3,
            "study_alias": "Test EBI",
            "study_description": "Study for testing EBI",
            "study_abstract": "Study for testing EBI",
            "emp_person_id": StudyPerson(2),
            "principal_investigator_id": StudyPerson(3),
            "lab_person_id": StudyPerson(1)
        }
        study = Study.create(User('*****@*****.**'), "Test EBI study", [1], info)
        # per-sample metadata for the sample template (three samples)
        metadata_dict = {
            'Sample1': {'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'h**o sapiens',
                        'Description': 'Test Sample 1'},
            'Sample2': {'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'h**o sapiens',
                        'Description': 'Test Sample 2'},
            'Sample3': {'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'h**o sapiens',
                        'Description': 'Test Sample 3'}
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                          dtype=str)
        SampleTemplate.create(metadata, study)
        # prep template metadata: shared primer, unique barcode per sample
        metadata_dict = {
            'Sample1': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTC',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 1"},
            'Sample2': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTA',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 2"},
            'Sample3': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTT',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 3"},
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                          dtype=str)
        pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
        # write a fasta fixture and build its demux HDF5 companion
        fna_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        with open(fna_fp, 'w') as f:
            f.write(FASTA_EXAMPLE_2.format(study.id))
        with File(demux_fp, 'w') as f:
            to_hdf5(fna_fp, f)

        ppd = Artifact.create(
            [(demux_fp, 6)], "Demultiplexed", prep_template=pt)

        return ppd
Ejemplo n.º 24
0
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    dict
        The results of the job
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")

    # reject any filepath type we do not know how to handle
    supported_fp_types = {'preprocessed_fasta', 'preprocessed_fastq',
                          'preprocessed_demux', 'log'}
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        return format_payload(
            success=False,
            error_msg="Filepath type(s) %s not supported by artifact type "
                      "Demultiplexed. Supported filepath types: %s"
                      % (', '.join(unsupported_fp_types),
                         ', '.join(sorted(supported_fp_types)))
        )

    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = ["%s (%d): %s"
                  % (fp_t, len(files[fp_t]), ', '.join(files[fp_t]))
                  for fp_t in sorted(offending)]
        return format_payload(
            success=False,
            error_msg="Only one filepath of each file type is supported, "
                      "offending types:\n%s"
                      % "; ".join(errors))

    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    # precedence: demux > fastq > fasta; when only fastq/fasta is present the
    # demux file is generated from it before validation
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        payload = _validate_demux_file(qclient, job_id, prep_info, out_dir,
                                       demux, log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with File(demux, "w") as f:
            to_hdf5(fastq, f)
        # Validate the demux, providing the original fastq
        payload = _validate_demux_file(qclient, job_id, prep_info, out_dir,
                                       demux, fastq_fp=fastq, log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with File(demux, "w") as f:
            to_hdf5(fasta, f)
        # Validate the demux, providing the original fasta
        payload = _validate_demux_file(qclient, job_id, prep_info, out_dir,
                                       demux, fasta_fp=fasta, log_fp=log)
    else:
        payload = format_payload(
            success=False,
            error_msg="Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                      "'preprocessed_fasta' file should be provided.")

    return payload
Ejemplo n.º 25
0
    def generate_new_study_with_preprocessed_data(self):
        """Creates a new study up to the processed data for testing

        Returns
        -------
        The "Demultiplexed" artifact produced by ``Artifact.create``.
        """
        # ignoring warnings generated when adding templates
        simplefilter("ignore")
        # minimal study metadata required by Study.create
        info = {
            "timeseries_type_id": 1,
            "metadata_complete": True,
            "mixs_compliant": True,
            "number_samples_collected": 3,
            "number_samples_promised": 3,
            "study_alias": "Test EBI",
            "study_description": "Study for testing EBI",
            "study_abstract": "Study for testing EBI",
            "emp_person_id": StudyPerson(2),
            "principal_investigator_id": StudyPerson(3),
            "lab_person_id": StudyPerson(1)
        }
        study = Study.create(User('*****@*****.**'), "Test EBI study", [1], info)
        # per-sample metadata for the sample template (three samples)
        metadata_dict = {
            'Sample1': {
                'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
                'physical_specimen_location': 'location1',
                'taxon_id': 9606,
                'scientific_name': 'h**o sapiens',
                'Description': 'Test Sample 1'
            },
            'Sample2': {
                'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
                'physical_specimen_location': 'location1',
                'taxon_id': 9606,
                'scientific_name': 'h**o sapiens',
                'Description': 'Test Sample 2'
            },
            'Sample3': {
                'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
                'physical_specimen_location': 'location1',
                'taxon_id': 9606,
                'scientific_name': 'h**o sapiens',
                'Description': 'Test Sample 3'
            }
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
        SampleTemplate.create(metadata, study)
        # prep template metadata: shared primer, unique barcode per sample
        metadata_dict = {
            'Sample1': {
                'primer': 'GTGCCAGCMGCCGCGGTAA',
                'barcode': 'CGTAGAGCTCTC',
                'center_name': 'KnightLab',
                'platform': 'ILLUMINA',
                'instrument_model': 'Illumina MiSeq',
                'library_construction_protocol': 'Protocol ABC',
                'experiment_design_description': "Random value 1"
            },
            'Sample2': {
                'primer': 'GTGCCAGCMGCCGCGGTAA',
                'barcode': 'CGTAGAGCTCTA',
                'center_name': 'KnightLab',
                'platform': 'ILLUMINA',
                'instrument_model': 'Illumina MiSeq',
                'library_construction_protocol': 'Protocol ABC',
                'experiment_design_description': "Random value 2"
            },
            'Sample3': {
                'primer': 'GTGCCAGCMGCCGCGGTAA',
                'barcode': 'CGTAGAGCTCTT',
                'center_name': 'KnightLab',
                'platform': 'ILLUMINA',
                'instrument_model': 'Illumina MiSeq',
                'library_construction_protocol': 'Protocol ABC',
                'experiment_design_description': "Random value 3"
            },
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index')
        pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
        # write a fasta fixture and build its demux HDF5 companion
        fna_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        with open(fna_fp, 'w') as f:
            f.write(FASTA_EXAMPLE_2.format(study.id))
        with File(demux_fp, 'w') as f:
            to_hdf5(fna_fp, f)

        ppd = Artifact.create([(demux_fp, 6)],
                              "Demultiplexed",
                              prep_template=pt,
                              can_be_submitted_to_ebi=True,
                              can_be_submitted_to_vamps=True)

        return ppd
Ejemplo n.º 26
0
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    dict
        The results of the job

    Notes
    -----
    This variant returns a ``(success, a_info, error_msg)`` tuple on every
    code path rather than a payload dict.
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")

    # reject any filepath type we do not know how to handle
    supported_fp_types = {'preprocessed_fasta', 'preprocessed_fastq',
                          'preprocessed_demux', 'log'}
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact type "
                     "Demultiplexed. Supported filepath types: %s"
                     % (', '.join(unsupported_fp_types),
                        ', '.join(sorted(supported_fp_types))))
        return False, None, error_msg

    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = ["%s (%d): %s"
                  % (fp_t, len(files[fp_t]), ', '.join(files[fp_t]))
                  for fp_t in sorted(offending)]
        error_msg = ("Only one filepath of each file type is supported, "
                     "offending types:\n%s" % "; ".join(errors))
        return False, None, error_msg

    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    # precedence: demux > fastq > fasta; when only fastq/fasta is present the
    # demux file is generated from it before validation
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with File(demux, "w") as f:
            to_hdf5(fastq, f)
        # Validate the demux, providing the original fastq
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, fastq_fp=fastq,
            log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with File(demux, "w") as f:
            to_hdf5(fasta, f)
        # Validate the demux, providing the original fasta
        success, a_info, error_msg = _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, fasta_fp=fasta,
            log_fp=log)
    else:
        error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                     "'preprocessed_fasta' file should be provided.")
        return False, None, error_msg

    return success, a_info, error_msg