Example #1
0
def generate_demux_file(sl_out):
    """Creates the HDF5 demultiplexed file

    Parameters
    ----------
    sl_out : str
        Path to the output directory of split libraries

    Returns
    -------
    str
        The path of the demux file

    Raises
    ------
    ValueError
        If the split libraries output does not contain the demultiplexed fastq
        file, or if that file is empty
    """
    # join() already returns a str, so the previous str() wrapper was redundant
    fastq_fp = join(sl_out, 'seqs.fastq')
    if not exists(fastq_fp):
        raise ValueError("The split libraries output directory does not "
                         "contain the demultiplexed fastq file.")
    elif stat(fastq_fp).st_size == 0:
        # File exists but is empty: nothing was demultiplexed
        raise ValueError("No sequences were demuxed. Check your parameters.")

    demux_fp = join(sl_out, 'seqs.demux')
    # "w" creates/truncates the HDF5 demux file next to the fastq
    with File(demux_fp, "w") as f:
        to_hdf5(fastq_fp, f)
    return demux_fp
Example #2
0
    def test_to_hdf5(self):
        """Round-trips seqdata through to_hdf5 and checks every dataset."""
        with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                         delete=False) as handle:
            handle.write(seqdata)

        self.to_remove.append(handle.name)
        to_hdf5(handle.name, self.hdf5_file)

        # Expected contents keyed by HDF5 dataset path
        expected = {
            'a/sequence': np.array([b"x", b"xy", b"xyz"]),
            'a/qual': np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]]),
            'a/barcode/original': np.array([b"abc", b"aby", b"abz"]),
            'a/barcode/corrected': np.array([b"abc", b"ybc", b"zbc"]),
            'a/barcode/error': np.array([0, 2, 3]),
            'b/sequence': np.array([b"xyz", b"abcd"]),
            'b/qual': np.array([[0, 0, 0, 0], [0, 0, 0, 0]]),
            'b/barcode/original': np.array([b"abx", b"abw"]),
            'b/barcode/corrected': np.array([b"xbc", b"wbc"]),
            'b/barcode/error': np.array([1, 4]),
        }
        for dset_path, exp in expected.items():
            npt.assert_equal(self.hdf5_file[dset_path][:], exp)
Example #3
0
    def write_demux_files(self, prep_template, generate_hdf5=True):
        """Writes a demux test file to avoid duplication of code"""
        fna_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        if generate_hdf5:
            # Build a real HDF5 demux file from the example FASTA
            with open(fna_fp, 'w') as fasta_handle:
                fasta_handle.write(FASTA_EXAMPLE)
            with File(demux_fp, "w") as demux_handle:
                to_hdf5(fna_fp, demux_handle)
        else:
            # An empty placeholder file stands in for the demux
            with open(demux_fp, 'w') as demux_handle:
                demux_handle.write('')

        if prep_template.artifact is None:
            # No parent artifact: attach the new artifact to the prep template
            ppd = Artifact.create(
                [(demux_fp, 6)], "Demultiplexed",
                prep_template=prep_template)
        else:
            # Derive the new artifact from the prep template's existing one
            params = Parameters.from_default_params(
                DefaultParameters(1),
                {'input_data': prep_template.artifact.id})
            ppd = Artifact.create(
                [(demux_fp, 6)], "Demultiplexed",
                parents=[prep_template.artifact],
                processing_parameters=params)
        return ppd
Example #4
0
    def test_to_per_sample_files(self):
        """Exercises to_per_sample_files in fastq and fasta output modes."""
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata_variable_length)

        self.to_remove.append(f.name)

        # Create an empty placeholder path; it is rewritten as HDF5 below
        with tempfile.NamedTemporaryFile('r+', suffix='.demux',
                                         delete=False) as demux_f:
            pass

        self.to_remove.append(demux_f.name)

        # 'w' truncates the empty placeholder and creates a fresh HDF5 file
        with h5py.File(demux_f.name, 'w') as demux:
            to_hdf5(f.name, demux)

        tmp_dir = tempfile.mkdtemp()
        self.to_remove.append(tmp_dir)
        path_builder = partial(os.path.join, tmp_dir)

        # Test to fastq
        to_per_sample_files(demux_f.name, out_dir=tmp_dir, n_jobs=1,
                            out_format='fastq')
        sample_a_path = path_builder("a.fastq")
        sample_b_path = path_builder("b.fastq")
        self.assertTrue(os.path.exists(sample_a_path))
        self.assertTrue(os.path.exists(sample_b_path))

        with open(sample_a_path, 'rb') as af:
            obs = af.read()
        self.assertEqual(
            obs, b'@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n')

        with open(sample_b_path, 'rb') as bf:
            obs = bf.read()
        self.assertEqual(
            obs, b'@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n'
                 b'@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n')

        # Test to fasta and parallel
        to_per_sample_files(demux_f.name, out_dir=tmp_dir, n_jobs=2,
                            out_format='fasta')

        sample_a_path = path_builder("a.fna")
        sample_b_path = path_builder("b.fna")
        self.assertTrue(os.path.exists(sample_a_path))
        self.assertTrue(os.path.exists(sample_b_path))

        with open(sample_a_path, 'rb') as af:
            obs = af.read()
        self.assertEqual(
            obs, b'>a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n')

        with open(sample_b_path, 'rb') as bf:
            obs = bf.read()
        self.assertEqual(
            obs, b'>b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n'
                 b'>b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n')
Example #5
0
    def test_to_ascii_file(self):
        """Checks to_ascii_file in fastq, fasta and sample-filtered modes."""
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata_variable_length)

        self.to_remove.append(f.name)

        # Create an empty placeholder path; it is rewritten as HDF5 below
        with tempfile.NamedTemporaryFile('r+', suffix='.demux',
                                         delete=False) as demux_f:
            pass

        self.to_remove.append(demux_f.name)

        # BUG FIX: was 'r+', which requires an existing *valid* HDF5 file and
        # fails on the empty temp file; 'w' creates a fresh HDF5 file (as the
        # sibling test_to_per_sample_files already does)
        with h5py.File(demux_f.name, 'w') as demux:
            to_hdf5(f.name, demux)

        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as obs_fq:
            pass
        self.to_remove.append(obs_fq.name)

        # Default output format is fastq
        to_ascii_file(demux_f.name, obs_fq.name)
        with open(obs_fq.name, 'rb') as obs_f:
            obs = obs_f.read()
        exp = (b'@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n'
               b'@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n'
               b'@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n')
        self.assertEqual(obs, exp)

        with tempfile.NamedTemporaryFile('r+', suffix='.fa',
                                         delete=False) as obs_fa:
            pass
        self.to_remove.append(obs_fa.name)

        # Explicit fasta output drops the qual lines
        to_ascii_file(demux_f.name, obs_fa.name, out_format='fasta')
        with open(obs_fa.name, 'rb') as obs_f:
            obs = obs_f.read()
        exp = (b'>a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n'
               b'>b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n'
               b'>b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n')
        self.assertEqual(obs, exp)

        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as obs_fq:
            pass
        self.to_remove.append(obs_fq.name)

        # Restricting to sample 'b' excludes sample 'a' records
        to_ascii_file(demux_f.name, obs_fq.name, samples=['b'])
        with open(obs_fq.name, 'rb') as obs_f:
            obs = obs_f.read()
        exp = (b'@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n'
               b'@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n')
        self.assertEqual(obs, exp)
Example #6
0
 def test_get(self):
     """Fetching the EBI submission page for artifact 2 returns HTTP 200."""
     demux_candidates = [fp for _, fp, fp_type in Artifact(2).filepaths
                         if fp_type == 'preprocessed_demux']
     demux_fp = demux_candidates[0]
     fd, fna_fp = mkstemp(suffix='_seqs.fna')
     close(fd)
     self._clean_up_files.extend([fna_fp, demux_fp])
     with open(fna_fp, 'w') as fasta_handle:
         fasta_handle.write('>a_1 X orig_bc=X new_bc=X bc_diffs=0\nCCC')
     # Regenerate the demux file so it matches the FASTA just written
     with File(demux_fp, "w") as demux_handle:
         to_hdf5(fna_fp, demux_handle)
     BaseHandler.get_current_user = Mock(return_value=User("*****@*****.**"))
     response = self.get("/ebi_submission/2")
     self.assertEqual(response.code, 200)
Example #7
0
 def test_get(self):
     """The EBI submission handler responds successfully for artifact 2."""
     # Locate the artifact's preprocessed demux filepath
     demux_files = [fp for _, fp, fp_type in Artifact(2).filepaths
                    if fp_type == 'preprocessed_demux']
     demux_fp = demux_files[0]
     fd, fna_fp = mkstemp(suffix='_seqs.fna')
     close(fd)
     self._clean_up_files.extend([fna_fp, demux_fp])
     with open(fna_fp, 'w') as out:
         out.write('>a_1 X orig_bc=X new_bc=X bc_diffs=0\nCCC')
     with File(demux_fp, "w") as out:
         to_hdf5(fna_fp, out)
     BaseHandler.get_current_user = Mock(return_value=User("*****@*****.**"))
     response = self.get("/ebi_submission/2")
     self.assertEqual(response.code, 200)
Example #8
0
    def test_to_ascii(self):
        """to_ascii yields one fastq record per demultiplexed sequence."""
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as handle:
            handle.write(fqdata)

        self.to_remove.append(handle.name)
        to_hdf5(handle.name, self.hdf5_file)

        expected = [
            b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
            b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
            b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n",
        ]
        observed = list(to_ascii(self.hdf5_file, samples=[b'a', b'b']))
        self.assertEqual(observed, expected)
    def _generate_files(self, sample_names):
        """Writes a formatted FASTQ and its HDF5 demux file for the tests.

        Parameters
        ----------
        sample_names : dict
            Values interpolated into FASTQ_SEQS via str.format

        Returns
        -------
        tuple of (str, str, str)
            The demux filepath, the fastq filepath and a fresh output dir
        """
        fd, fastq_fp = mkstemp(suffix=".fastq")
        close(fd)
        with open(fastq_fp, 'w') as f:
            f.write(FASTQ_SEQS.format(**sample_names))

        demux_fp = "%s.demux" % fastq_fp
        # BUG FIX: File() was opened without a mode; the demux path does not
        # exist yet, so open in 'w' to create a fresh HDF5 file (matches the
        # other copy of this helper)
        with File(demux_fp, 'w') as f:
            to_hdf5(fastq_fp, f)

        out_dir = mkdtemp()

        self._clean_up_files.extend([fastq_fp, demux_fp, out_dir])

        return demux_fp, fastq_fp, out_dir
Example #10
0
    def _generate_files(self, sample_names):
        """Builds the fastq and HDF5 demux fixtures used by the tests."""
        fd, fastq_fp = mkstemp(suffix=".fastq")
        close(fd)
        # Fill the fastq template with the requested sample names
        with open(fastq_fp, 'w') as fastq_handle:
            fastq_handle.write(FASTQ_SEQS.format(**sample_names))

        demux_fp = fastq_fp + ".demux"
        with File(demux_fp, 'w') as demux_handle:
            to_hdf5(fastq_fp, demux_handle)

        out_dir = mkdtemp()
        self._clean_up_files.extend([fastq_fp, demux_fp, out_dir])
        return demux_fp, fastq_fp, out_dir
Example #11
0
    def test_to_ascii_fasta(self):
        """to_ascii emits fasta-style records when built from fasta input."""
        with tempfile.NamedTemporaryFile('r+', suffix='.fna',
                                         delete=False) as handle:
            handle.write(seqdata)

        self.to_remove.append(handle.name)
        to_hdf5(handle.name, self.hdf5_file)

        expected = [
            b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
            b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
            b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
            b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
            b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n",
        ]
        observed = list(to_ascii(self.hdf5_file, samples=[b'a', b'b']))
        self.assertEqual(observed, expected)
Example #12
0
    def test_to_per_sample_ascii(self):
        """to_per_sample_ascii groups fastq records by sample id."""
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as handle:
            handle.write(fqdata)

        self.to_remove.append(handle.name)
        to_hdf5(handle.name, self.hdf5_file)

        sample_a = [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n"]
        sample_b = [b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
                    b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n"]
        expected = [(b'a', sample_a), (b'b', sample_b)]

        observed = []
        for sample, records in to_per_sample_ascii(self.hdf5_file):
            observed.append((sample, list(records)))
        self.assertEqual(observed, expected)
Example #13
0
    def test_submit_to_EBI(self):
        """Submits a demux artifact to EBI and expects the job to fail."""
        # setting up test
        fna_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        with open(fna_fp, 'w') as f:
            f.write(FASTA_EXAMPLE)
        with File(demux_fp, "w") as f:
            to_hdf5(fna_fp, f)

        pt = PrepTemplate(1)
        params = Parameters.from_default_params(DefaultParameters(1),
                                                {'input_data': pt.artifact.id})
        artifact = Artifact.create([(demux_fp, 6)],
                                   "Demultiplexed",
                                   parents=[pt.artifact],
                                   processing_parameters=params)

        # submit job
        job = self._create_job('submit_to_EBI', {
            'artifact': artifact.id,
            'submission_type': 'VALIDATE'
        })
        job._set_status('in_construction')
        job.submit()

        # wait for the job to fail, and check that the status is submitting
        # NOTE(review): this is a busy-wait with no sleep inside the loop; it
        # spins at full CPU until the job reaches 'error'
        checked_submitting = True
        while job.status != 'error':
            if checked_submitting:
                self.assertEqual('submitting',
                                 artifact.study.ebi_submission_status)
                checked_submitting = False
        # once it fails wait for a few to check status again
        sleep(5)
        exp = 'Some artifact submissions failed: %d' % artifact.id
        obs = artifact.study.ebi_submission_status
        self.assertEqual(obs, exp)
        # make sure that the error is correct, we have 2 options
        if environ.get('ASPERA_SCP_PASS', '') != '':
            self.assertIn('1.SKM2.640199', job.log.msg)
        else:
            self.assertIn('ASCP Error:', job.log.msg)
        # wait for everything to finish to avoid DB deadlocks
        sleep(5)
Example #14
0
    def test_fetch_qual_length_bug(self):
        # Regression test: fetch used to return qual scores for positions
        # beyond the end of the sequence instead of trimming to its length.
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as handle:
            handle.write(fqdata_variable_length)

        self.to_remove.append(handle.name)
        to_hdf5(handle.name, self.hdf5_file)

        sample_a = [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n"]
        sample_b = [
            b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
            b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n",
        ]
        expected = [(b'a', sample_a), (b'b', sample_b)]

        observed = [(sample, list(records))
                    for sample, records in to_per_sample_ascii(self.hdf5_file)]
        self.assertEqual(observed, expected)
Example #15
0
    def test_submit_to_EBI(self):
        """Submits a demux artifact to EBI and expects the job to fail."""
        # setting up test
        fna_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        with open(fna_fp, 'w') as f:
            f.write(FASTA_EXAMPLE)
        with File(demux_fp, "w") as f:
            to_hdf5(fna_fp, f)

        pt = PrepTemplate(1)
        params = Parameters.from_default_params(
            DefaultParameters(1), {'input_data': pt.artifact.id})
        artifact = Artifact.create(
            [(demux_fp, 6)], "Demultiplexed", parents=[pt.artifact],
            processing_parameters=params)

        # submit job
        job = self._create_job('submit_to_EBI', {
            'artifact': artifact.id, 'submission_type': 'VALIDATE'})
        job._set_status('in_construction')
        job.submit()

        # wait for the job to fail, and check that the status is submitting
        # NOTE(review): busy-wait with no sleep inside the loop; it spins at
        # full CPU until the job reaches 'error'
        checked_submitting = True
        while job.status != 'error':
            if checked_submitting:
                self.assertEqual('submitting',
                                 artifact.study.ebi_submission_status)
                checked_submitting = False
        # once it fails wait for a few to check status again
        sleep(5)
        exp = 'Some artifact submissions failed: %d' % artifact.id
        obs = artifact.study.ebi_submission_status
        self.assertEqual(obs, exp)
        # make sure that the error is correct, we have 2 options
        if environ.get('ASPERA_SCP_PASS', '') != '':
            self.assertIn('1.SKM2.640199', job.log.msg)
        else:
            self.assertIn('ASCP Error:', job.log.msg)
        # wait for everything to finish to avoid DB deadlocks
        sleep(5)
Example #16
0
    def write_demux_files(self, prep_template, generate_hdf5=True):
        """Writes a demux test file to avoid duplication of code"""
        fna_fp = join(self.temp_dir, "seqs.fna")
        demux_fp = join(self.temp_dir, "demux.seqs")
        if generate_hdf5:
            # Build a real HDF5 demux file from the example FASTA
            with open(fna_fp, "w") as fasta_handle:
                fasta_handle.write(FASTA_EXAMPLE)
            with File(demux_fp, "w") as demux_handle:
                to_hdf5(fna_fp, demux_handle)
        else:
            # An empty placeholder file stands in for the demux
            with open(demux_fp, "w") as demux_handle:
                demux_handle.write("")

        if prep_template.artifact is None:
            # No parent artifact: attach the new artifact to the prep template
            ppd = Artifact.create(
                [(demux_fp, 6)],
                "Demultiplexed",
                prep_template=prep_template)
        else:
            # Derive the new artifact from the prep template's existing one
            params = Parameters.from_default_params(
                DefaultParameters(1),
                {"input_data": prep_template.artifact.id})
            ppd = Artifact.create(
                [(demux_fp, 6)],
                "Demultiplexed",
                parents=[prep_template.artifact],
                processing_parameters=params)
        return ppd
Example #17
0
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    tuple of (bool, dict or None, str or None)
        Whether the validation succeeded, the artifact information, and the
        error message (every return path in this function is a 3-tuple, not
        a dict)
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")

    # Reject any filepath type outside this whitelist
    supported_fp_types = {
        'preprocessed_fasta', 'preprocessed_fastq', 'preprocessed_demux', 'log'
    }
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        error_msg = ("Filepath type(s) %s not supported by artifact type "
                     "Demultiplexed. Supported filepath types: %s" %
                     (', '.join(unsupported_fp_types), ', '.join(
                         sorted(supported_fp_types))))
        return False, None, error_msg

    # At most one file of each type can be provided
    offending = set(fp_t for fp_t, fps in files.items() if len(fps) > 1)
    if offending:
        errors = [
            "%s (%d): %s" % (fp_t, len(files[fp_t]), ', '.join(files[fp_t]))
            for fp_t in sorted(offending)
        ]
        error_msg = ("Only one filepath of each file type is supported, "
                     "offending types:\n%s" % "; ".join(errors))
        return False, None, error_msg

    # Check which files we have available:
    fasta = (files['preprocessed_fasta'][0]
             if 'preprocessed_fasta' in files else None)
    fastq = (files['preprocessed_fastq'][0]
             if 'preprocessed_fastq' in files else None)
    demux = (files['preprocessed_demux'][0]
             if 'preprocessed_demux' in files else None)
    log = (files['log'][0] if 'log' in files else None)
    # Preference order: demux > fastq > fasta (demux is the richest format)
    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        success, a_info, error_msg = _validate_demux_file(qclient,
                                                          job_id,
                                                          prep_info,
                                                          out_dir,
                                                          demux,
                                                          log_fp=log)
    elif fastq:
        # Generate the demux file from the fastq
        demux = join(out_dir, "%s.demux" % splitext(basename(fastq))[0])
        with File(demux, 'w') as f:
            # to_hdf5 expects a list
            to_hdf5([fastq], f)
        # Validate the demux, providing the original fastq
        success, a_info, error_msg = _validate_demux_file(qclient,
                                                          job_id,
                                                          prep_info,
                                                          out_dir,
                                                          demux,
                                                          fastq_fp=fastq,
                                                          log_fp=log)
    elif fasta:
        # Generate the demux file from the fasta
        demux = join(out_dir, "%s.demux" % splitext(basename(fasta))[0])
        with File(demux, 'w') as f:
            # to_hdf5 expects a list
            to_hdf5([fasta], f)
        # Validate the demux, providing the original fasta
        success, a_info, error_msg = _validate_demux_file(qclient,
                                                          job_id,
                                                          prep_info,
                                                          out_dir,
                                                          demux,
                                                          fasta_fp=fasta,
                                                          log_fp=log)
    else:
        error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                     "'preprocessed_fasta' file should be provided.")
        return False, None, error_msg

    return success, a_info, error_msg
Example #18
0
    def generate_new_study_with_preprocessed_data(self):
        """Creates a new study up to the processed data for testing"""
        # Minimal study information; the StudyPerson ids presumably reference
        # existing test-database rows — confirm against the test fixtures
        info = {
            "timeseries_type_id": 1,
            "metadata_complete": True,
            "mixs_compliant": True,
            "number_samples_collected": 3,
            "number_samples_promised": 3,
            "study_alias": "Test EBI",
            "study_description": "Study for testing EBI",
            "study_abstract": "Study for testing EBI",
            "emp_person_id": StudyPerson(2),
            "principal_investigator_id": StudyPerson(3),
            "lab_person_id": StudyPerson(1)
        }
        study = Study.create(User('*****@*****.**'), "Test EBI study", info)
        # Per-sample metadata for the sample template
        metadata_dict = {
            'Sample1': {
                'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
                'physical_specimen_location': 'location1',
                'taxon_id': 9606,
                'scientific_name': 'h**o sapiens',
                'Description': 'Test Sample 1'
            },
            'Sample2': {
                'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
                'physical_specimen_location': 'location1',
                'taxon_id': 9606,
                'scientific_name': 'h**o sapiens',
                'Description': 'Test Sample 2'
            },
            'Sample3': {
                'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
                'physical_specimen_location': 'location1',
                'taxon_id': 9606,
                'scientific_name': 'h**o sapiens',
                'Description': 'Test Sample 3'
            }
        }
        metadata = pd.DataFrame.from_dict(metadata_dict,
                                          orient='index',
                                          dtype=str)
        SampleTemplate.create(metadata, study)
        # Prep information (primers/barcodes) for the 16S prep template
        metadata_dict = {
            'Sample1': {
                'primer': 'GTGCCAGCMGCCGCGGTAA',
                'barcode': 'CGTAGAGCTCTC',
                'center_name': 'KnightLab',
                'platform': 'ILLUMINA',
                'instrument_model': 'Illumina MiSeq',
                'library_construction_protocol': 'Protocol ABC',
                'experiment_design_description': "Random value 1"
            },
            'Sample2': {
                'primer': 'GTGCCAGCMGCCGCGGTAA',
                'barcode': 'CGTAGAGCTCTA',
                'center_name': 'KnightLab',
                'platform': 'ILLUMINA',
                'instrument_model': 'Illumina MiSeq',
                'library_construction_protocol': 'Protocol ABC',
                'experiment_design_description': "Random value 2"
            },
            'Sample3': {
                'primer': 'GTGCCAGCMGCCGCGGTAA',
                'barcode': 'CGTAGAGCTCTT',
                'center_name': 'KnightLab',
                'platform': 'ILLUMINA',
                'instrument_model': 'Illumina MiSeq',
                'library_construction_protocol': 'Protocol ABC',
                'experiment_design_description': "Random value 3"
            },
        }
        metadata = pd.DataFrame.from_dict(metadata_dict,
                                          orient='index',
                                          dtype=str)
        pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
        # Write the FASTA and convert it into the HDF5 demux format
        fna_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        with open(fna_fp, 'w') as f:
            f.write(FASTA_EXAMPLE_2.format(study.id))
        with File(demux_fp, 'w') as f:
            to_hdf5(fna_fp, f)

        # presumably 6 is the preprocessed-demux filepath type id — confirm
        ppd = Artifact.create([(demux_fp, 6)],
                              "Demultiplexed",
                              prep_template=pt)

        return ppd
Example #19
0
    def generate_new_study_with_preprocessed_data(self):
        """Creates a new study up to the processed data for testing"""
        # Minimal study information; the StudyPerson ids presumably reference
        # existing test-database rows — confirm against the test fixtures
        info = {
            "timeseries_type_id": 1,
            "metadata_complete": True,
            "mixs_compliant": True,
            "number_samples_collected": 3,
            "number_samples_promised": 3,
            "study_alias": "Test EBI",
            "study_description": "Study for testing EBI",
            "study_abstract": "Study for testing EBI",
            "emp_person_id": StudyPerson(2),
            "principal_investigator_id": StudyPerson(3),
            "lab_person_id": StudyPerson(1)
        }
        study = Study.create(User('*****@*****.**'), "Test EBI study", info)
        # Per-sample metadata for the sample template
        metadata_dict = {
            'Sample1': {'collection_timestamp': datetime(2015, 6, 1, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'h**o sapiens',
                        'Description': 'Test Sample 1'},
            'Sample2': {'collection_timestamp': datetime(2015, 6, 2, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'h**o sapiens',
                        'Description': 'Test Sample 2'},
            'Sample3': {'collection_timestamp': datetime(2015, 6, 3, 7, 0, 0),
                        'physical_specimen_location': 'location1',
                        'taxon_id': 9606,
                        'scientific_name': 'h**o sapiens',
                        'Description': 'Test Sample 3'}
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                          dtype=str)
        SampleTemplate.create(metadata, study)
        # Prep information (primers/barcodes) for the 16S prep template
        metadata_dict = {
            'Sample1': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTC',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 1"},
            'Sample2': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTA',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 2"},
            'Sample3': {'primer': 'GTGCCAGCMGCCGCGGTAA',
                        'barcode': 'CGTAGAGCTCTT',
                        'center_name': 'KnightLab',
                        'platform': 'ILLUMINA',
                        'instrument_model': 'Illumina MiSeq',
                        'library_construction_protocol': 'Protocol ABC',
                        'experiment_design_description': "Random value 3"},
        }
        metadata = pd.DataFrame.from_dict(metadata_dict, orient='index',
                                          dtype=str)
        pt = PrepTemplate.create(metadata, study, "16S", 'Metagenomics')
        # Write the FASTA and convert it into the HDF5 demux format
        fna_fp = join(self.temp_dir, 'seqs.fna')
        demux_fp = join(self.temp_dir, 'demux.seqs')
        with open(fna_fp, 'w') as f:
            f.write(FASTA_EXAMPLE_2.format(study.id))
        with File(demux_fp, 'w') as f:
            to_hdf5(fna_fp, f)

        # presumably 6 is the preprocessed-demux filepath type id — confirm
        ppd = Artifact.create(
            [(demux_fp, 6)], "Demultiplexed", prep_template=pt)

        return ppd
Example #20
0
def _validate_demultiplexed(qclient, job_id, prep_info, files, out_dir):
    """Validate and fix a new 'Demultiplexed' artifact

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    files : dict of {str: list of str}
        The files to add to the new artifact, keyed by filepath type
    out_dir : str
        The output directory

    Returns
    -------
    (bool, dict or None, str or None)
        Whether the validation succeeded, the artifact information and the
        error message (if any), as produced by `_validate_demux_file`
    """
    qclient.update_job_step(job_id, "Step 2: Validating 'Demultiplexed' files")

    supported_fp_types = {'preprocessed_fasta', 'preprocessed_fastq',
                          'preprocessed_demux', 'log'}
    unsupported_fp_types = set(files) - supported_fp_types
    if unsupported_fp_types:
        # Sort both lists so the error message is deterministic; iterating
        # a set of strings varies run-to-run with hash randomization
        error_msg = ("Filepath type(s) %s not supported by artifact type "
                     "Demultiplexed. Supported filepath types: %s"
                     % (', '.join(sorted(unsupported_fp_types)),
                        ', '.join(sorted(supported_fp_types))))
        return False, None, error_msg

    # At most one file of each type can be provided
    offending = {fp_t for fp_t, fps in files.items() if len(fps) > 1}
    if offending:
        errors = ["%s (%d): %s"
                  % (fp_t, len(files[fp_t]), ', '.join(files[fp_t]))
                  for fp_t in sorted(offending)]
        error_msg = ("Only one filepath of each file type is supported, "
                     "offending types:\n%s" % "; ".join(errors))
        return False, None, error_msg

    # Check which files we have available; at this point each present type
    # maps to exactly one filepath
    fasta = files.get('preprocessed_fasta', [None])[0]
    fastq = files.get('preprocessed_fastq', [None])[0]
    demux = files.get('preprocessed_demux', [None])[0]
    log = files.get('log', [None])[0]

    if demux:
        # If demux is available, use that one to perform the validation and
        # generate the fasta and fastq from it
        return _validate_demux_file(
            qclient, job_id, prep_info, out_dir, demux, log_fp=log)

    if not fastq and not fasta:
        error_msg = ("Either a 'preprocessed_demux', 'preprocessed_fastq' or "
                     "'preprocessed_fasta' file should be provided.")
        return False, None, error_msg

    # Generate the demux file from the fastq (preferred) or the fasta, then
    # validate it, passing the original source file through to the validator
    # under the matching keyword argument
    src, src_kwarg = (fastq, 'fastq_fp') if fastq else (fasta, 'fasta_fp')
    demux = join(out_dir, "%s.demux" % splitext(basename(src))[0])
    with open_file(demux, "w") as f:
        to_hdf5(src, f)
    return _validate_demux_file(
        qclient, job_id, prep_info, out_dir, demux, log_fp=log,
        **{src_kwarg: src})
Example #21
0
    def generate_new_study_with_preprocessed_data(self):
        """Create a fresh study populated up to the preprocessed data.

        Builds a study with a 3-sample sample template and a matching 16S
        prep template, writes the example FASTA, converts it to a demux
        HDF5 file, and registers that file as a 'Demultiplexed' artifact.

        Returns
        -------
        The 'Demultiplexed' artifact created from the demux file.
        """
        study_info = {
            "timeseries_type_id": 1,
            "metadata_complete": True,
            "mixs_compliant": True,
            "number_samples_collected": 3,
            "number_samples_promised": 3,
            "study_alias": "Test EBI",
            "study_description": "Study for testing EBI",
            "study_abstract": "Study for testing EBI",
            "emp_person_id": StudyPerson(2),
            "principal_investigator_id": StudyPerson(3),
            "lab_person_id": StudyPerson(1),
        }
        study = Study.create(
            User("*****@*****.**"), "Test EBI study", [1], study_info)

        # Sample template: three samples that differ only in collection day
        # and description number
        sample_md = {}
        for idx in (1, 2, 3):
            sample_md["Sample%d" % idx] = {
                "collection_timestamp": datetime(2015, 6, idx, 7, 0, 0),
                "physical_specimen_location": "location1",
                "taxon_id": 9606,
                "scientific_name": "h**o sapiens",
                "Description": "Test Sample %d" % idx,
            }
        SampleTemplate.create(
            pd.DataFrame.from_dict(sample_md, orient="index", dtype=str),
            study)

        # Prep template: same primer/center/platform for every sample; only
        # the barcode and the design description vary per sample
        barcodes = ("CGTAGAGCTCTC", "CGTAGAGCTCTA", "CGTAGAGCTCTT")
        prep_md = {}
        for idx, barcode in enumerate(barcodes, start=1):
            prep_md["Sample%d" % idx] = {
                "primer": "GTGCCAGCMGCCGCGGTAA",
                "barcode": barcode,
                "center_name": "KnightLab",
                "platform": "ILLUMINA",
                "instrument_model": "Illumina MiSeq",
                "library_construction_protocol": "Protocol ABC",
                "experiment_design_description": "Random value %d" % idx,
            }
        pt = PrepTemplate.create(
            pd.DataFrame.from_dict(prep_md, orient="index", dtype=str),
            study, "16S", "Metagenomics")

        # Write the example FASTA and build the demux HDF5 file from it
        fna_fp = join(self.temp_dir, "seqs.fna")
        demux_fp = join(self.temp_dir, "demux.seqs")
        with open(fna_fp, "w") as f:
            f.write(FASTA_EXAMPLE_2.format(study.id))
        with File(demux_fp, "w") as f:
            to_hdf5(fna_fp, f)

        # 6 is the filepath type id attached to the demux file — presumably
        # 'preprocessed_demux'; confirm against the filepath_type table
        return Artifact.create(
            [(demux_fp, 6)], "Demultiplexed", prep_template=pt)