    def test_to_per_sample_files(self):
        with tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                         delete=False) as f:
            f.write(fqdata_variable_length)
        self.to_remove.append(f.name)
        with tempfile.NamedTemporaryFile('r+', suffix='.demux',
                                         delete=False) as demux_f:
            pass
        self.to_remove.append(demux_f.name)
        with h5py.File(demux_f.name, 'w') as demux:
            to_hdf5(f.name, demux)

        tmp_dir = tempfile.mkdtemp()
        self.to_remove.append(tmp_dir)
        path_builder = partial(os.path.join, tmp_dir)

        # Test to fastq
        to_per_sample_files(demux_f.name, out_dir=tmp_dir, n_jobs=1,
                            out_format='fastq')
        sample_a_path = path_builder("a.fastq")
        sample_b_path = path_builder("b.fastq")
        self.assertTrue(os.path.exists(sample_a_path))
        self.assertTrue(os.path.exists(sample_b_path))
        with open(sample_a_path, 'rb') as af:
            obs = af.read()
        self.assertEqual(
            obs, b'@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n')
        with open(sample_b_path, 'rb') as bf:
            obs = bf.read()
        self.assertEqual(
            obs, b'@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n'
                 b'@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\nDEF#G\n')

        # Test to fasta and parallel
        to_per_sample_files(demux_f.name, out_dir=tmp_dir, n_jobs=2,
                            out_format='fasta')
        sample_a_path = path_builder("a.fna")
        sample_b_path = path_builder("b.fna")
        self.assertTrue(os.path.exists(sample_a_path))
        self.assertTrue(os.path.exists(sample_b_path))
        with open(sample_a_path, 'rb') as af:
            obs = af.read()
        self.assertEqual(
            obs, b'>a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n')
        with open(sample_b_path, 'rb') as bf:
            obs = bf.read()
        self.assertEqual(
            obs, b'>b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n'
                 b'>b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n')
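# A minimal sketch (not part of the test suite) of the round-trip that
# test_to_per_sample_files exercises, assuming to_hdf5 and
# to_per_sample_files are importable from qiita_files.demux (the import
# path this repo's tests appear to use); all file names are hypothetical.
def _example_demux_roundtrip():
    import h5py
    from qiita_files.demux import to_hdf5, to_per_sample_files

    # pack a demultiplexed FASTQ into a single HDF5 .demux file ...
    with h5py.File('seqs.demux', 'w') as demux:
        to_hdf5('seqs.fq', demux)
    # ... then split it back into one file per sample, using two workers
    # and writing FASTA instead of the default FASTQ
    to_per_sample_files('seqs.demux', out_dir='per_sample',
                        n_jobs=2, out_format='fasta')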
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element;
    if not, it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')

    # Step 1: get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 4: Collecting information")
    artifact_id = parameters['Demultiplexed sequences']
    # removing input from parameters so it's not part of the final command
    del parameters['Demultiplexed sequences']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2: generate the deblur command
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                        "from demux (1/2)")

        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = int(parameters['Jobs to start'])
        # [0] because there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)

        qclient.update_job_step(job_id, "Step 2 of 4: Generating per sample "
                                        "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 4: Generating deblur "
                                        "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3: execute deblur
    qclient.update_job_step(job_id, "Step 3 of 4: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('all.biom')
    final_seqs = pb('all.seqs.fa')
    final_biom_hit = pb('reference-hit.biom')
    final_seqs_hit = pb('reference-hit.seqs.fa')

    if not exists(final_biom_hit):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_hit, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_hit):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_hit, 'w') as f:
            f.write("")

    # Step 4: communicate with the archive to check and generate placements
    qclient.update_job_step(job_id, "Step 4 of 4 (1/4): Retrieving "
                                    "observations information")
    features = list(load_table(final_biom_hit).ids(axis='observation'))

    fp_phylogeny = None
    if features:
        observations = qclient.post(
            "/qiita_db/archive/observations/",
            data={'job_id': job_id, 'features': features})
        novel_fragments = list(set(features) - set(observations.keys()))

        qclient.update_job_step(job_id, "Step 4 of 4 (2/4): Generating %d "
                                        "new placements"
                                        % len(novel_fragments))

        # Once we support alternative reference phylogenies for SEPP in the
        # future, we need to translate the reference name here into
        # filepaths pointing to the correct reference alignment and
        # reference tree. If left 'None' the Greengenes 13.8 reference
        # shipped with the fragment-insertion conda package will be used.
        fp_reference_alignment = None
        fp_reference_phylogeny = None
        fp_reference_template = None
        fp_reference_rename = None
        if 'Reference phylogeny for SEPP' in parameters:
            if parameters['Reference phylogeny for SEPP'] == 'tiny':
                fp_reference_alignment = qp_deblur.get_data(join(
                    'sepp', 'reference_alignment_tiny.fasta'))
                fp_reference_phylogeny = qp_deblur.get_data(join(
                    'sepp', 'reference_phylogeny_tiny.nwk'))
                fp_reference_template = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_placement.json'))
                fp_reference_rename = qp_deblur.get_data(join(
                    'sepp', 'tmpl_tiny_rename-json.py'))
        try:
            new_placements = generate_sepp_placements(
                novel_fragments, out_dir, parameters['Threads per sample'],
                reference_alignment=fp_reference_alignment,
                reference_phylogeny=fp_reference_phylogeny)
        except ValueError as e:
            return False, None, str(e)

        qclient.update_job_step(job_id, "Step 4 of 4 (3/4): Archiving %d "
                                        "new placements"
                                        % len(novel_fragments))
        # values need to be json strings as well
        for fragment in new_placements.keys():
            new_placements[fragment] = json.dumps(new_placements[fragment])

        # Fragments that get rejected by a SEPP run don't show up in the
        # placement file; however, being rejected is valuable information
        # and should be stored in the archive as well. Thus, we avoid
        # re-computation for rejected fragments in the future.
        for fragment in novel_fragments:
            if fragment not in new_placements:
                new_placements[fragment] = ""

        if len(new_placements.keys()) > 0:
            qclient.patch(url="/qiita_db/archive/observations/", op="add",
                          path=job_id, value=json.dumps(new_placements))

        # retrieve all fragments and create the actual tree
        qclient.update_job_step(job_id, "Step 4 of 4 (4/4): Composing "
                                        "phylogenetic insertion tree")
        placements = qclient.post(
            "/qiita_db/archive/observations/",
            data={'job_id': job_id, 'features': features})
        # Remove fragments that have been rejected by SEPP, i.e. whose
        # placement is the empty string, and convert all other placements
        # from string to json
        placements = {frag: json.loads(placements[frag])
                      for frag, plc in placements.items()
                      if plc != ''}
        try:
            fp_phylogeny = generate_insertion_trees(
                placements, out_dir,
                reference_template=fp_reference_template,
                reference_rename=fp_reference_rename)
        except ValueError as e:
            return False, None, str(e)
    else:
        new_placements = None

    ainfo = [ArtifactInfo('deblur final table', 'BIOM',
                          [(final_biom, 'biom'),
                           (final_seqs, 'preprocessed_fasta')])]
    if fp_phylogeny is not None:
        ainfo.append(
            ArtifactInfo('deblur reference hit table', 'BIOM',
                         [(final_biom_hit, 'biom'),
                          (final_seqs_hit, 'preprocessed_fasta'),
                          (fp_phylogeny, 'plain_text')], new_placements))

    return True, ainfo, ""
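# A hedged, illustrative sketch of how a plugin runner might invoke the
# handler above and consume its (success, artifacts, error) contract.
# `qclient`, `job_id`, and every parameter value here are hypothetical,
# and it assumes qiita_client's ArtifactInfo exposes output_name and
# files attributes.
def _example_run_deblur(qclient, job_id, out_dir):
    parameters = {
        'Demultiplexed sequences': 42,   # hypothetical input artifact id
        'Jobs to start': '4',            # cast to int() by the handler
        'Threads per sample': 1,
        'Reference phylogeny for SEPP': 'tiny',
    }
    success, artifacts, error_msg = deblur(qclient, job_id, parameters,
                                           out_dir)
    if not success:
        raise RuntimeError(error_msg)
    # each ArtifactInfo carries (filepath, filepath_type) pairs for Qiita
    for ainfo in artifacts:
        print(ainfo.output_name, ainfo.files)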
def deblur(qclient, job_id, parameters, out_dir):
    """Run deblur with the given parameters

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    parameters : dict
        The parameter values to run deblur
    out_dir : str
        The path to the job's output directory

    Returns
    -------
    boolean, list, str
        The results of the job

    Notes
    -----
    The code will check if the artifact has a preprocessed_demux element;
    if not, it will use the preprocessed_fastq. We prefer to work with the
    preprocessed_demux as running time will be greatly improved
    """
    out_dir = join(out_dir, 'deblur_out')

    # Step 1: get the rest of the information needed to run deblur
    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")
    artifact_id = parameters['seqs-fp']
    # removing input from parameters so it's not part of the final command
    del parameters['seqs-fp']

    # Get the artifact filepath information
    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
    fps = artifact_info['files']

    # Step 2: generate the deblur command
    if 'preprocessed_demux' in fps:
        qclient.update_job_step(
            job_id, "Step 2 of 3: Generating per sample "
                    "from demux (1/2)")

        if not exists(out_dir):
            mkdir(out_dir)
        split_out_dir = join(out_dir, 'split')
        if not exists(split_out_dir):
            mkdir(split_out_dir)

        # using the same number of parallel jobs as defined by the command
        n_jobs = parameters['jobs-to-start']
        # [0] because there should be only 1 file
        to_per_sample_files(fps['preprocessed_demux'][0],
                            out_dir=split_out_dir, n_jobs=n_jobs)

        qclient.update_job_step(
            job_id, "Step 2 of 3: Generating per sample "
                    "from demux (2/2)")
        out_dir = join(out_dir, 'deblured')
        cmd = generate_deblur_workflow_commands([split_out_dir],
                                                out_dir, parameters)
    else:
        qclient.update_job_step(job_id, "Step 2 of 3: Generating deblur "
                                        "command")
        cmd = generate_deblur_workflow_commands(fps['preprocessed_fastq'],
                                                out_dir, parameters)

    # Step 3: execute deblur
    qclient.update_job_step(job_id, "Step 3 of 3: Executing deblur job")
    std_out, std_err, return_value = system_call(cmd)
    if return_value != 0:
        error_msg = ("Error running deblur:\nStd out: %s\nStd err: %s"
                     % (std_out, std_err))
        return False, None, error_msg

    # Generating artifact
    pb = partial(join, out_dir)

    # Generate the filepaths
    final_biom = pb('final.biom')
    final_seqs = pb('final.seqs.fa')
    final_biom_16s = pb('final.only-16s.biom')
    final_seqs_na = pb('final.seqs.fa.no_artifacts')

    if not exists(final_biom_16s):
        # Create an empty table. We need to send something to Qiita that is
        # a valid BIOM, so we are going to create an empty table
        t = Table([], [], [])
        with biom_open(final_biom_16s, 'w') as f:
            t.to_hdf5(f, 'qp-deblur generated')

    if not exists(final_seqs_na):
        # Same as before, create an empty sequence file so we can send it
        with open(final_seqs_na, 'w') as f:
            f.write("")

    ainfo = [
        ArtifactInfo('deblur final table', 'BIOM',
                     [(final_biom, 'biom'),
                      (final_seqs, 'preprocessed_fasta')]),
        ArtifactInfo('deblur 16S only table', 'BIOM',
                     [(final_biom_16s, 'biom'),
                      (final_seqs_na, 'preprocessed_fasta')])
    ]

    return True, ainfo, ""
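# For contrast with the newer handler above, a hedged sketch of the
# parameters this older variant expects; the artifact id and job count
# are hypothetical. Note that 'jobs-to-start' is used without an int()
# cast here, so it should already be an integer.
_example_parameters = {
    'seqs-fp': 42,        # hypothetical artifact id of the input sequences
    'jobs-to-start': 4,   # parallel jobs for the demux split and deblur
}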
def persample(input, output, njobs):
    """Split a demux file into per-sample files in a new directory"""
    os.mkdir(output)
    to_per_sample_files(input, out_dir=output, n_jobs=njobs)
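# Hedged usage sketch for the persample helper above; it is typically wired
# up as a CLI entry point, but can also be called directly. The paths below
# are hypothetical:
#
#     persample('seqs.demux', 'per_sample_out', njobs=4)
#
# Note that persample creates the output directory itself, so it will raise
# FileExistsError if 'per_sample_out' already exists.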