def test_to_ascii(self):
    """to_ascii yields the expected FASTQ records for the requested samples."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq', delete=False) as f:
        f.write(fqdata)
        # Flush buffered writes: to_hdf5 re-opens the file by name while this
        # handle is still open, so unflushed data would be invisible to it.
        f.flush()
        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)
    exp = [b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
           b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
           b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n"]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def test_to_ascii(self):
    """Round-trip FASTQ data through HDF5 and check the ASCII records."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq', delete=False) as fh:
        fh.write(fqdata)
        fh.flush()
        fh.close()
        to_hdf5(fh.name, self.hdf5_file)
        self.to_remove.append(fh.name)
    expected = [
        b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\nABC\n",
        b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDFG\n",
        b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\nDEF\n",
    ]
    observed = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(observed, expected)
def test_to_ascii_fasta(self):
    """to_ascii yields FASTA records when the demux was built from FASTA."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fna', delete=False) as f:
        f.write(seqdata)
        # Flush buffered writes: to_hdf5 re-opens the file by name while this
        # handle is still open, so unflushed data would be invisible to it.
        f.flush()
        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)
    exp = [b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
           b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
           b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
           b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
           b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n"]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def test_to_ascii_fasta(self):
    """to_ascii yields FASTA records when the demux was built from FASTA."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fna', delete=False) as f:
        f.write(seqdata)
        # Flush buffered writes before to_hdf5 re-opens the file by name;
        # otherwise the freshly written data may still sit in this handle's
        # buffer and be invisible to the second reader.
        f.flush()
        self.to_remove.append(f.name)
        to_hdf5(f.name, self.hdf5_file)
    exp = [
        b">a_0 orig_bc=abc new_bc=abc bc_diffs=0\nx\n",
        b">a_1 orig_bc=aby new_bc=ybc bc_diffs=2\nxy\n",
        b">a_2 orig_bc=abz new_bc=zbc bc_diffs=3\nxyz\n",
        b">b_0 orig_bc=abx new_bc=xbc bc_diffs=1\nxyz\n",
        b">b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nabcd\n"
    ]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def test_to_ascii(self):
    """to_ascii pads each quality character to 8 bytes in the FASTQ output."""
    with tempfile.NamedTemporaryFile('r+', suffix='.fq', delete=False) as f:
        f.write(fqdata)
        f.flush()
        f.close()
        to_hdf5(f.name, self.hdf5_file)
        self.to_remove.append(f.name)
    # Every continuation literal must carry the b-prefix: Python 3 raises a
    # SyntaxError on implicit concatenation of bytes and str literals.
    exp = [(b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
            b"A\x00\x00\x00\x00\x00\x00\x00"
            b"B\x00\x00\x00\x00\x00\x00\x00"
            b"C\x00\x00\x00\x00\x00\x00\x00\n"),
           (b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
            b"D\x00\x00\x00\x00\x00\x00\x00"
            b"F\x00\x00\x00\x00\x00\x00\x00"
            b"G\x00\x00\x00\x00\x00\x00\x00\n"),
           (b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
            b"D\x00\x00\x00\x00\x00\x00\x00"
            b"E\x00\x00\x00\x00\x00\x00\x00"
            b"F\x00\x00\x00\x00\x00\x00\x00\n")]
    obs = list(to_ascii(self.hdf5_file, samples=['a', 'b']))
    self.assertEqual(obs, exp)
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    dict
        The results of the job, as built by format_payload
    """
    pt_sample_ids = set(prep_info)
    with File(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep
        # template, so try to map them
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                return format_payload(
                    success=False,
                    error_msg='The sample ids in the "run_prefix" columns '
                              'from the prep information do not match the '
                              'ones in the demux file. Please, correct the '
                              'column "run_prefix" in the prep information to '
                              'map the existing sample ids to the prep '
                              'information sample ids.')
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                return format_payload(
                    success=False,
                    error_msg='The sample ids in the demultiplexed files do '
                              'not match the ones in the prep information. '
                              'Please, provide the column "run_prefix" in '
                              'the prep information to map the existing sample'
                              ' ids to the prep information sample ids.')

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        copy(demux_fp, new_demux_fp)
        with File(new_demux_fp, 'r+') as f:
            # Snapshot the keys before renaming: calling f.move while
            # iterating the live File object would mutate the container
            # mid-iteration.
            for old in list(f):
                f.move(old, id_map[old])
        # When we fix, we always regenerate the FASTQ and FASTA files.
        # Setting them to None triggers the generation below.
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        with open(fastq_fp, 'w') as fq:
            with File(demux_fp, 'r') as dx:
                for record in to_ascii(dx):
                    fq.write(record)

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        with open(fasta_fp, 'w') as f:
            for r in load(fastq_fp):
                f.write(format_fasta_record(r['SequenceID'], r['Sequence'],
                                            r['Qual']))

    filepaths = [[[fastq_fp], 'preprocessed_fastq'],
                 [[fasta_fp], 'preprocessed_fasta'],
                 [[demux_fp], 'preprocessed_demux']]
    if log_fp:
        filepaths.append([[log_fp], 'log'])
    return format_payload(
        success=True, artifacts_info=[[None, 'Demultiplexed', filepaths]])
def _validate_demux_file(qclient, job_id, prep_info, out_dir, demux_fp,
                         fastq_fp=None, fasta_fp=None, log_fp=None):
    """Validate and fix a 'demux' file and regenerate fastq and fasta files

    Parameters
    ----------
    qclient : qiita_client.QiitaClient
        The Qiita server client
    job_id : str
        The job id
    prep_info : dict of {str: dict of {str: str}}
        The prep information keyed by sample id
    out_dir : str
        The output directory
    demux_fp : str
        The demux file path
    fastq_fp : str, optional
        The original fastq filepath. If demux is correct, it will not be
        regenerated
    fasta_fp : str, optional
        The original fasta filepath. If demux is correct, it will not be
        regenerated
    log_fp : str, optional
        The original log filepath

    Returns
    -------
    bool, list of list or None, str
        Whether the validation succeeded, the artifacts information on
        success (None on failure), and the error message (empty on success)
    """
    pt_sample_ids = set(prep_info)
    with File(demux_fp) as f:
        demux_sample_ids = set(f.keys())

    if not pt_sample_ids.issuperset(demux_sample_ids):
        # The demux sample ids are different from the ones in the prep
        # template, so try to map them
        qclient.update_job_step(job_id, "Step 3: Fixing sample ids")
        # Attempt 1: the user provided the run prefix column - in this case
        # the run prefix column holds the sample ids present in the demux file
        if 'run_prefix' in prep_info[next(iter(pt_sample_ids))]:
            id_map = {v['run_prefix']: k for k, v in prep_info.items()}
            if not set(id_map).issuperset(demux_sample_ids):
                error_msg = ('The sample ids in the "run_prefix" columns '
                             'from the prep information do not match the '
                             'ones in the demux file. Please, correct the '
                             'column "run_prefix" in the prep information to '
                             'map the existing sample ids to the prep '
                             'information sample ids.')
                return False, None, error_msg
        else:
            # Attempt 2: the sample ids in the demux table are the same that
            # in the prep template but without the prefix
            prefix = next(iter(pt_sample_ids)).split('.', 1)[0]
            prefixed = set("%s.%s" % (prefix, s) for s in demux_sample_ids)
            if pt_sample_ids.issuperset(prefixed):
                id_map = {s: "%s.%s" % (prefix, s) for s in demux_sample_ids}
            else:
                # There is nothing we can do. The samples in the demux file do
                # not match the ones in the prep template and we can't fix it
                error_msg = ('The sample ids in the demultiplexed files do '
                             'not match the ones in the prep information. '
                             'Please, provide the column "run_prefix" in '
                             'the prep information to map the existing sample'
                             ' ids to the prep information sample ids.')
                return False, None, error_msg

        # Fix the sample ids
        # Do not modify the original demux file, copy it to a new location
        new_demux_fp = join(out_dir, basename(demux_fp))
        copy(demux_fp, new_demux_fp)
        with File(new_demux_fp, 'r+') as f:
            # Snapshot the keys before renaming: calling f.move while
            # iterating the live File object would mutate the container
            # mid-iteration.
            for old in list(f):
                f.move(old, id_map[old])
        # When we fix, we always regenerate the FASTQ and FASTA files.
        # Setting them to None triggers the generation below.
        demux_fp = new_demux_fp
        fastq_fp = None
        fasta_fp = None

    # If we didn't fix anything, we only generate the files if they don't
    # already exist
    name = splitext(basename(demux_fp))[0]
    if not fastq_fp:
        fastq_fp = join(out_dir, "%s.fastq" % name)
        with open(fastq_fp, 'w') as fq:
            with File(demux_fp, 'r') as dx:
                for record in to_ascii(dx):
                    fq.write(record)

    if not fasta_fp:
        fasta_fp = join(out_dir, "%s.fasta" % name)
        with open(fasta_fp, 'w') as f:
            for r in load(fastq_fp):
                f.write(format_fasta_record(r['SequenceID'], r['Sequence'],
                                            r['Qual']))

    filepaths = [[[fastq_fp], 'preprocessed_fastq'],
                 [[fasta_fp], 'preprocessed_fasta'],
                 [[demux_fp], 'preprocessed_demux']]
    if log_fp:
        filepaths.append([[log_fp], 'log'])
    return True, [[None, 'Demultiplexed', filepaths]], ""