def test_view_artifcat_type():
    """Check ``itsxq._view_artifact_type`` on a valid and a broken data dir.

    ``_view_artifact_type`` is called without arguments, so the test changes
    into each data directory before calling it (presumably the function
    inspects the current working directory -- confirm against its
    implementation).  The original working directory is restored afterwards
    so the change does not leak into other tests.
    """
    original_cwd = os.getcwd()
    try:
        # Valid paired-end data: the reported type must mention the
        # paired-end semantic type.
        testFile = os.path.join(TEST_DIR, "test_data", "paired",
                                "445cf54a-bf06-4852-8010-13a60fa1598c",
                                "data")
        testData = SingleLanePerSamplePairedEndFastqDirFmt(testFile, "r")
        os.chdir(str(testData))
        exp1 = itsxq._view_artifact_type()
        if "SampleData[PairedEndSequencesWithQuality]" not in exp1:
            raise AssertionError()

        # Broken paired-end data: the call itself must raise ValueError.
        testFile2 = os.path.join(TEST_DIR, "test_data", "pairedbroken",
                                 "50d5f31a-a761-4c04-990c-e7668fe6bf00",
                                 "data")
        testData2 = SingleLanePerSamplePairedEndFastqDirFmt(testFile2, "r")
        os.chdir(str(testData2))
        # Hand over the callable, not its result: calling it inline would
        # raise before assert_raises gets a chance to catch the exception.
        assert_raises(ValueError, itsxq._view_artifact_type)
    finally:
        os.chdir(original_cwd)
def setUp(self):
    """Build the single-end and paired-end fixtures used by the tests.

    For each layout this stores a ``_PlotQualView`` of the demultiplexed
    data, its manifest viewed as a ``pd.DataFrame``, and three ``Metadata``
    objects (``all``, ``subset``, ``none``).
    """
    super().setUp()

    def _load_md(relative_path):
        # Small local shortcut: resolve a fixture path and load it.
        return Metadata.load(self.get_data_path(relative_path))

    # --- single-end fixtures -------------------------------------------
    single_fmt = SingleLanePerSampleSingleEndFastqDirFmt(
        self.get_data_path('filter_samples_single_end/dir_fmt'),
        mode='r')
    self.sample_single = _PlotQualView(single_fmt, False)
    self.manifest_single = single_fmt.manifest.view(pd.DataFrame)
    self.md_single_all = _load_md(
        'filter_samples_single_end/filter_all.tsv')
    self.md_single_subset = _load_md(
        'filter_samples_single_end/filter_subset.tsv')
    self.md_single_none = _load_md(
        'filter_samples_single_end/filter_none.tsv')

    # --- paired-end fixtures -------------------------------------------
    paired_fmt = SingleLanePerSamplePairedEndFastqDirFmt(
        self.get_data_path('filter_samples_paired_end/dir_fmt'),
        mode='r')
    self.sample_paired = _PlotQualView(paired_fmt, True)
    self.manifest_paired = paired_fmt.manifest.view(pd.DataFrame)
    # NOTE(review): the paired-end metadata is read from the single-end
    # fixture directory; confirm that sharing these TSVs is intentional.
    self.md_paired_all = _load_md(
        'filter_samples_single_end/filter_all.tsv')
    self.md_paired_subset = _load_md(
        'filter_samples_single_end/filter_subset.tsv')
    self.md_paired_none = _load_md(
        'filter_samples_single_end/filter_none.tsv')
def test_slanepsample_paired_end_fastq_dir_fmt_validate_missing_pair(self):
    """Validation must fail when only the R1 read file is present."""
    fixture_names = (
        'single_end_data/MANIFEST',
        'metadata.yml',
        'Human-Kneecap_S1_L001_R1_001.fastq.gz',
    )
    target_dir = self.temp_dir.name
    for name in fixture_names:
        shutil.copy(self.get_data_path(name), target_dir)

    dir_fmt = SingleLanePerSamplePairedEndFastqDirFmt(target_dir, mode='r')

    with self.assertRaisesRegex(ValidationError, 'paired'):
        dir_fmt.validate()
def test_slanepsample_paired_end_fastq_dir_fmt_validate_positive(self):
    """A directory with manifest, metadata, R1 and R2 files validates."""
    fixture_names = (
        'paired_end_data/MANIFEST',
        'metadata.yml',
        'Human-Kneecap_S1_L001_R1_001.fastq.gz',
        'paired_end_data/Human-Kneecap_S1_L001_R2_001.fastq.gz',
    )
    target_dir = self.temp_dir.name
    for name in fixture_names:
        shutil.copy(self.get_data_path(name), target_dir)

    dir_fmt = SingleLanePerSamplePairedEndFastqDirFmt(target_dir, mode='r')

    # Passing means validate() returns without raising.
    dir_fmt.validate()
def test_slanepsample_paired_end_fastq_dir_fmt_validate_negative(self):
    """Validation must raise ``ValueError`` for the ``not-fastq`` fixture."""
    fixture_names = (
        'paired_end_data/MANIFEST',
        'metadata.yml',
        'not-fastq.fastq.gz',
    )
    target_dir = self.temp_dir.name
    for name in fixture_names:
        shutil.copy(self.get_data_path(name), target_dir)

    dir_fmt = SingleLanePerSamplePairedEndFastqDirFmt(target_dir, mode='r')

    with self.assertRaisesRegex(ValueError, 'SingleLanePerSamplePaired'):
        dir_fmt.validate()
def test_fastq_id_maker():
    """Check the two values returned by ``itsxq._fastq_id_maker``.

    The first return value is de-duplicated through a ``set`` and the first
    two items of every entry are flattened into one list, which must equal
    the expected forward/reverse file names.  The second return value must
    compare equal to ``False``.
    """
    data_dir = os.path.join(TEST_DIR, "test_data", "paired",
                            "445cf54a-bf06-4852-8010-13a60fa1598c", "data")
    paired_fmt = SingleLanePerSamplePairedEndFastqDirFmt(data_dir, "r")
    type_name = "SampleData[PairedEndSequencesWithQuality]"

    id_pairs, flag = itsxq._fastq_id_maker(paired_fmt, type_name)

    flattened = [
        file_name
        for pair in set(id_pairs)
        for file_name in (pair[0], pair[1])
    ]
    expected = [
        '4774-1-MSITS3_0_L001_R1_001.fastq.gz',
        '4774-1-MSITS3_1_L001_R2_001.fastq.gz',
    ]
    if flattened != expected:
        raise AssertionError()

    if flag != False:
        raise AssertionError()
def setUp(self):
    """Open the ``demux-1`` fixture read-only as paired-end input."""
    super().setUp()
    fixture_dir = self.get_data_path('demux-1')
    self.input_seqs = SingleLanePerSamplePairedEndFastqDirFmt(
        fixture_dir, 'r')
def setUp(self):
    """Open the ``sample_seqs_paired`` fixture read-only for the tests."""
    super().setUp()
    fixture_dir = self.get_data_path('sample_seqs_paired')
    self.demux_seqs = SingleLanePerSamplePairedEndFastqDirFmt(
        fixture_dir, 'r')
def emp_paired(
        seqs: BarcodePairedSequenceFastqIterator,
        barcodes: qiime2.MetadataCategory,
        rev_comp_barcodes: bool = False,
        rev_comp_mapping_barcodes: bool = False
) -> SingleLanePerSamplePairedEndFastqDirFmt:
    """Demultiplex paired-end reads into per-sample gzipped FASTQ files.

    Each item of ``seqs`` is a ``(barcode, forward, reverse)`` record
    triple.  The barcode read (element 1 of the barcode record) is
    optionally reverse-complemented, truncated to the barcode length and
    looked up in the barcode map; reads whose barcode is not in the map are
    skipped.  Matching forward/reverse records are appended to that
    sample's pair of gzip files, and one ``forward`` and one ``reverse``
    manifest row are written the first time a sample is seen.

    Args:
        seqs: Iterable of (barcode, forward, reverse) record triples.
        barcodes: Metadata mapping samples to barcodes, handed to
            ``_make_barcode_map``.
        rev_comp_barcodes: Reverse-complement each barcode read before
            the lookup.
        rev_comp_mapping_barcodes: Passed through to
            ``_make_barcode_map``.

    Returns:
        The populated ``SingleLanePerSamplePairedEndFastqDirFmt``.

    Raises:
        ValueError: If no read could be assigned to any sample.
    """
    result = SingleLanePerSamplePairedEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    per_sample_fastqs = {}
    # try/finally guarantees that the manifest handle and every per-sample
    # gzip handle are closed even when iteration or the "nothing mapped"
    # check raises.
    try:
        manifest_fh.write('sample-id,filename,direction\n')

        for barcode_record, forward_record, reverse_record in seqs:
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            barcode_read = barcode_read[:barcode_len]

            try:
                sample_id = barcode_map[barcode_read]
            except KeyError:
                # TODO: this should ultimately be logged, but we don't
                # currently have support for that.
                continue

            if sample_id not in per_sample_fastqs:
                # First read for this sample: create its output files and
                # register them in the manifest.
                barcode_id = len(per_sample_fastqs) + 1
                fwd_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=1)
                rev_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=2)
                # Presumably closes some handles to stay under the open
                # file limit -- see _maintain_open_fh_count.
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                per_sample_fastqs[sample_id] = (
                    gzip.open(str(fwd_path), mode='a'),
                    gzip.open(str(rev_path), mode='a'))
                manifest_fh.write('%s,%s,%s\n' % (sample_id, fwd_path.name,
                                                  'forward'))
                manifest_fh.write('%s,%s,%s\n' % (sample_id, rev_path.name,
                                                  'reverse'))

            if per_sample_fastqs[sample_id][0].closed:
                # The handles for this sample were closed in the meantime;
                # reopen both in append mode so earlier reads are kept.
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                fwd, rev = per_sample_fastqs[sample_id]
                per_sample_fastqs[sample_id] = (
                    gzip.open(fwd.name, mode='a'),
                    gzip.open(rev.name, mode='a'))

            fwd, rev = per_sample_fastqs[sample_id]
            fwd.write(('\n'.join(forward_record) + '\n').encode('utf-8'))
            rev.write(('\n'.join(reverse_record) + '\n').encode('utf-8'))

        if not per_sample_fastqs:
            raise ValueError('No sequences were mapped to samples. Check '
                             'that your barcodes are in the correct '
                             'orientation (see the rev_comp_barcodes '
                             'and/or rev_comp_mapping_barcodes options).')
    finally:
        for fwd, rev in per_sample_fastqs.values():
            fwd.close()
            rev.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)
    _write_metadata_yaml(result)
    return result
SingleLanePerSampleSingleEndFastqDirFmt, SingleLanePerSamplePairedEndFastqDirFmt, FastqManifestFormat) import itsxpress._itsxpress as _itsxpress import qiime2 from qiime2.util import redirected_stdio import pandas as pd # The test data dir TEST_DIR = os.path.dirname(os.path.abspath(__file__)) # Test info 1 TEST_FILE = os.path.join(TEST_DIR, "test_data", "paired", "445cf54a-bf06-4852-8010-13a60fa1598c", "data") TEST_DATA = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE, "r") # Test info 2 TEST_FILE_PBMD = os.path.join(TEST_DIR, "test_data", "pairedBrokenMissingData", "50d5f31a-a761-4c04-990c-e7668fe6bf00", "data") TEST_DATA_PBMD = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_PBMD, "r") # Test info 3 TEST_FILE_PAF = os.path.join(TEST_DIR, "test_data", "pairedAllForward", "445cf54a-bf06-4852-8010-13a60fa1598c", "data") TEST_DATA_PAF = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_PAF, "r") # Test info 4 TEST_FILE_OUT = os.path.join(TEST_DIR, "test_data", "out", "d9955749-00d5-44ae-a628-4b2da43000e1", "data") TEST_DATA_OUT = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_OUT, "r") # Test info 5 TEST_FILE_SINGLEOUT = os.path.join(TEST_DIR, "test_data", "singleOut",
def emp_paired(
        seqs: BarcodePairedSequenceFastqIterator,
        barcodes: qiime2.CategoricalMetadataColumn,
        golay_error_correction: bool = True,
        rev_comp_barcodes: bool = False,
        rev_comp_mapping_barcodes: bool = False,
        ignore_description_mismatch: bool = False
) -> (SingleLanePerSamplePairedEndFastqDirFmt, ErrorCorrectionDetailsFmt):
    """Demultiplex paired-end reads, optionally Golay-correcting barcodes.

    Each item of ``seqs`` is a ``(barcode, forward, reverse)`` record
    triple.  The barcode read (element 1 of the barcode record) is
    optionally reverse-complemented and truncated to the barcode length.
    With ``golay_error_correction`` the truncated read is passed through a
    ``GolayDecoder`` and the decoded barcode is used for the lookup;
    otherwise the truncated read is used directly.  One error-correction
    detail row is written for every input record, whether or not it maps
    to a sample.  Records without a matching sample are then skipped; the
    rest are appended to that sample's pair of gzip files.

    Args:
        seqs: Iterable of (barcode, forward, reverse) record triples.  Its
            ``ignore_description_mismatch`` attribute is set from the
            argument of the same name.
        barcodes: Metadata column mapping samples to barcodes, handed to
            ``_make_barcode_map``.
        golay_error_correction: Decode barcodes with ``GolayDecoder``
            before the lookup.
        rev_comp_barcodes: Reverse-complement each barcode read before
            the lookup.
        rev_comp_mapping_barcodes: Passed through to
            ``_make_barcode_map``.
        ignore_description_mismatch: Forwarded to ``seqs`` (see above).

    Returns:
        A ``(result, ec_details_fmt)`` tuple: the populated paired-end
        directory format and the error-correction details format.

    Raises:
        ValueError: If no read could be assigned to any sample.
    """
    seqs.ignore_description_mismatch = ignore_description_mismatch
    result = SingleLanePerSamplePairedEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)
    if golay_error_correction:
        decoder = GolayDecoder()

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    per_sample_fastqs = {}
    # try/finally guarantees that the manifest handle and every per-sample
    # gzip handle are closed even when iteration or the "nothing mapped"
    # check raises.
    try:
        manifest_fh.write('sample-id,filename,direction\n')

        # NOTE(review): ec_details is never explicitly closed in this
        # function; confirm ECDetails flushes its output on its own.
        ec_details_fmt = ErrorCorrectionDetailsFmt()
        ec_details = ECDetails(ec_details_fmt)

        for i, record in enumerate(seqs, start=1):
            barcode_record, forward_record, reverse_record = record
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            raw_barcode_read = barcode_read[:barcode_len]

            if golay_error_correction:
                # A three bit filter is implicitly used by the decoder. See
                # Hamady and Knight 2009 Genome Research for the
                # justification:
                #
                # https://genome.cshlp.org/content/19/7/1141.full
                #
                # Specifically that "...Golay codes of 12 bases can correct
                # all triple-bit errors and detect all quadruple-bit
                # errors."
                barcode_read, ecc_errors = decoder.decode(raw_barcode_read)
                golay_stats = [barcode_read, ecc_errors]
            else:
                barcode_read = raw_barcode_read
                golay_stats = [None, None]

            sample_id = barcode_map.get(barcode_read)

            # Log every record, including ones that map to no sample
            # (sample_id is None in that case).
            ec_record = [
                f'record-{i}',
                sample_id,
                barcode_record[0],
                raw_barcode_read,
            ]
            ec_details.write(ec_record + golay_stats)

            if sample_id is None:
                continue

            if sample_id not in per_sample_fastqs:
                # First read for this sample: create its output files and
                # register them in the manifest.
                barcode_id = len(per_sample_fastqs) + 1
                fwd_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=1)
                rev_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=2)
                # Presumably closes some handles to stay under the open
                # file limit -- see _maintain_open_fh_count.
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                per_sample_fastqs[sample_id] = (
                    gzip.open(str(fwd_path), mode='a'),
                    gzip.open(str(rev_path), mode='a'))
                manifest_fh.write('%s,%s,%s\n' % (sample_id, fwd_path.name,
                                                  'forward'))
                manifest_fh.write('%s,%s,%s\n' % (sample_id, rev_path.name,
                                                  'reverse'))

            if per_sample_fastqs[sample_id][0].closed:
                # The handles for this sample were closed in the meantime;
                # reopen both in append mode so earlier reads are kept.
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                fwd, rev = per_sample_fastqs[sample_id]
                per_sample_fastqs[sample_id] = (
                    gzip.open(fwd.name, mode='a'),
                    gzip.open(rev.name, mode='a'))

            fwd, rev = per_sample_fastqs[sample_id]
            fwd.write(('\n'.join(forward_record) + '\n').encode('utf-8'))
            rev.write(('\n'.join(reverse_record) + '\n').encode('utf-8'))

        if not per_sample_fastqs:
            raise ValueError('No sequences were mapped to samples. Check '
                             'that your barcodes are in the correct '
                             'orientation (see the rev_comp_barcodes '
                             'and/or rev_comp_mapping_barcodes options). '
                             'If barcodes are NOT Golay format set '
                             'golay_error_correction to False.')
    finally:
        for fwd, rev in per_sample_fastqs.values():
            fwd.close()
            rev.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)
    _write_metadata_yaml(result)
    return result, ec_details_fmt
def main(per_sample_sequences: _SingleLanePerSampleFastqDirFmt,
         threads: int,
         taxa: str,
         region: str,
         paired: bool,
         cluster_id: float):
    """The main communication between the plugin and the ITSxpress program.

    For every sample the reads are prepared, deduplicated or clustered,
    searched with HMMER, and the trimmed reads are written into a new
    per-sample FASTQ directory together with a manifest.

    Args:
        per_sample_sequences (_SingleLanePerSampleFastqDirFmt): The
            per-sample FASTQ directory format of the input.
        threads (int): The number of threads to use.
        taxa (str): The taxa to be used for the search.
        region (str): The region to be used for the search.
        paired (bool): If True, write forward and reverse trimmed reads
            into a SingleLanePerSamplePairedEndFastqDirFmt; otherwise
            write a single trimmed file per sample into a
            SingleLanePerSampleSingleEndFastqDirFmt.
        cluster_id (float): The percent identity for clustering reads,
            set to 1 for exact dereplication.

    Returns:
        The populated output directory format (paired-end or single-end,
        depending on ``paired``).

    Raises:
        ValueError: If the HMM search cannot be started (hmmsearch or its
            files were not found).
    """
    # Finding the artifact type.
    artifact_type = _view_artifact_type(
        per_sample_sequence=per_sample_sequences)
    # Setting the taxa
    taxa = _taxa_prefix_to_taxa(taxa)

    # Writing the manifest for the output qza
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    # try/finally so the manifest handle is closed even if a sample fails.
    try:
        manifest_fn.write('sample-id,filename,direction\n')
        # Getting the sequences from the manifest
        sequences, single_end = _fastq_id_maker(
            per_sample_sequences=per_sample_sequences,
            artifact_type=artifact_type)

        # Creating result dir
        if paired:
            results = SingleLanePerSamplePairedEndFastqDirFmt()
        else:
            results = SingleLanePerSampleSingleEndFastqDirFmt()

        # Running the for loop for each sample; the running index doubles
        # as the barcode id in the output file names.
        for barcode, sequence in enumerate(sequences):
            # Writing fastqs and their attributes and checking the files
            sequence_id, sobj = _set_fastqs_and_check(
                per_sample_sequences=per_sample_sequences,
                artifact_type=artifact_type,
                sequence=sequence,
                single_end=single_end,
                threads=threads)

            # Exact dereplication when cluster_id is (numerically) 1,
            # otherwise cluster at the requested identity.
            if math.isclose(cluster_id, 1, rel_tol=1e-05):
                sobj.deduplicate(threads=threads)
            else:
                sobj.cluster(threads=threads, cluster_id=cluster_id)

            # HMMSearch for ITS regions
            hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs",
                                   taxa_dict[taxa])
            try:
                sobj._search(hmmfile=hmmfile, threads=threads)
            except (ModuleNotFoundError,
                    FileNotFoundError,
                    NotADirectoryError) as err:
                # Keep the original exception as the cause for debugging.
                raise ValueError(
                    "hmmsearch was not found, make sure HMMER3 is installed and executable"
                ) from err

            # Parse HMMsearch output.
            its_pos = itsxpress.ItsPosition(domtable=sobj.dom_file,
                                            region=region)
            # Create deduplication object.
            dedup_obj = itsxpress.Dedup(uc_file=sobj.uc_file,
                                        rep_file=sobj.rep_file,
                                        seq_file=sobj.seq_file,
                                        fastq=sobj.r1,
                                        fastq2=sobj.fastq2)

            path_forward = results.sequences.path_maker(
                sample_id=sequence_id,
                barcode_id=barcode,
                lane_number=1,
                read_number=1)
            manifest_fn.write(
                "{},{},forward\n".format(sequence_id, path_forward.name))

            # Create trimmed sequences.
            if paired:
                path_reverse = results.sequences.path_maker(
                    sample_id=sequence_id,
                    barcode_id=barcode,
                    lane_number=1,
                    read_number=2)
                # The reverse file is written below, so it needs its own
                # manifest row alongside the forward one.
                manifest_fn.write(
                    "{},{},reverse\n".format(sequence_id,
                                             path_reverse.name))
                dedup_obj.create_paired_trimmed_seqs(str(path_forward),
                                                     str(path_reverse),
                                                     gzipped=True,
                                                     itspos=its_pos)
            else:
                dedup_obj.create_trimmed_seqs(str(path_forward),
                                              gzipped=True,
                                              itspos=its_pos)
            # Deleting the temp files.
            shutil.rmtree(sobj.tempdir)
    finally:
        manifest_fn.close()

    # Writing out the results.
    _write_metadata(results=results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    return results
def setUp(self):
    """Open the ``paired-end`` fixture and create an empty output format."""
    super().setUp()
    fixture_dir = self.get_data_path('paired-end')
    self.demux_seqs = SingleLanePerSamplePairedEndFastqDirFmt(
        fixture_dir, mode='r')
    self.trimmed_seqs = CasavaOneEightSingleLanePerSampleDirFmt()