def test_illuminafastq(self):
    """Check the attributes IlluminaFastq exposes for a MiSeq and a HiSeq file.

    Each case pairs a fastq header line with the file path assigned to the
    handle's ``name`` and lists the values the resulting object must report.
    """
    basecalls_file = "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz"
    # (header, file path, machine type, date, run name, folder info)
    cases = [
        (
            u"@M03543:47:C8LJ2ANXX:1:2209:1084:2044 1:N:0:NNNNNNNN+NNNNNNNN",
            "Miseq/160511_M03543_0047_000000000-APE6Y/Data/Intensities/"
            + basecalls_file,
            "Illumina-MiSeq",
            "2016-05-11",
            "160511_M03543_0047_000000000-APE6Y",
            {
                "date": "160511",
                "instrument": "M03543",
                "run_number": "47",
                "flowcell_id": "000000000-APE6Y",
                "lane": "1",
                "read_or_index": "R",
                "read": "1",
            },
        ),
        (
            u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG",
            "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/"
            + basecalls_file,
            "Illumina-HiSeq",
            "2017-03-30",
            "170330_D00727_0027_ACA7HHANXX",
            {
                "date": "170330",
                "instrument": "D00727",
                "run_number": "27",
                "flowcell_id": "CA7HHANXX",
                "lane": "1",
                "read_or_index": "R",
                "read": "1",
            },
        ),
    ]

    for header, path, machine, run_date, run_name, expected_info in cases:
        # An in-memory handle stands in for the real file; only its
        # contents and its ``name`` attribute are set here.
        handle = StringIO(header)
        handle.name = path
        parsed = IlluminaFastq(handle)

        self.assertEqual(parsed.machine_type, machine)
        self.assertEqual(parsed.date, run_date)
        self.assertEqual(parsed.lane, "1")
        self.assertEqual(parsed.filepath, path)
        self.assertEqual(parsed.run_name, run_name)
        self.assertDictEqual(parsed.folder_info, expected_info)
def test_check_file_size(self):
    """Check ``check_file_size`` against a small and a large threshold.

    Uses the gzipped fastq fixture stored next to this test module. The
    fixture must pass a threshold of 50 and fail a threshold of 50000.
    """
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    fastq_filepath = os.path.join(
        curr_dir,
        "170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R1_001.fastq.gz"
    )
    # The context manager closes the gzip handle even when an assertion
    # fails; the assertions stay inside it so the handle is still open
    # whenever the IlluminaFastq object is used.
    with gzip.open(fastq_filepath, mode='rt') as fastq_file:
        fq = IlluminaFastq(fastq_file)
        self.assertTrue(fq.check_file_size(50))
        self.assertFalse(fq.check_file_size(50000))
def test_backup_fastq(self):
    """Run ``backup_fastq`` end to end and inspect the archived files.

    Verifies that the archived fastq has the same md5 as its source, that
    the archived sample sheet has the expected md5, and that the archived
    fastq has read-only permissions.
    """
    has_index = True
    min_file_size = 5
    backup_fastq(self.fastq_filepath, self.temp_out_dir,
                 self.sample_sheet_fp, has_index, min_file_size)

    # Re-open the source fastq only to learn the archive folder name; the
    # context manager makes sure the gzip handle is closed afterwards.
    with gzip.GzipFile(self.fastq_filepath) as fastq_file:
        fq = IlluminaFastq(fastq_file)
        archive_dir = os.path.join(self.temp_out_dir, fq.build_archive_dir())

    # check the md5sums of the first fastq is the same
    out_fp = os.path.join(archive_dir, os.path.basename(self.fastq_filepath))
    md5_orj = return_md5(self.fastq_filepath)
    md5_trans = return_md5(out_fp)
    self.assertEqual(md5_orj, md5_trans)

    # check the md5sum of the sample sheet
    ss_fp = os.path.join(archive_dir, os.path.basename(self.sample_sheet_fp))
    self.assertEqual(return_md5(ss_fp), "92ef1ca7433cadb5267d822615cd15e2")

    # check write permissions of the files
    # 0o100444 == 33060: a regular file readable by user, group and others
    # with no write or execute bits.
    self.assertEqual(os.stat(out_fp).st_mode, 0o100444)
def test_build_archive_dir(self):
    """Check the archive folder name built for a MiSeq and a HiSeq file."""

    def archive_dir_for(header, path):
        # Build an IlluminaFastq from an in-memory header line whose
        # handle is given ``path`` as its ``name``.
        handle = StringIO(header)
        handle.name = path
        return IlluminaFastq(handle).build_archive_dir()

    # (header, file path, expected archive folder name)
    cases = (
        # for MiSeq
        (
            u"@M03543:47:C8LJ2ANXX:1:2209:1084:2044 1:N:0:NNNNNNNN+NNNNNNNN",
            "Miseq/160511_M03543_0047_000000000-APE6Y/Data/Intensities/"
            "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz",
            "160511_M03543_0047_000000000-APE6Y_L001",
        ),
        # for HiSeq
        (
            u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG",
            "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/"
            "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz",
            "170330_D00727_0027_ACA7HHANXX_L001",
        ),
    )

    for header, path, expected_dir in cases:
        self.assertEqual(archive_dir_for(header, path), expected_dir)
def test_check_index_read_exists(self):
    """Check ``check_index_read_exists`` for both outcomes.

    The two headers differ only in their final field: the first ends in a
    pair of index sequences and must pass, the second ends in ``0`` and
    must fail.
    """
    path = (
        "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/"
        "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")
    # (header, whether an index read is expected to be reported)
    cases = (
        # test passing
        (u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
         True),
        # test failing
        (u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:0",
         False),
    )

    for header, has_index_read in cases:
        handle = StringIO(header)
        handle.name = path
        parsed = IlluminaFastq(handle)
        # Use the same assertion methods as a hand-written pair of checks.
        verify = self.assertTrue if has_index_read else self.assertFalse
        verify(parsed.check_index_read_exists())
def test_is_same_run(self):
    """Check ``is_same_run`` for a matching and a non-matching file."""
    # Two objects are built from the same MiSeq handle; the handle is
    # rewound in between so the second one sees the header again.
    miseq_handle = StringIO(
        u"@M03543:47:C8LJ2ANXX:1:2209:1084:2044 1:N:0:NNNNNNNN+NNNNNNNN")
    miseq_handle.name = (
        "Miseq/160511_M03543_0047_000000000-APE6Y/Data/Intensities/"
        "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")
    reference = IlluminaFastq(miseq_handle)
    miseq_handle.seek(0)
    same_run = IlluminaFastq(miseq_handle)

    # A HiSeq file with a different run folder serves as the mismatch.
    hiseq_handle = StringIO(
        u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG")
    hiseq_handle.name = (
        "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/"
        "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")
    other_run = IlluminaFastq(hiseq_handle)

    self.assertTrue(reference.is_same_run(same_run))
    self.assertFalse(reference.is_same_run(other_run))
def test_fp_vs_content(self):
    """Check ``check_fp_vs_content`` on matching and mismatching inputs.

    Every case pairs a fastq header line with the file path given to the
    handle's ``name`` and states whether the check is expected to pass.
    """
    miseq_r1 = (
        "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/"
        "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")
    miseq_r2 = (
        "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/"
        "BaseCalls/Undetermined_S0_L001_R2_001.fastq.gz")
    hiseq_r1 = (
        "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/"
        "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")

    # (header, file path, expected result)
    cases = [
        # check correct case for Miseq data
        (u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
         miseq_r1, True),
        # check correct case for Hiseq data
        (u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG",
         hiseq_r1, True),
        # case when the lane number doesn't match
        (u"@M04734:28:000000000-B2MVT:3:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
         miseq_r1, False),
        # case when the flow cell ID doesn't match
        (u"@M04734:28:000000000-BBBBB:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
         miseq_r1, False),
        # case when the machine doesn't match
        # (this header also carries the mismatching flow cell ID)
        (u"@D04734:28:000000000-BBBBB:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
         miseq_r1, False),
        # case when the read doesn't match
        # important: It won't distinguish between R1 and I1.
        (u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
         miseq_r2, False),
    ]

    for header, path, should_match in cases:
        handle = StringIO(header)
        handle.name = path
        parsed = IlluminaFastq(handle)
        # Use the same assertion methods as a hand-written list of checks.
        verify = self.assertTrue if should_match else self.assertFalse
        verify(parsed.check_fp_vs_content())
def backup_fastq(forward_reads, dest_dir, sample_sheet_fp, has_index, min_file_size):
    """Validate a set of fastq files and copy them into a new archive folder.

    The list of files to archive is produced by ``build_fp_to_archive`` from
    the forward read path, ``has_index`` and the lane read from the forward
    read. Each file is checked before anything is written. On success a new
    folder named by ``build_archive_dir()`` is created under ``dest_dir``
    holding read-only copies of the fastq files, a copy of the sample sheet
    and a ``<archive dir>.md5`` file with one ``name<TAB>md5`` line per
    fastq file.

    Args:
        forward_reads: Path to the gzipped forward-read fastq file.
        dest_dir: Existing directory in which the archive folder is created.
        sample_sheet_fp: Path to the sample sheet to copy into the archive.
        has_index: Forwarded to ``build_fp_to_archive``.
        min_file_size: Threshold handed to ``check_file_size`` for each file.

    Raises:
        ValueError: If there are no files to archive, a file's path and
            header disagree, a file fails the size check, or the files are
            not all from the same run.
        IOError: If the sample sheet is missing or the archive folder
            already exists.

    Warns:
        UserWarning: If a file's headers carry no barcodes.
    """
    # The forward read is opened only to learn the lane, so its handle is
    # released as soon as the lane has been read.
    with gzip.GzipFile(forward_reads) as forward_handle:
        lane = IlluminaFastq(forward_handle).lane

    # build the strings for the required files
    file_names_RI = build_fp_to_archive(forward_reads, has_index, lane)

    # Handles stay open until the very end because the IlluminaFastq
    # objects built on them are used throughout; the ``finally`` below
    # guarantees they are closed on every exit path.
    open_handles = []
    try:
        # create the Illumina objects and check the files
        illumina_fastqs = []
        for fp in file_names_RI:
            handle = gzip.GzipFile(fp)
            open_handles.append(handle)
            illumina_temp = IlluminaFastq(handle)
            if not illumina_temp.check_fp_vs_content():
                raise ValueError(
                    "The file path and header infromation don't match")
            if not illumina_temp.check_file_size(min_file_size):
                raise ValueError(
                    "File {0} seems suspiciously small. Plese check if you have the correct file or lower the minimum file size threshold"
                    .format(fp))
            if not illumina_temp.check_index_read_exists():
                warnings.warn(
                    "No barcodes in headers. Were the fastq files generated properly?: {0}"
                    .format(fp))
            illumina_fastqs.append(illumina_temp)

        # Without this guard an empty file list would surface later as an
        # unbound-variable error instead of a meaningful message.
        if not illumina_fastqs:
            raise ValueError("No fastq files were found to archive.")

        # parse the info from the headers in EACH file and check they are
        # consistent within each other
        if not all(fastq.is_same_run(illumina_fastqs[0])
                   for fastq in illumina_fastqs):
            raise ValueError("The files are not from the same run.")

        ## Archiving steps

        # make sure the sample sheet exists
        if not os.path.isfile(sample_sheet_fp):
            raise IOError(
                "Sample sheet does not exist: {}".format(sample_sheet_fp))

        # The archive folder is named after the last validated file.
        archive_dir_name = illumina_fastqs[-1].build_archive_dir()
        write_dir = os.path.join(dest_dir, archive_dir_name)

        # create the folder. If it exists exit
        if os.path.isdir(write_dir):
            raise IOError("The folder already exists: {}".format(write_dir))
        os.mkdir(write_dir)

        ### All the checks are done and the files are safe to archive!

        # move the files to the archive location and remove permission
        permission = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
        for fp in file_names_RI:
            archived_fp = os.path.join(write_dir, os.path.basename(fp))
            shutil.copyfile(fp, archived_fp)
            os.chmod(archived_fp, permission)  # this doesn't work on isilon

        # copy the sample sheet to destination folder
        shutil.copyfile(
            sample_sheet_fp,
            os.path.join(write_dir, os.path.basename(sample_sheet_fp)))

        # write md5sums to a file
        md5s = [(os.path.basename(fp), return_md5(fp)) for fp in file_names_RI]
        md5out_fp = os.path.join(
            write_dir, ".".join([archive_dir_name, "md5"]))
        with open(md5out_fp, "w") as md5_out:
            for md5 in md5s:
                md5_out.write("\t".join(md5) + "\n")
    finally:
        for handle in open_handles:
            handle.close()