Ejemplo n.º 1
0
    def test_illuminafastq(self):
        # Table of examples. Each entry holds: the first header line of the
        # fastq, the file path assigned to the handle, and the expected
        # machine type, date, run name and folder_info dictionary.
        cases = [
            (
                u"@M03543:47:C8LJ2ANXX:1:2209:1084:2044 1:N:0:NNNNNNNN+NNNNNNNN",
                "Miseq/160511_M03543_0047_000000000-APE6Y/Data/Intensities/"
                "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz",
                "Illumina-MiSeq",
                "2016-05-11",
                "160511_M03543_0047_000000000-APE6Y",
                {
                    "date": "160511",
                    "instrument": "M03543",
                    "run_number": "47",
                    "flowcell_id": "000000000-APE6Y",
                    "lane": "1",
                    "read_or_index": "R",
                    "read": "1",
                },
            ),
            (
                u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG",
                "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/"
                "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz",
                "Illumina-HiSeq",
                "2017-03-30",
                "170330_D00727_0027_ACA7HHANXX",
                {
                    "date": "170330",
                    "instrument": "D00727",
                    "run_number": "27",
                    "flowcell_id": "CA7HHANXX",
                    "lane": "1",
                    "read_or_index": "R",
                    "read": "1",
                },
            ),
        ]

        for header, path, machine, date, run_name, expected_info in cases:
            # An in-memory handle stands in for the real file; the path is
            # attached through the ``name`` attribute.
            handle = StringIO(header)
            handle.name = path
            parsed = IlluminaFastq(handle)

            self.assertEqual(parsed.machine_type, machine)
            self.assertEqual(parsed.date, date)
            self.assertEqual(parsed.lane, "1")
            self.assertEqual(parsed.filepath, path)
            self.assertEqual(parsed.run_name, run_name)

            self.assertDictEqual(parsed.folder_info, expected_info)
Ejemplo n.º 2
0
 def test_check_file_size(self):
     # The fixture stored next to this test module must pass the size check
     # with a threshold of 50 and fail it with a threshold of 50000.
     fixture_dir = os.path.dirname(os.path.abspath(__file__))
     fastq_filepath = os.path.join(
         fixture_dir,
         "170323_M04734_0028_000000000-B2MVT/Undetermined_S0_L001_R1_001.fastq.gz"
     )
     # Open through a context manager so the gzip handle is closed even when
     # one of the assertions fails (the handle used to be leaked).
     with gzip.open(fastq_filepath, mode='rt') as fastq_handle:
         fq = IlluminaFastq(fastq_handle)
         self.assertTrue(fq.check_file_size(50))
         self.assertFalse(fq.check_file_size(50000))
Ejemplo n.º 3
0
    def test_backup_fastq(self):
        # Run a full backup into the temporary output directory, then verify
        # the copied fastq, the copied sample sheet and the file permissions.
        has_index = True
        min_file_size = 5
        backup_fastq(self.fastq_filepath, self.temp_out_dir,
                     self.sample_sheet_fp, has_index, min_file_size)

        # Work out the archive folder name; the handle is closed as soon as
        # the name has been built instead of being left open.
        with gzip.GzipFile(self.fastq_filepath) as fastq_handle:
            archive_dir = IlluminaFastq(fastq_handle).build_archive_dir()
        archive_path = os.path.join(self.temp_out_dir, archive_dir)

        # check the md5sums of the first fastq is the same
        out_fp = os.path.join(archive_path,
                              os.path.basename(self.fastq_filepath))
        md5_orj = return_md5(self.fastq_filepath)
        md5_trans = return_md5(out_fp)
        self.assertEqual(md5_orj, md5_trans)

        # check the md5sum of the sample sheet
        ss_fp = os.path.join(archive_path,
                             os.path.basename(self.sample_sheet_fp))
        self.assertEqual(return_md5(ss_fp), "92ef1ca7433cadb5267d822615cd15e2")

        # check write permissions of the files:
        # 0o100444 (== 33060) is a regular file that is read-only for user,
        # group and others, i.e. the mode backup_fastq applies with chmod.
        self.assertEqual(os.stat(out_fp).st_mode, 0o100444)
Ejemplo n.º 4
0
    def test_build_archive_dir(self):
        # One example per instrument: (header line, file path, expected
        # archive directory name).
        examples = (
            # for MiSeq
            (
                u"@M03543:47:C8LJ2ANXX:1:2209:1084:2044 1:N:0:NNNNNNNN+NNNNNNNN",
                "Miseq/160511_M03543_0047_000000000-APE6Y/Data/Intensities/"
                "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz",
                "160511_M03543_0047_000000000-APE6Y_L001",
            ),
            # for HiSeq
            (
                u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG",
                "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/"
                "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz",
                "170330_D00727_0027_ACA7HHANXX_L001",
            ),
        )

        for header, path, expected_dir in examples:
            handle = StringIO(header)
            handle.name = path
            parsed = IlluminaFastq(handle)
            self.assertEqual(parsed.build_archive_dir(), expected_dir)
Ejemplo n.º 5
0
 def test_check_index_read_exists(self):
     # Both examples use the same file path; only the header line differs.
     shared_path = (
         "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/"
         "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")

     examples = (
         # test passing
         (u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
          self.assertTrue),
         # test failing
         (u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:0",
          self.assertFalse),
     )

     for header, expectation in examples:
         handle = StringIO(header)
         handle.name = shared_path
         parsed = IlluminaFastq(handle)
         expectation(parsed.check_index_read_exists())
Ejemplo n.º 6
0
    def test_is_same_run(self):
        # Two objects built from the very same MiSeq handle must be reported
        # as the same run; an object built from a HiSeq example must not.
        miseq_handle = StringIO(
            u"@M03543:47:C8LJ2ANXX:1:2209:1084:2044 1:N:0:NNNNNNNN+NNNNNNNN")
        miseq_handle.name = (
            "Miseq/160511_M03543_0047_000000000-APE6Y/Data/Intensities/"
            "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")
        reference = IlluminaFastq(miseq_handle)
        # Rewind before handing the same handle to a second object.
        miseq_handle.seek(0)
        same_run = IlluminaFastq(miseq_handle)

        hiseq_handle = StringIO(
            u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG")
        hiseq_handle.name = (
            "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/"
            "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")
        other_run = IlluminaFastq(hiseq_handle)

        self.assertTrue(reference.is_same_run(same_run))
        self.assertFalse(reference.is_same_run(other_run))
Ejemplo n.º 7
0
    def test_fp_vs_content(self):
        # File paths used by the examples below.
        miseq_r1 = (
            "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/"
            "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")
        miseq_r2 = (
            "Miseq/170323_M04734_0028_000000000-B2MVT/Data/Intensities/"
            "BaseCalls/Undetermined_S0_L001_R2_001.fastq.gz")
        hiseq_r1 = (
            "Hiseq/170330_D00727_0027_ACA7HHANXX/Data/Intensities/"
            "BaseCalls/Undetermined_S0_L001_R1_001.fastq.gz")

        # Each example: (header line, file path, whether the check should pass).
        examples = [
            # check correct case for Miseq data
            (u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
             miseq_r1, True),
            # check correct case for Hiseq data
            (u"@D00727:27:CA7HHANXX:1:1105:1243:1992 1:N:0:NGATCAGT+NNAAGGAG",
             hiseq_r1, True),
            # case when the lane number doesn't match
            (u"@M04734:28:000000000-B2MVT:3:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
             miseq_r1, False),
            # case when the flow cell ID doesn't match
            (u"@M04734:28:000000000-BBBBB:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
             miseq_r1, False),
            # case when the machine doesn't match
            (u"@D04734:28:000000000-BBBBB:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
             miseq_r1, False),
            # case when the read doesn't match
            ### important: It won't distinguish between R1 and I1.
            (u"@M04734:28:000000000-B2MVT:1:2106:17605:1940 1:N:0:TTTTTTTTTTTT+TCTTTCCCTACA",
             miseq_r2, False),
        ]

        for header, path, should_match in examples:
            handle = StringIO(header)
            handle.name = path
            parsed = IlluminaFastq(handle)
            if should_match:
                self.assertTrue(parsed.check_fp_vs_content())
            else:
                self.assertFalse(parsed.check_fp_vs_content())
Ejemplo n.º 8
0
def backup_fastq(forward_reads, dest_dir, sample_sheet_fp, has_index,
                 min_file_size):
    """Validate the fastq files of a run and copy them to a new archive folder.

    The list of files to archive is obtained from ``build_fp_to_archive``
    using ``forward_reads``, ``has_index`` and the lane read from the forward
    reads file. Every file is checked before anything is written; then the
    files and the sample sheet are copied into a freshly created folder under
    ``dest_dir`` and a ``<archive folder name>.md5`` file with one
    ``name<TAB>md5`` line per fastq is written next to them.

    Args:
        forward_reads: Path of the gzip-compressed forward reads fastq file.
        dest_dir: Existing directory in which the archive folder is created.
        sample_sheet_fp: Path of the sample sheet to copy alongside the reads.
        has_index: Forwarded unchanged to ``build_fp_to_archive``.
        min_file_size: Threshold forwarded to ``IlluminaFastq.check_file_size``.

    Raises:
        ValueError: If there are no files to archive, a file fails the
            path-vs-header check or the size check, or the files are not all
            from the same run.
        IOError: If the sample sheet does not exist or the archive folder
            already exists.

    Warns:
        UserWarning: If a file fails ``check_index_read_exists``.
    """
    # Every gzip handle opened for validation is tracked here so that all of
    # them are closed again, whether validation succeeds or raises.
    open_handles = []
    try:
        r1_handle = gzip.GzipFile(forward_reads)
        open_handles.append(r1_handle)
        R1 = IlluminaFastq(r1_handle)

        # build the strings for the required files
        file_names_RI = build_fp_to_archive(forward_reads, has_index, R1.lane)

        # create the Illumina objects and check the files
        illumina_fastqs = []
        for fp in file_names_RI:
            handle = gzip.GzipFile(fp)
            open_handles.append(handle)
            illumina_temp = IlluminaFastq(handle)
            if not illumina_temp.check_fp_vs_content():
                raise ValueError(
                    "The file path and header infromation don't match")
            if not illumina_temp.check_file_size(min_file_size):
                raise ValueError(
                    "File {0} seems suspiciously small. Plese check if you have the correct file or lower the minimum file size threshold"
                    .format(fp))
            if not illumina_temp.check_index_read_exists():
                warnings.warn(
                    "No barcodes in headers. Were the fastq files generated properly?: {0}"
                    .format(fp))
            illumina_fastqs.append(illumina_temp)

        # Without this guard an empty file list would surface later as a
        # NameError on the loop variable instead of a meaningful error.
        if not illumina_fastqs:
            raise ValueError(
                "No fastq files to archive for: {0}".format(forward_reads))

        # parse the info from the headers in EACH file and check they are
        # consistent within each other
        first_fastq = illumina_fastqs[0]
        if not all(fastq.is_same_run(first_fastq)
                   for fastq in illumina_fastqs):
            raise ValueError("The files are not from the same run.")

        ## Archiving steps

        # make sure the sample sheet exists
        if not os.path.isfile(sample_sheet_fp):
            raise IOError(
                "Sample sheet does not exist: {}".format(sample_sheet_fp))

        # The archive folder name comes from the last validated file, as
        # before; it is computed once and reused for the md5 file name.
        archive_name = illumina_fastqs[-1].build_archive_dir()
    finally:
        for handle in open_handles:
            handle.close()

    # create the folder to write to. If it exists exit
    write_dir = os.path.join(dest_dir, archive_name)
    if os.path.isdir(write_dir):
        raise IOError("The folder already exists: {}".format(write_dir))
    os.mkdir(write_dir)

    ### All the checks are done and the files are safe to archive!

    # move the files to the archive location and remove permission
    permission = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
    for fp in file_names_RI:
        archived_fp = os.path.join(write_dir, os.path.basename(fp))
        shutil.copyfile(fp, archived_fp)
        os.chmod(archived_fp, permission)  #this doesn't work on isilon

    # copy the sample sheet to destination folder
    shutil.copyfile(sample_sheet_fp,
                    os.path.join(write_dir, os.path.basename(sample_sheet_fp)))

    # write md5sums (computed from the source files) to a file
    md5s = [(os.path.basename(fp), return_md5(fp)) for fp in file_names_RI]
    md5out_fp = os.path.join(write_dir, ".".join([archive_name, "md5"]))
    with open(md5out_fp, "w") as md5_out:
        md5_out.writelines("\t".join(md5) + "\n" for md5 in md5s)