def test_multiplex_deplex_num_reads_tsv(expected_fixture, dir_tmp,
                                        multiplex_name):
    """
    Check that the
    :py:const:`riboviz.demultiplex_fastq.NUM_READS_FILE` file
    produced by :py:const:`riboviz.tools.demultiplex_fastq` matches
    the expected file. The comparison is delegated to
    :py:func:`riboviz.utils.equal_tsv`, called with
    ``ignore_row_order=True`` and ``na_to_empty_str=True``.

    Skipped by ``pytest`` automatically if ``multiplex_name``
    fixture is not injected.

    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param dir_tmp: Temporary directory
    :type dir_tmp: str or unicode
    :param multiplex_name: Multiplexed FASTQ file name prefix
    :type multiplex_name: str or unicode
    """
    deplex_subdir = workflow_files.DEPLEX_DIR_FORMAT.format(multiplex_name)
    # The expected data mirrors the temporary directory under its
    # final path component only.
    tmp_leaf = os.path.basename(os.path.normpath(dir_tmp))
    expected_file = os.path.join(expected_fixture,
                                 tmp_leaf,
                                 deplex_subdir,
                                 demultiplex_fastq.NUM_READS_FILE)
    actual_file = os.path.join(dir_tmp,
                               deplex_subdir,
                               demultiplex_fastq.NUM_READS_FILE)
    # Override default TSV comparisons as some columns have string
    # values.
    utils.equal_tsv(expected_file,
                    actual_file,
                    ignore_row_order=True,
                    na_to_empty_str=True)
def test_demultiplex_gz(tmp_dir, file_format):
    """
    Test :py:func:`riboviz.demultiplex_fastq.demultiplex` using
    GZIPped FASTQ files.

    The simulated multiplexed FASTQ file is GZIP-compressed into
    ``tmp_dir``, demultiplexed, and then both the number-of-reads
    summary and each demultiplexed file (after decompression) are
    compared to the simulated expected data.

    Each ``file_format`` consists of a FASTQ GZIP file name format
    and the corresponding non-GZIP FASTQ file name format.

    :param tmp_dir: Temporary directory
    :type tmp_dir: str or unicode
    :param file_format: File name format
    :type file_format: tuple(str or unicode, str or unicode)
    """
    gz_fmt, fmt = file_format
    multiplexed_gz = os.path.join(tmp_dir, gz_fmt.format("test_multiplex"))
    multiplexed_src = os.path.join(riboviz.test.SIMDATA_DIR,
                                   "multiplex.fastq")
    # Compress the simulated multiplexed file to use as test input.
    with open(multiplexed_src, "rb") as source, \
            gzip.open(multiplexed_gz, "wb") as sink:
        shutil.copyfileobj(source, sink)

    demultiplex_fastq.demultiplex(
        os.path.join(riboviz.test.SIMDATA_DIR, "multiplex_barcodes.tsv"),
        multiplexed_gz,
        mismatches=2,
        out_dir=tmp_dir)

    expected_dir = os.path.join(riboviz.test.SIMDATA_DIR, "deplex")
    utils.equal_tsv(
        os.path.join(expected_dir, demultiplex_fastq.NUM_READS_FILE),
        os.path.join(tmp_dir, demultiplex_fastq.NUM_READS_FILE))

    # Actual data has extension matching lower-case version
    # of multiplexed file's extension.
    gz_fmt_lower = gz_fmt.lower()
    for tag in ("Tag0", "Tag1", "Tag2", "Unassigned"):
        compressed_output = os.path.join(tmp_dir, gz_fmt_lower.format(tag))
        plain_output = os.path.join(tmp_dir, fmt.format(tag))
        # Simulated data always has a .fastq extension.
        expected_output = os.path.join(expected_dir,
                                       fastq.FASTQ_FORMAT.format(tag))
        # Decompress the actual output before comparing it.
        with gzip.open(compressed_output, "rb") as source, \
                open(plain_output, "wb") as sink:
            shutil.copyfileobj(source, sink)
        fastq.equal_fastq(expected_output, plain_output)

    # The definition of the simulated data means that Tag3 has no
    # matches, as Tag0|1|2 will match any barcodes first. Check
    # there is no Tag3-related output file.
    tag3_output = os.path.join(tmp_dir, gz_fmt_lower.format("Tag3"))
    assert not os.path.exists(tag3_output)
def compare_files(file1, file2, compare_names=True):
    """
    Compare two files for equality.

    The comparison function is selected from the extension of
    ``file1`` (as returned by :py:func:`riboviz.utils.get_file_ext`).
    The following functions are used to compare each type of file:

    * ``bai``: :py:func:`riboviz.utils.equal_file_sizes`
    * ``bam``: :py:func:`riboviz.sam_bam.equal_bam`
    * ``bedgraph``: :py:func:`riboviz.bedgraph.equal_bedgraph`
    * ``fq``: :py:func:`riboviz.fastq.equal_fastq`
    * ``h5``: :py:func:`riboviz.h5.equal_h5`
    * ``ht2``: :py:func:`riboviz.utils.equal_file_sizes`
    * ``pdf``: :py:func:`riboviz.utils.equal_file_names`
    * ``sam``: :py:func:`riboviz.sam_bam.equal_sam`
    * ``tsv``: :py:func:`riboviz.utils.equal_tsv`

    :param file1: File name
    :type file1: str or unicode
    :param file2: File name
    :type file2: str or unicode
    :param compare_names: Compare file names?
    :type compare_names: bool
    :raise AssertionError: If one or other file does not exist, \
    is a directory, is of an unknown file type, or their contents \
    differ
    """
    assert os.path.exists(file1), "Non-existent file: %s" % file1
    assert os.path.exists(file2), "Non-existent file: %s" % file2
    assert not os.path.isdir(file1), "Directory: %s" % file1
    assert not os.path.isdir(file2), "Directory: %s" % file2
    if compare_names:
        utils.equal_file_names(file1, file2)
    ext = utils.get_file_ext(file1)
    if ext.endswith("pdf"):
        # Only the names of PDF files are compared, not their content.
        utils.equal_file_names(file1, file2)
    elif ext.endswith((hisat2.HT2_EXT, sam_bam.BAI_EXT)):
        # Index files are compared by size only.
        utils.equal_file_sizes(file1, file2)
    elif ext.endswith(h5.H5_EXT):
        h5.equal_h5(file1, file2)
    elif ext.endswith(bedgraph.BEDGRAPH_EXT):
        bedgraph.equal_bedgraph(file1, file2)
    elif ext.endswith(sam_bam.BAM_EXT):
        sam_bam.equal_bam(file1, file2)
    elif ext.endswith(sam_bam.SAM_EXT):
        sam_bam.equal_sam(file1, file2)
    elif ext.endswith("tsv"):
        utils.equal_tsv(file1, file2)
    elif ext.endswith(tuple(fastq.FASTQ_ALL_EXTS)):
        fastq.equal_fastq(file1, file2)
    else:
        # Raise explicitly rather than ``assert False``: assert
        # statements are removed when Python runs with -O, which
        # would let files of an unknown type pass silently.
        raise AssertionError("Unknown file type: " + ext)
def test_collate_orf_tpms_and_counts_tsv(expected_fixture, dir_out):
    """
    Test :py:const:`riboviz.workflow_r.COLLATE_TPMS_R` TSV files for
    equality. The
    :py:const:`riboviz.workflow_r.TPMS_ALL_CDS_ALL_SAMPLES_TSV` file
    in the output directory is compared to the expected file using
    :py:func:`riboviz.utils.equal_tsv`, called with
    ``ignore_row_order=True`` and ``na_to_empty_str=True``.

    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param dir_out: Output directory
    :type dir_out: str or unicode
    """
    # The expected data mirrors the output directory under its final
    # path component only.
    out_leaf = os.path.basename(os.path.normpath(dir_out))
    tsv_name = workflow_r.TPMS_ALL_CDS_ALL_SAMPLES_TSV
    expected_file = os.path.join(expected_fixture, out_leaf, tsv_name)
    actual_file = os.path.join(dir_out, tsv_name)
    # Override default TSV comparisons as some columns have string
    # values.
    utils.equal_tsv(expected_file,
                    actual_file,
                    ignore_row_order=True,
                    na_to_empty_str=True)
def test_deplex_num_reads(configuration_module):
    """
    Test that the number of reads summary, produced during
    demultiplexing, is as expected. The summary is looked up in the
    ``multiplex_umi_barcode_adaptor`` demultiplexing directory under
    the configured temporary directory and compared to the simulated
    data using :py:func:`riboviz.utils.equal_tsv`.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    """
    # Only the configuration is needed, not the configuration file.
    settings, _ = configuration_module
    deplex_subdir = workflow_files.DEPLEX_DIR_FORMAT.format(
        "multiplex_umi_barcode_adaptor")
    actual_file = os.path.join(settings[params.TMP_DIR],
                               deplex_subdir,
                               demultiplex_fastq.NUM_READS_FILE)
    expected_file = os.path.join(riboviz.test.SIMDATA_DIR,
                                 "deplex",
                                 demultiplex_fastq.NUM_READS_FILE)
    utils.equal_tsv(expected_file, actual_file)
def test_demultiplex(tmp_dir, file_format):
    """
    Test :py:func:`riboviz.demultiplex_fastq.demultiplex`.

    The simulated multiplexed FASTQ file is copied into ``tmp_dir``
    under a name built from ``file_format``, demultiplexed, and then
    both the number-of-reads summary and each demultiplexed file are
    compared to the simulated expected data.

    :param tmp_dir: Temporary directory
    :type tmp_dir: str or unicode
    :param file_format: FASTQ file format
    :type file_format: str or unicode
    """
    multiplexed_copy = os.path.join(tmp_dir,
                                    file_format.format("test_multiplex"))
    shutil.copyfile(
        os.path.join(riboviz.test.SIMDATA_DIR, "multiplex.fastq"),
        multiplexed_copy)

    demultiplex_fastq.demultiplex(
        os.path.join(riboviz.test.SIMDATA_DIR, "multiplex_barcodes.tsv"),
        multiplexed_copy,
        mismatches=2,
        out_dir=tmp_dir)

    expected_dir = os.path.join(riboviz.test.SIMDATA_DIR, "deplex")
    utils.equal_tsv(
        os.path.join(expected_dir, demultiplex_fastq.NUM_READS_FILE),
        os.path.join(tmp_dir, demultiplex_fastq.NUM_READS_FILE))

    # Actual data has extension matching lower-case version
    # of multiplexed file's extension.
    output_format = file_format.lower()
    for tag in ("Tag0", "Tag1", "Tag2", "Unassigned"):
        actual_output = os.path.join(tmp_dir, output_format.format(tag))
        # Simulated data always has a .fastq extension.
        expected_output = os.path.join(expected_dir,
                                       fastq.FASTQ_FORMAT.format(tag))
        fastq.equal_fastq(expected_output, actual_output)

    # The definition of the simulated data means that Tag3 has no
    # matches, as Tag0|1|2 will match any barcodes first. Check
    # there is no Tag3-related output file.
    tag3_output = os.path.join(tmp_dir, output_format.format("Tag3"))
    assert not os.path.exists(tag3_output)
def compare_tsv_files(expected_fixture, directory, subdirectory, file_name):
    """
    Test TSV files for equality. See
    :py:func:`riboviz.utils.equal_tsv`.

    The expected file is looked up under ``expected_fixture`` using
    the final path component of ``directory``, followed by
    ``subdirectory`` and ``file_name``. The actual file is
    ``file_name`` within ``subdirectory`` of ``directory``.

    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param directory: Directory
    :type directory: str or unicode
    :param subdirectory: Subdirectory
    :type subdirectory: str or unicode
    :param file_name: file name
    :type file_name: str or unicode
    """
    # normpath drops any trailing separator so that basename returns
    # the directory's own name.
    directory_leaf = os.path.basename(os.path.normpath(directory))
    expected_file = os.path.join(expected_fixture,
                                 directory_leaf,
                                 subdirectory,
                                 file_name)
    actual_file = os.path.join(directory, subdirectory, file_name)
    utils.equal_tsv(expected_file, actual_file)