Ejemplo n.º 1
0
def test_multiplex_umitools_extract_fq(extract_umis, expected_fixture, dir_tmp,
                                       multiplex_name):
    """
    Check that the multiplexed FASTQ file produced by ``umi_tools
    extract`` matches the expected one. The comparison is delegated
    to :py:func:`riboviz.fastq.equal_fastq`.

    Skipped by ``pytest`` automatically if ``multiplex_name``
    fixture is not injected.

    Skipped if ``extract_umis`` (the value of
    :py:const:`riboviz.params.EXTRACT_UMIS`) is false.

    The expected file is looked up under ``expected_fixture`` in a
    subdirectory with the same base name as ``dir_tmp``.

    :param extract_umis: Configuration parameter
    :type extract_umis: bool
    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param dir_tmp: Temporary directory
    :type dir_tmp: str or unicode
    :param multiplex_name: Multiplexed FASTQ file name prefix
    :type multiplex_name: str or unicode
    """
    if not extract_umis:
        pytest.skip('Skipped test as extract_umis: {}'.format(extract_umis))
    fq_name = workflow_files.UMI_EXTRACT_FQ_FORMAT.format(multiplex_name)
    # normpath strips any trailing separator so basename is never empty.
    tmp_leaf = os.path.basename(os.path.normpath(dir_tmp))
    expected_fq = os.path.join(expected_fixture, tmp_leaf, fq_name)
    actual_fq = os.path.join(dir_tmp, fq_name)
    fastq.equal_fastq(expected_fq, actual_fq)
Ejemplo n.º 2
0
def test_demultiplex_gz(tmp_dir, file_format):
    """
    Exercise :py:func:`riboviz.demultiplex_fastq.demultiplex` on a
    GZIPped multiplexed FASTQ file.

    The simulated ``multiplex.fastq`` file is GZIPped into
    ``tmp_dir``, demultiplexed with up to 2 mismatches, and each
    GZIPped output file is decompressed and compared to the
    simulated ``deplex`` data.

    Each ``file_format`` consists of a FASTQ GZIP file name format and
    the corresponding non-GZIP FASTQ file name format.

    :param tmp_dir: Temporary directory
    :type tmp_dir: str or unicode
    :param file_format: File name format
    :type file_format: tuple(str or unicode, str or unicode)
    """
    gz_fmt, fmt = file_format
    multiplex_gz = os.path.join(tmp_dir, gz_fmt.format("test_multiplex"))
    multiplex_fq = os.path.join(riboviz.test.SIMDATA_DIR, "multiplex.fastq")
    # GZIP the simulated multiplexed data into the temporary directory.
    with open(multiplex_fq, "rb") as plain, \
            gzip.open(multiplex_gz, "wb") as zipped:
        shutil.copyfileobj(plain, zipped)
    barcodes = os.path.join(riboviz.test.SIMDATA_DIR,
                            "multiplex_barcodes.tsv")
    demultiplex_fastq.demultiplex(barcodes,
                                  multiplex_gz,
                                  mismatches=2,
                                  out_dir=tmp_dir)

    expected_dir = os.path.join(riboviz.test.SIMDATA_DIR, "deplex")
    utils.equal_tsv(
        os.path.join(expected_dir, demultiplex_fastq.NUM_READS_FILE),
        os.path.join(tmp_dir, demultiplex_fastq.NUM_READS_FILE))
    for tag in ("Tag0", "Tag1", "Tag2", "Unassigned"):
        # Demultiplexed output carries a lower-case version of the
        # multiplexed file's extension.
        zipped_path = os.path.join(tmp_dir, gz_fmt.lower().format(tag))
        unzipped_path = os.path.join(tmp_dir, fmt.format(tag))
        # Decompress the output so it can be compared as plain FASTQ.
        with gzip.open(zipped_path, "rb") as zipped, \
                open(unzipped_path, "wb") as plain:
            shutil.copyfileobj(zipped, plain)
        # Simulated data always has a .fastq extension.
        fastq.equal_fastq(
            os.path.join(expected_dir, fastq.FASTQ_FORMAT.format(tag)),
            unzipped_path)
    # The definition of the simulated data means that Tag3 has no
    # matches, as Tag0|1|2 will match any barcodes first, so no
    # Tag3-related output file should have been written.
    tag3_path = os.path.join(tmp_dir, gz_fmt.lower().format("Tag3"))
    assert not os.path.exists(tag3_path)
Ejemplo n.º 3
0
def compare_files(file1, file2, compare_names=True):
    """
    Assert that two files are equal, choosing the comparison from
    the extension of ``file1``:

    * ``bai``: :py:func:`riboviz.utils.equal_file_sizes`
    * ``bam``: :py:func:`riboviz.sam_bam.equal_bam`
    * ``bedgraph``: :py:func:`riboviz.bedgraph.equal_bedgraph`
    * ``fq``: :py:func:`riboviz.fastq.equal_fastq`
    * ``h5``: :py:func:`riboviz.h5.equal_h5`
    * ``ht2``: :py:func:`riboviz.utils.equal_file_sizes`
    * ``pdf``: :py:func:`riboviz.utils.equal_file_names`
    * ``sam``: :py:func:`riboviz.sam_bam.equal_sam`
    * ``tsv``: :py:func:`riboviz.utils.equal_tsv`

    Both paths are first checked to exist and not to be
    directories. If ``compare_names`` is true, the file names are
    compared before the type-specific comparison is run.

    :param file1: File name
    :type file1: str or unicode
    :param file2: File name
    :type file2: str or unicode
    :param compare_names: Compare file names?
    :type compare_names: bool
    :raise AssertionError: If one or other file does not exist, \
    is a directory or their contents differ
    """
    # Existence of both files is checked before either directory check.
    for path in (file1, file2):
        assert os.path.exists(path), "Non-existent file: %s" % path
    for path in (file1, file2):
        assert not os.path.isdir(path), "Directory: %s" % path
    if compare_names:
        utils.equal_file_names(file1, file2)
    ext = utils.get_file_ext(file1)
    # Extensions are matched by suffix, first match wins.
    if ext.endswith(("pdf",)):
        # PDFs are only compared by name, not by content.
        utils.equal_file_names(file1, file2)
        return
    if ext.endswith((hisat2.HT2_EXT, sam_bam.BAI_EXT)):
        utils.equal_file_sizes(file1, file2)
        return
    if ext.endswith((h5.H5_EXT,)):
        h5.equal_h5(file1, file2)
        return
    if ext.endswith((bedgraph.BEDGRAPH_EXT,)):
        bedgraph.equal_bedgraph(file1, file2)
        return
    if ext.endswith((sam_bam.BAM_EXT,)):
        sam_bam.equal_bam(file1, file2)
        return
    if ext.endswith((sam_bam.SAM_EXT,)):
        sam_bam.equal_sam(file1, file2)
        return
    if ext.endswith(("tsv",)):
        utils.equal_tsv(file1, file2)
        return
    if ext.endswith(tuple(fastq.FASTQ_ALL_EXTS)):
        fastq.equal_fastq(file1, file2)
        return
    assert False, "Unknown file type: " + ext
Ejemplo n.º 4
0
def test_barcode_umi_extract(configuration_module):
    """
    Check the FASTQ file produced by barcode and UMI extraction.

    The simulated ``multiplex`` FASTQ file is compared, via
    :py:func:`riboviz.fastq.equal_fastq`, with the extraction output
    for ``multiplex_umi_barcode_adaptor`` in the configured
    temporary directory.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    """
    settings, _ = configuration_module
    expected_fq = os.path.join(
        riboviz.test.SIMDATA_DIR,
        fastq.FASTQ_FORMAT.format("multiplex"))
    extracted_name = workflow_files.UMI_EXTRACT_FQ_FORMAT.format(
        "multiplex_umi_barcode_adaptor")
    actual_fq = os.path.join(settings[params.TMP_DIR], extracted_name)
    fastq.equal_fastq(expected_fq, actual_fq)
Ejemplo n.º 5
0
def test_demultiplex(tmp_dir, file_format):
    """
    Exercise :py:func:`riboviz.demultiplex_fastq.demultiplex` on an
    uncompressed multiplexed FASTQ file.

    The simulated ``multiplex.fastq`` file is copied into
    ``tmp_dir`` under a name built from ``file_format``,
    demultiplexed with up to 2 mismatches, and the outputs are
    compared to the simulated ``deplex`` data.

    :param tmp_dir: Temporary directory
    :type tmp_dir: str or unicode
    :param file_format: FASTQ file format
    :type file_format: str or unicode
    """
    multiplex_copy = os.path.join(tmp_dir,
                                  file_format.format("test_multiplex"))
    multiplex_fq = os.path.join(riboviz.test.SIMDATA_DIR, "multiplex.fastq")
    shutil.copyfile(multiplex_fq, multiplex_copy)
    barcodes = os.path.join(riboviz.test.SIMDATA_DIR,
                            "multiplex_barcodes.tsv")
    demultiplex_fastq.demultiplex(barcodes,
                                  multiplex_copy,
                                  mismatches=2,
                                  out_dir=tmp_dir)

    expected_dir = os.path.join(riboviz.test.SIMDATA_DIR, "deplex")
    utils.equal_tsv(
        os.path.join(expected_dir, demultiplex_fastq.NUM_READS_FILE),
        os.path.join(tmp_dir, demultiplex_fastq.NUM_READS_FILE))
    # Demultiplexed output carries a lower-case version of the
    # multiplexed file's extension.
    output_fmt = file_format.lower()
    for tag in ("Tag0", "Tag1", "Tag2", "Unassigned"):
        # Simulated data always has a .fastq extension.
        fastq.equal_fastq(
            os.path.join(expected_dir, fastq.FASTQ_FORMAT.format(tag)),
            os.path.join(tmp_dir, output_fmt.format(tag)))
    # The definition of the simulated data means that Tag3 has no
    # matches, as Tag0|1|2 will match any barcodes first, so no
    # Tag3-related output file should have been written.
    tag3_path = os.path.join(tmp_dir, output_fmt.format("Tag3"))
    assert not os.path.exists(tag3_path)
def test_umi_extract(configuration_module, sample_id):
    """
    Check the FASTQ file produced by UMI extraction for a sample.

    The simulated FASTQ file named after ``sample_id`` is compared,
    via :py:func:`riboviz.fastq.equal_fastq`, with the UMI
    extraction output in the sample's subdirectory of the configured
    temporary directory.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    :param sample_id: sample ID
    :type sample_id: str or unicode
    """
    settings, _ = configuration_module
    expected_fq = os.path.join(riboviz.test.SIMDATA_DIR,
                               fastq.FASTQ_FORMAT.format(sample_id))
    sample_dir = os.path.join(settings[params.TMP_DIR], sample_id)
    actual_fq = os.path.join(sample_dir, workflow_files.UMI_EXTRACT_FQ)
    fastq.equal_fastq(expected_fq, actual_fq)
def test_adaptor_trimming(configuration_module, sample_id):
    """
    Check the FASTQ file produced by adaptor trimming for a sample.

    The simulated FASTQ file named ``<sample_id>_umi`` is compared,
    via :py:func:`riboviz.fastq.equal_fastq`, with the adaptor
    trimming output in the sample's subdirectory of the configured
    temporary directory.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    :param sample_id: sample ID
    :type sample_id: str or unicode
    """
    settings, _ = configuration_module
    expected_name = fastq.FASTQ_FORMAT.format(sample_id + "_umi")
    expected_fq = os.path.join(riboviz.test.SIMDATA_DIR, expected_name)
    sample_dir = os.path.join(settings[params.TMP_DIR], sample_id)
    actual_fq = os.path.join(sample_dir, workflow_files.ADAPTER_TRIM_FQ)
    fastq.equal_fastq(expected_fq, actual_fq)
Ejemplo n.º 8
0
def test_multiplex_cutadapt_fq(expected_fixture, dir_tmp, multiplex_name):
    """
    Check that the multiplexed FASTQ file produced by ``cutadapt``
    matches the expected one. The comparison is delegated to
    :py:func:`riboviz.fastq.equal_fastq`.

    Skipped by ``pytest`` automatically if ``multiplex_name``
    fixture is not injected.

    The expected file is looked up under ``expected_fixture`` in a
    subdirectory with the same base name as ``dir_tmp``.

    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param dir_tmp: Temporary directory
    :type dir_tmp: str or unicode
    :param multiplex_name: Multiplexed FASTQ file name prefix
    :type multiplex_name: str or unicode
    """
    fq_name = workflow_files.ADAPTER_TRIM_FQ_FORMAT.format(multiplex_name)
    # normpath strips any trailing separator so basename is never empty.
    tmp_leaf = os.path.basename(os.path.normpath(dir_tmp))
    expected_fq = os.path.join(expected_fixture, tmp_leaf, fq_name)
    actual_fq = os.path.join(dir_tmp, fq_name)
    fastq.equal_fastq(expected_fq, actual_fq)
Ejemplo n.º 9
0
def compare_fq_files(expected_fixture, directory, subdirectory, file_name):
    """
    Compare an actual FASTQ file with its expected counterpart using
    :py:func:`riboviz.fastq.equal_fastq`.

    The actual file is ``directory/subdirectory/file_name``. The
    expected file has the same subdirectory and file name, under the
    subdirectory of ``expected_fixture`` named after the base name
    of ``directory``.

    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param directory: Directory
    :type directory: str or unicode
    :param subdirectory: Subdirectory
    :type subdirectory: str or unicode
    :param file_name: File name
    :type file_name: str or unicode
    """
    # normpath strips any trailing separator so basename is never empty.
    leaf = os.path.basename(os.path.normpath(directory))
    expected_fq = os.path.join(expected_fixture, leaf, subdirectory,
                               file_name)
    actual_fq = os.path.join(directory, subdirectory, file_name)
    fastq.equal_fastq(expected_fq, actual_fq)
Ejemplo n.º 10
0
def test_deplex_reads(configuration_module, sample_id):
    """
    Check a FASTQ file output by demultiplexing against the
    simulated ``deplex`` data, using
    :py:func:`riboviz.fastq.equal_fastq`.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    :param sample_id: sample ID for demultiplexed reads
    :type sample_id: str or unicode
    """
    # Actual data has a .fq extension, simulated data a .fastq one.
    actual_name = fastq.FQ_FORMAT.format(sample_id)
    expected_name = fastq.FASTQ_FORMAT.format(sample_id)
    settings, _ = configuration_module
    deplex_dir = os.path.join(
        settings[params.TMP_DIR],
        workflow_files.DEPLEX_DIR_FORMAT.format(
            "multiplex_umi_barcode_adaptor"))
    actual_fq = os.path.join(deplex_dir, actual_name)
    expected_fq = os.path.join(riboviz.test.SIMDATA_DIR,
                               "deplex",
                               expected_name)
    fastq.equal_fastq(expected_fq, actual_fq)