コード例 #1
0
ファイル: sam_bam.py プロジェクト: acope3/riboviz
def is_sam(file_name):
    """
    Check whether a file name carries the SAM extension.

    The extension is obtained via ``utils.get_file_ext`` and
    lower-cased before being compared with ``SAM_EXT``, so the
    comparison ignores the case of the extension.

    :param file_name: File name
    :type file_name: str or unicode
    :return: ``True`` if the lower-cased extension of ``file_name``
        equals ``SAM_EXT``, ``False`` otherwise
    :rtype: bool
    """
    return utils.get_file_ext(file_name).lower() == SAM_EXT
コード例 #2
0
ファイル: fastq.py プロジェクト: acope3/riboviz
def is_fastq_gz(file_name):
    """
    Check whether a file name has one of the GZIPped FASTQ extensions.

    The extension is obtained via ``utils.get_file_ext``, lower-cased,
    and then looked up in ``FASTQ_GZ_EXTS``.

    :param file_name: File name
    :type file_name: str or unicode
    :return: ``True`` if the lower-cased extension is in
        ``FASTQ_GZ_EXTS``, ``False`` otherwise
    :rtype: bool
    """
    lowered_ext = utils.get_file_ext(file_name).lower()
    if lowered_ext in FASTQ_GZ_EXTS:
        return True
    return False
コード例 #3
0
ファイル: compare_files.py プロジェクト: acope3/riboviz
def compare_files(file1, file2, compare_names=True):
    """
    Compare two files for equality. The comparison function is chosen
    from the extension of ``file1`` (as returned by
    ``utils.get_file_ext``):

    * ``bai``: :py:func:`riboviz.utils.equal_file_sizes`
    * ``bam``: :py:func:`riboviz.sam_bam.equal_bam`
    * ``bedgraph``: :py:func:`riboviz.bedgraph.equal_bedgraph`
    * ``fq``: :py:func:`riboviz.fastq.equal_fastq`
    * ``h5``: :py:func:`riboviz.h5.equal_h5`
    * ``ht2``: :py:func:`riboviz.utils.equal_file_sizes`
    * ``pdf``: :py:func:`riboviz.utils.equal_file_names`
    * ``sam``: :py:func:`riboviz.sam_bam.equal_sam`
    * ``tsv``: :py:func:`riboviz.utils.equal_tsv`

    The checks are raised explicitly as ``AssertionError`` rather than
    written as ``assert`` statements, so that they are still performed
    when Python runs with ``-O`` (which strips ``assert``).

    :param file1: File name
    :type file1: str or unicode
    :param file2: File name
    :type file2: str or unicode
    :param compare_names: Compare file names?
    :type compare_names: bool
    :raise AssertionError: If one or other file does not exist, \
    is a directory, has an unknown file type, or their contents differ
    """
    # Existence of both files is checked before the directory checks,
    # so the first failure reported is the same as with the original
    # sequence of assertions.
    for file_name in (file1, file2):
        if not os.path.exists(file_name):
            raise AssertionError("Non-existent file: %s" % file_name)
    for file_name in (file1, file2):
        if os.path.isdir(file_name):
            raise AssertionError("Directory: %s" % file_name)
    if compare_names:
        utils.equal_file_names(file1, file2)
    ext = utils.get_file_ext(file1)
    # ``endswith`` is used (rather than equality) so that compound
    # extensions ending in a known extension are still matched.
    if ext.endswith("pdf"):
        # PDFs are only compared by name, regardless of compare_names.
        utils.equal_file_names(file1, file2)
    elif ext.endswith((hisat2.HT2_EXT, sam_bam.BAI_EXT)):
        utils.equal_file_sizes(file1, file2)
    elif ext.endswith(h5.H5_EXT):
        h5.equal_h5(file1, file2)
    elif ext.endswith(bedgraph.BEDGRAPH_EXT):
        bedgraph.equal_bedgraph(file1, file2)
    elif ext.endswith(sam_bam.BAM_EXT):
        sam_bam.equal_bam(file1, file2)
    elif ext.endswith(sam_bam.SAM_EXT):
        sam_bam.equal_sam(file1, file2)
    elif ext.endswith("tsv"):
        utils.equal_tsv(file1, file2)
    elif ext.endswith(tuple(fastq.FASTQ_ALL_EXTS)):
        fastq.equal_fastq(file1, file2)
    else:
        raise AssertionError("Unknown file type: " + ext)
コード例 #4
0
ファイル: test_utils.py プロジェクト: acope3/riboviz
def get_file_ext_name_dot_ext_ext():
    """
    Check that :py:func:`riboviz.utils.get_file_ext` returns
    ``fastq.gz`` when given ``example.fastq.gz``.
    """
    extension = utils.get_file_ext("example.fastq.gz")
    assert extension == "fastq.gz"
コード例 #5
0
ファイル: test_utils.py プロジェクト: acope3/riboviz
def get_file_ext_name():
    """
    Check that :py:func:`riboviz.utils.get_file_ext` returns an
    empty string when given ``example``, a name with no extension.
    """
    extension = utils.get_file_ext("example")
    assert extension == ""
コード例 #6
0
ファイル: demultiplex_fastq.py プロジェクト: riboviz/riboviz
def demultiplex(sample_sheet_file,
                read1_file,
                read2_file=None,
                mismatches=1,
                out_dir=OUTPUT_DIR,
                delimiter=barcodes_umis.BARCODE_DELIMITER):
    """
    Demultiplex FASTQ files using UMI-tools-compliant barcodes present
    within the FASTQ headers and a sample sheet file. GZIPped FASTQ
    files can be handled too.

    The sample sheet is assumed to have a header with column names
    ``SampleID`` and ``TagRead``.

    ``read2_file``, if provided, must be the same format as
    ``read1_file`` i.e. if ``read1_file`` is GZIPped then
    ``read2_file`` must be also.

    Reads are consumed four lines at a time. Each read is passed to
    ``assign_samples`` together with the per-sample output handles;
    reads it reports as unassigned are written, unmodified, to the
    "unassigned" output file(s). Once all reads have been processed,
    output files of samples that received no reads are deleted and the
    per-sample read counts are saved into ``out_dir``.

    All input files and the output directory are validated before any
    file is opened, and every file handle is closed when processing
    ends, whether normally or because of an exception.

    :param sample_sheet_file: Sample sheet file name
    :type sample_sheet_file: str or unicode
    :param read1_file: FASTQ file name
    :type read1_file: str or unicode
    :param read2_file: FASTQ file name, for paired reads, or ``None``
    :type read2_file: str or unicode
    :param mismatches: Mismatches allowed
    :type mismatches: int
    :param out_dir: Output directory
    :type out_dir: str or unicode
    :param delimiter: Barcode delimiter
    :type delimiter: str or unicode
    :raise FileNotFoundError: If ``read1_file`` or ``read2_file`` \
    is not an existing file
    :raise IOError: If ``out_dir`` cannot be created or is an \
    existing file
    """
    # Local import: keeps this function self-contained without
    # touching the module-level imports.
    from contextlib import ExitStack

    print("Demultiplexing reads for file: " + read1_file)
    print("Using sample sheet: " + sample_sheet_file)

    sample_sheet = sample_sheets.load_sample_sheet(sample_sheet_file)
    num_samples = sample_sheet.shape[0]
    sample_ids = list(sample_sheet[sample_sheets.SAMPLE_ID])
    barcodes = list(sample_sheet[sample_sheets.TAG_READ])
    print("Number of samples: {}".format(num_samples))
    print("Allowed mismatches: {}".format(mismatches))
    print("Barcode delimiter: {}".format(delimiter))
    # NOTE(review): num_reads is handed to assign_samples, which
    # presumably updates the per-sample counts in place - confirm.
    num_reads = [0] * num_samples
    num_unassigned_reads = 0
    total_reads = 0

    # Validate every input and the output directory up front, before
    # any file handle exists, so a failure here cannot leak a handle.
    if not os.path.isfile(read1_file):
        raise FileNotFoundError(
            "Error: read 1 file {} does not exist".format(read1_file))

    # Output file name template, selected by the extension of read 1.
    file_format = fastq.FASTQ_FORMATS[utils.get_file_ext(read1_file)]
    # The same opener is used for inputs and outputs, so outputs are
    # GZIPped exactly when read 1 is.
    open_file = gzip.open if fastq.is_fastq_gz(read1_file) else open

    is_paired_end = read2_file is not None
    if is_paired_end and not os.path.isfile(read2_file):
        raise FileNotFoundError(
            "Error: read 2 file {} does not exist".format(read2_file))

    if not os.path.exists(out_dir):
        try:
            os.mkdir(out_dir)
        except Exception as err:
            # Chain the underlying error so its cause is not lost.
            raise IOError(
                "Error: output directory {} cannot be created".format(
                    out_dir)) from err
    elif os.path.isfile(out_dir):
        raise IOError(
            "Error: output directory {} cannot be created".format(out_dir))

    num_reads_file = os.path.join(out_dir, NUM_READS_FILE)
    # Read 1 outputs only carry an "_R1" suffix for paired-end data.
    read1_suffix = "_R1" if is_paired_end else ""
    read1_split_files = [
        os.path.join(out_dir, file_format.format(sample_id + read1_suffix))
        for sample_id in sample_ids
    ]
    read1_unassigned_file = os.path.join(
        out_dir,
        file_format.format(sample_sheets.UNASSIGNED_TAG + read1_suffix))
    if is_paired_end:
        read2_split_files = [
            os.path.join(out_dir, file_format.format(sample_id + "_R2"))
            for sample_id in sample_ids
        ]
        read2_unassigned_file = os.path.join(
            out_dir, file_format.format(sample_sheets.UNASSIGNED_TAG + "_R2"))
    else:
        read2_split_files = []
        read2_unassigned_file = None

    # The ExitStack closes every handle registered with it when the
    # block exits, including when an exception is raised part-way
    # through opening the outputs or processing the reads.
    with ExitStack() as stack:
        read1_fh = stack.enter_context(open_file(read1_file, "rt"))
        read1_split_fhs = [
            stack.enter_context(open_file(file_name, "wt"))
            for file_name in read1_split_files
        ]
        read1_unassigned_fh = stack.enter_context(
            open_file(read1_unassigned_file, "wt"))
        if is_paired_end:
            read2_fh = stack.enter_context(open_file(read2_file, "rt"))
            read2_split_fhs = [
                stack.enter_context(open_file(file_name, "wt"))
                for file_name in read2_split_files
            ]
            read2_unassigned_fh = stack.enter_context(
                open_file(read2_unassigned_file, "wt"))
        else:
            read2_fh = None
            read2_split_fhs = []
            read2_unassigned_fh = None

        while True:
            # Get fastq record/read (4 lines)
            fastq_record1 = list(islice(read1_fh, 4))
            if not fastq_record1:
                break
            if is_paired_end:
                fastq_record2 = list(islice(read2_fh, 4))
            else:
                fastq_record2 = None
            # Count number of processed reads, output every millionth.
            total_reads += 1
            if (total_reads % 1000000) == 0:
                print("{} reads processed".format(total_reads))
            # Assign read to a SampleID,
            # TagRead is 1st read with less than threshold mismatches.
            # Beware: this could cause problems if many mismatches.
            is_assigned = assign_samples(fastq_record1, fastq_record2,
                                         barcodes, read1_split_fhs,
                                         read2_split_fhs, is_paired_end,
                                         num_reads, mismatches, delimiter)
            if not is_assigned:
                # Write unassigned read to file.
                # Note: unassigned reads are not trimmed.
                read1_unassigned_fh.writelines(fastq_record1)
                if is_paired_end:
                    read2_unassigned_fh.writelines(fastq_record2)
                num_unassigned_reads += 1

    # All handles are closed at this point, so empty output files can
    # be removed safely.
    print("All {} reads processed".format(total_reads))

    # Purge files with no reads.
    for index, _ in enumerate(sample_ids):
        if num_reads[index] == 0:
            os.remove(read1_split_files[index])
            if is_paired_end:
                os.remove(read2_split_files[index])

    # Output number of reads by sample to file.
    sample_sheet[sample_sheets.NUM_READS] = num_reads
    sample_sheets.save_deplexed_sample_sheet(sample_sheet,
                                             num_unassigned_reads,
                                             num_reads_file)
    print("Done")