def is_sam(file_name):
    """
    Check whether a file name carries the SAM extension.

    The extension is extracted with ``utils.get_file_ext`` and
    lower-cased before being compared with ``SAM_EXT``, so the check
    is case-insensitive with respect to the file name.

    :param file_name: File name
    :type file_name: str or unicode
    :return: ``True`` if the lower-cased extension of ``file_name`` \
    equals ``SAM_EXT``
    :rtype: bool
    """
    return utils.get_file_ext(file_name).lower() == SAM_EXT
def is_fastq_gz(file_name):
    """
    Check whether a file name carries a GZIP extension.

    The extension is extracted with ``utils.get_file_ext`` and
    lower-cased before being looked up in ``FASTQ_GZ_EXTS``, so the
    check is case-insensitive with respect to the file name.

    :param file_name: File name
    :type file_name: str or unicode
    :return: ``True`` if the lower-cased extension of ``file_name`` \
    is one of ``FASTQ_GZ_EXTS``, ``False`` otherwise
    :rtype: bool
    """
    return utils.get_file_ext(file_name).lower() in FASTQ_GZ_EXTS
def compare_files(file1, file2, compare_names=True):
    """
    Compare two files for equality, choosing the comparison by the
    extension of ``file1``.

    The extension is matched by suffix, in the order below, and the
    first match decides which function is called:

    * ``pdf``: :py:func:`riboviz.utils.equal_file_names`
    * ``ht2``, ``bai``: :py:func:`riboviz.utils.equal_file_sizes`
    * ``h5``: :py:func:`riboviz.h5.equal_h5`
    * ``bedgraph``: :py:func:`riboviz.bedgraph.equal_bedgraph`
    * ``bam``: :py:func:`riboviz.sam_bam.equal_bam`
    * ``sam``: :py:func:`riboviz.sam_bam.equal_sam`
    * ``tsv``: :py:func:`riboviz.utils.equal_tsv`
    * ``fq``: :py:func:`riboviz.fastq.equal_fastq`

    :param file1: File name
    :type file1: str or unicode
    :param file2: File name
    :type file2: str or unicode
    :param compare_names: Compare file names?
    :type compare_names: bool
    :raise AssertionError: If one or other file does not exist, \
    is a directory, has an unknown type, or their contents differ
    """
    # Existence is checked for both files before either directory check
    # so that the first failing assertion is reported in a fixed order.
    for path in (file1, file2):
        assert os.path.exists(path), "Non-existent file: %s" % path
    for path in (file1, file2):
        assert not os.path.isdir(path), "Directory: %s" % path

    if compare_names:
        utils.equal_file_names(file1, file2)

    ext = utils.get_file_ext(file1)

    def has_suffix(*suffixes):
        # str.endswith accepts a tuple of candidate suffixes.
        return ext.endswith(suffixes)

    if has_suffix("pdf"):
        utils.equal_file_names(file1, file2)
    elif has_suffix(hisat2.HT2_EXT, sam_bam.BAI_EXT):
        utils.equal_file_sizes(file1, file2)
    elif has_suffix(h5.H5_EXT):
        h5.equal_h5(file1, file2)
    elif has_suffix(bedgraph.BEDGRAPH_EXT):
        bedgraph.equal_bedgraph(file1, file2)
    elif has_suffix(sam_bam.BAM_EXT):
        sam_bam.equal_bam(file1, file2)
    elif has_suffix(sam_bam.SAM_EXT):
        sam_bam.equal_sam(file1, file2)
    elif has_suffix("tsv"):
        utils.equal_tsv(file1, file2)
    elif has_suffix(*fastq.FASTQ_ALL_EXTS):
        fastq.equal_fastq(file1, file2)
    else:
        assert False, "Unknown file type: " + ext
def get_file_ext_name_dot_ext_ext():
    """
    Test :py:func:`riboviz.utils.get_file_ext` with a file name that
    has a two-part extension, ``example.fastq.gz``; the expected
    result is ``fastq.gz``.
    """
    extension = utils.get_file_ext("example.fastq.gz")
    assert extension == "fastq.gz"
def get_file_ext_name():
    """
    Test :py:func:`riboviz.utils.get_file_ext` with a file name that
    has no extension, ``example``; the expected result is the empty
    string.
    """
    extension = utils.get_file_ext("example")
    assert extension == ""
def demultiplex(sample_sheet_file,
                read1_file,
                read2_file=None,
                mismatches=1,
                out_dir=OUTPUT_DIR,
                delimiter=barcodes_umis.BARCODE_DELIMITER):
    """
    Demultiplex FASTQ files using UMI-tools-compliant barcodes
    present within the FASTQ headers and a sample sheet file.

    GZIPped FASTQ files can be handled too: whether to read and write
    with :py:func:`gzip.open` or :py:func:`open` is decided from
    ``read1_file`` alone.

    The sample sheet is assumed to have a header with column names
    ``SampleID`` and ``TagRead``.

    ``read2_file``, if provided, must be the same format as
    ``read1_file`` i.e. if ``read1_file`` is GZIPped then
    ``read2_file`` must be also.

    One output file is written to ``out_dir`` per sample (with
    ``_R1``/``_R2`` suffixes on the sample ID for paired reads), plus
    file(s) for unassigned reads. Output files for samples that
    received no reads are deleted once processing completes, and the
    per-sample read counts are saved to ``NUM_READS_FILE`` within
    ``out_dir``.

    All input and output file handles are closed when this function
    exits, whether normally or via an exception.

    :param sample_sheet_file: Sample sheet file name
    :type sample_sheet_file: str or unicode
    :param read1_file: FASTQ file name
    :type read1_file: str or unicode
    :param read2_file: FASTQ file name, for paired reads, or ``None``
    :type read2_file: str or unicode
    :param mismatches: Mismatches allowed
    :type mismatches: int
    :param out_dir: Output directory
    :type out_dir: str or unicode
    :param delimiter: Barcode delimiter
    :type delimiter: str or unicode
    :raise FileNotFoundError: If ``read1_file``, or a provided \
    ``read2_file``, is not an existing file
    :raise IOError: If ``out_dir`` does not exist and cannot be \
    created, or exists but is a file
    """
    # Local import so this function is self-contained with respect to
    # the standard-library helper it relies on for handle cleanup.
    from contextlib import ExitStack

    print("Demultiplexing reads for file: " + read1_file)
    print("Using sample sheet: " + sample_sheet_file)

    sample_sheet = sample_sheets.load_sample_sheet(sample_sheet_file)
    num_samples = sample_sheet.shape[0]
    sample_ids = list(sample_sheet[sample_sheets.SAMPLE_ID])
    barcodes = list(sample_sheet[sample_sheets.TAG_READ])
    print("Number of samples: {}".format(num_samples))
    print("Allowed mismatches: {}".format(mismatches))
    print("Barcode delimiter: {}".format(delimiter))

    # Per-sample read counts; passed to assign_samples below, which is
    # presumably where they are incremented - confirm against
    # assign_samples.
    num_reads = [0] * num_samples
    num_unassigned_reads = 0
    total_reads = 0

    # Validate every input and the output directory BEFORE opening any
    # file handle, so that a validation failure cannot leak a handle.
    if not os.path.isfile(read1_file):
        raise FileNotFoundError(
            "Error: read 1 file {} does not exist".format(read1_file))
    # Output file name template, selected by the read 1 file extension.
    file_format = fastq.FASTQ_FORMATS[utils.get_file_ext(read1_file)]
    open_file = gzip.open if fastq.is_fastq_gz(read1_file) else open

    is_paired_end = read2_file is not None
    if is_paired_end and not os.path.isfile(read2_file):
        raise FileNotFoundError(
            "Error: read 2 file {} does not exist".format(read2_file))

    if not os.path.exists(out_dir):
        try:
            os.mkdir(out_dir)
        except OSError as err:
            # Chain the cause so the underlying reason is not lost.
            raise IOError(
                "Error: output directory {} cannot be created".format(
                    out_dir)) from err
    elif os.path.isfile(out_dir):
        raise IOError(
            "Error: output directory {} cannot be created".format(out_dir))

    num_reads_file = os.path.join(out_dir, NUM_READS_FILE)

    # Single-end output files carry no read suffix.
    read1_suffix = "_R1" if is_paired_end else ""
    read1_split_files = [
        os.path.join(out_dir, file_format.format(sample_id + read1_suffix))
        for sample_id in sample_ids
    ]
    read1_unassigned_file = os.path.join(
        out_dir,
        file_format.format(sample_sheets.UNASSIGNED_TAG + read1_suffix))
    if is_paired_end:
        read2_split_files = [
            os.path.join(out_dir, file_format.format(sample_id + "_R2"))
            for sample_id in sample_ids
        ]
        read2_unassigned_file = os.path.join(
            out_dir,
            file_format.format(sample_sheets.UNASSIGNED_TAG + "_R2"))
    else:
        read2_split_files = []
        read2_unassigned_file = None

    # ExitStack closes every handle registered with it on exit from
    # the "with" block, including when an exception propagates or when
    # opening a later file fails part-way through.
    with ExitStack() as stack:
        read1_fh = stack.enter_context(open_file(read1_file, "rt"))
        read1_split_fhs = [
            stack.enter_context(open_file(file_name, "wt"))
            for file_name in read1_split_files
        ]
        read1_unassigned_fh = stack.enter_context(
            open_file(read1_unassigned_file, "wt"))
        if is_paired_end:
            read2_fh = stack.enter_context(open_file(read2_file, "rt"))
            read2_split_fhs = [
                stack.enter_context(open_file(file_name, "wt"))
                for file_name in read2_split_files
            ]
            read2_unassigned_fh = stack.enter_context(
                open_file(read2_unassigned_file, "wt"))
        else:
            read2_fh = None
            read2_split_fhs = []
            read2_unassigned_fh = None

        while True:
            # Get fastq record/read (4 lines).
            fastq_record1 = list(islice(read1_fh, 4))
            if not fastq_record1:
                break
            if is_paired_end:
                fastq_record2 = list(islice(read2_fh, 4))
            else:
                fastq_record2 = None
            # Count number of processed reads, output every millionth.
            total_reads += 1
            if (total_reads % 1000000) == 0:
                print("{} reads processed".format(total_reads))
            # Assign read to a SampleID,
            # TagRead is 1st read with less than threshold mismatches.
            # Beware: this could cause problems if many mismatches.
            is_assigned = assign_samples(fastq_record1, fastq_record2,
                                         barcodes,
                                         read1_split_fhs, read2_split_fhs,
                                         is_paired_end,
                                         num_reads,
                                         mismatches,
                                         delimiter)
            if not is_assigned:
                # Write unassigned read to file.
                # Note: unassigned reads are not trimmed.
                read1_unassigned_fh.writelines(fastq_record1)
                if is_paired_end:
                    read2_unassigned_fh.writelines(fastq_record2)
                num_unassigned_reads += 1

    print("All {} reads processed".format(total_reads))

    # Purge files with no reads. This is done only after all handles
    # have been closed above.
    for index, count in enumerate(num_reads):
        if count == 0:
            os.remove(read1_split_files[index])
            if is_paired_end:
                os.remove(read2_split_files[index])

    # Output number of reads by sample to file.
    sample_sheet[sample_sheets.NUM_READS] = num_reads
    sample_sheets.save_deplexed_sample_sheet(sample_sheet,
                                             num_unassigned_reads,
                                             num_reads_file)
    print("Done")