def filter_fastq(fname, barcodes, tmp_folder):
    """
    Filter reads belonging to unselected barcodes

    The barcode of a read is the text between the leading '@' of its name
    line and the first ':'. Records whose barcode is in `barcodes` are
    copied unchanged (all four lines) to a new temporary file.

    args:
    ------
        fname:
            a fastq file, support .gz, .txt, .bz2 file

        barcodes:
            a list (or any sized iterable) contains selected barcodes

        tmp_folder:
            folder to store temp file

    output:
    ------
        temporary file name; the file is NOT deleted automatically, the
        caller is responsible for removing it.

    exits:
    ------
        calls sys.exit(1) when the file format is not recognized or when
        `barcodes` is empty.
    """
    # file_type is expected to answer "gz", "bz2" or "txt"; anything else
    # is rejected before any file handle is created.
    ftype = file_type(fname)
    if ftype not in ("gz", "bz2", "txt"):
        print(("error: unrecoginized fastq " + fname +
               " file format, only supports .gz, .bz2, .fastq"))
        sys.exit(1)

    if len(barcodes) == 0:
        print("error: no barcode is selected")
        sys.exit(1)
    barcodes = set(barcodes)

    # Every format is opened in text mode so that the EOF test against ""
    # and the str-based split below behave the same for compressed and
    # plain input.
    if ftype == "gz":
        fin = gzip.open(fname, 'rt')
    elif ftype == "bz2":
        fin = bz2.open(fname, 'rt')
    else:
        fin = open(fname, 'r')

    # The temp file is opened in text mode too, matching what is read.
    with fin, tempfile.NamedTemporaryFile(
            mode='w', delete=False, dir=tmp_folder) as fout:
        fout_name = fout.name
        while True:
            cur_name = fin.readline()
            cur_read = fin.readline()
            cur_plus = fin.readline()
            cur_qual = fin.readline()
            if cur_name == "":
                break
            # "@BARCODE:rest" -> "BARCODE"
            cur_barcode = cur_name.split(":")[0][1:]
            if cur_barcode in barcodes:
                fout.write(cur_name)
                fout.write(cur_read)
                fout.write(cur_plus)
                fout.write(cur_qual)
    return fout_name
def get_barcode_cov_from_bed(barcode_list, input_bed):
    """
    Get barcode coverage from bed file

    The barcode of a record is the part of its 4th whitespace-separated
    column that precedes the first ':', upper-cased. Each record counts
    once (approximate counting, a read is half fragment).

    Args:
    -----
        barcode_list:
            a list of pre-defined barcodes; it is only checked for being
            non-empty, the counts are NOT restricted to these barcodes

        input_bed:
            a bed file (.gz, .bz2 or plain text)

    Returns:
    ------
        a defaultdict mapping barcode to the number of records carrying it

    Exits:
    ------
        calls sys.exit(1) when `barcode_list` is empty or the file format
        is not recognized.
    """
    if len(barcode_list) == 0:
        print("error: @get_barcode_cov_from_bed: barcode_list is empty!")
        sys.exit(1)

    ftype = file_type(input_bed)
    if ftype == "gz":
        # compressed streams are decoded as UTF-8
        fin = gzip.open(input_bed, 'rt', encoding='utf-8')
    elif ftype == "bz2":
        fin = bz2.open(input_bed, 'rt', encoding='utf-8')
    elif ftype == "txt":
        fin = open(input_bed, 'r')
    else:
        print(
            "error: unrecoginized bed file format, only supports .gz, .bz2, .fastq"
        )
        sys.exit(1)

    barcode_dict = collections.defaultdict(int)
    # One counting loop for all formats; the handle is released even when
    # a malformed record raises.
    with fin:
        for _read in fin:
            if not _read.strip():
                # tolerate blank lines (e.g. a trailing newline)
                continue
            barcode = _read.split()[3].split(":")[0].upper()
            barcode_dict[barcode] += 1
    return barcode_dict
def group_reads_by_barcode_bed(input_bed):
    """
    Group fragments based on the barcodes

    Consecutive records sharing the same barcode (4th column up to the
    first ':', upper-cased) form one group, so the file is expected to be
    ordered by barcode; a barcode that re-appears later starts a new group.

    Args:
        input_bed: a bed file (.gz, .bz2 or plain text)

    Returns:
        Generator yielding, for each run of equal barcodes, a generator
        over the raw lines (str, newline included) of that run. Nothing is
        yielded for an input without records.

    Exits:
        calls sys.exit(1) when the file does not exist or its format is
        not recognized. Because this is a generator, the checks run on the
        first iteration step, not at call time.
    """
    if not os.path.exists(input_bed):
        print(("Error @group_reads_by_barcode_bed: " + input_bed +
               " does not exist!"))
        sys.exit(1)

    ftype = file_type(input_bed)
    if ftype == "gz":
        # compressed streams are decoded as UTF-8
        fin = gzip.open(input_bed, 'rt', encoding='utf-8')
    elif ftype == "bz2":
        fin = bz2.open(input_bed, 'rt', encoding='utf-8')
    elif ftype == "txt":
        fin = open(input_bed, 'r')
    else:
        print(
            "error: unrecoginized fastq file format, only supports .gz, .bz2, .fastq"
        )
        sys.exit(1)

    read_group_list = []
    pre_barcode = ""
    # try/finally so the handle is closed even when the consumer stops
    # iterating early (generator close) or a malformed record raises.
    try:
        for cur_read in fin:
            if not cur_read.strip():
                # tolerate blank lines (e.g. a trailing newline)
                continue
            cur_barcode = cur_read.split()[3].split(":")[0].upper()
            if cur_barcode == pre_barcode:
                read_group_list.append(cur_read)
            else:
                if pre_barcode != "":
                    # return read group
                    yield (x for x in read_group_list)
                read_group_list = [cur_read]  # add the first read
                pre_barcode = cur_barcode
        # reads from the last barcode; skipped when the file had no records
        if read_group_list:
            yield (x for x in read_group_list)
    finally:
        fin.close()
def count_barcode_cov_from_fastq(fname):
    """
    Count barcode coverage from fastq file

    The barcode of a read is the text between the leading '@' of its name
    line and the first ':'. Reading stops at the first record whose name
    line is empty after stripping (end of file).

    args:
    -----
        fname:
            a fastq file, support .gz, .txt, .bz2 file

    output:
    -----
        a defaultdict mapping barcode to its number of reads

    exits:
    -----
        calls sys.exit(1) when the file format is not recognized.
    """
    ftype = file_type(fname)
    # Text mode for every format: the EOF test against "" and the
    # str-based split below require str, not bytes.
    if ftype == "gz":
        fin = gzip.open(fname, 'rt')
    elif ftype == "bz2":
        fin = bz2.open(fname, 'rt')
    elif ftype == "txt":
        fin = open(fname, 'r')
    else:
        print(
            "error: unrecoginized fastq file format, only supports .gz, .bz2, .fastq"
        )
        sys.exit(1)

    barcode_cov = collections.defaultdict(int)
    with fin:
        while True:
            cur_name = fin.readline().strip()[1:]
            # sequence, '+' separator and quality lines are consumed only
            # to advance to the next record
            fin.readline()
            fin.readline()
            fin.readline()
            if cur_name == "":
                break
            barcode_cov[cur_name.split(":")[0]] += 1
    return barcode_cov
def dex_fastq(input_fastq, output_fastq, index_fastq_list):
    """
    De-multiplex fastq files by adding barcode to the beginning of each read name.

    For every read of `input_fastq` the sequences found at the same
    position in each index file are concatenated (in list order) into a
    barcode, and the read is written as "@<barcode>:<original name>".
    The '+' line of the output is always a bare "+".

    Required:
    --------
        input_fastq:
            a fastq format file that contains the sequencing reads;

        output_fastq:
            output fastq file; written gzip-compressed when the name ends
            with "gz", bzip2-compressed when it ends with "bz2", plain
            text otherwise. Must not exist yet;

        index_fastq_list:
            a non-empty list of fastq files that contains the barcode

    Exits:
    --------
        calls sys.exit when a file is missing, the output already exists,
        the index list is empty, a file format is not recognized, or the
        read name in an index file does not match the read name in
        `input_fastq` (first whitespace-separated token is compared).
    """
    # NOTE(review): another function named dex_fastq with a different
    # signature appears later in this source; if both live in the same
    # module the later definition shadows this one -- confirm.

    def _open_fastq_text(path):
        """Open a fastq file for reading in text mode, whatever its format."""
        ftype = file_type(path)
        if ftype == "gz":
            return gzip.open(path, 'rt')
        if ftype == "bz2":
            return bz2.open(path, 'rt')
        if ftype == "txt":
            return open(path, 'r')
        print(('error: unrecoginized fastq ' + path +
               ' file format, only supports .gz, .bz2, .fastq'))
        sys.exit(1)

    # check whether the input/output files exist
    if not os.path.exists(input_fastq):
        print(('error: ' + input_fastq + ' does not exist!'))
        sys.exit(1)

    if os.path.exists(output_fastq):
        print(('error: ' + output_fastq + ' already exists, remove it first!'))
        sys.exit(1)

    for index_fastq in index_fastq_list:
        if not os.path.exists(index_fastq):
            print(('error: ' + index_fastq + ' does not exist!'))
            sys.exit(1)

    if len(index_fastq_list) == 0:
        # without index files there is no barcode to attach
        print('error: index_fastq_list is empty!')
        sys.exit(1)

    opened = []  # every handle created so far, closed in the finally below
    try:
        fr1 = _open_fastq_text(input_fastq)
        opened.append(fr1)

        index_files = []
        for index_fastq in index_fastq_list:
            fix = _open_fastq_text(index_fastq)
            opened.append(fix)
            index_files.append(fix)

        # text mode on the output side as well, since str is written
        if output_fastq.endswith("gz"):
            fout = gzip.open(output_fastq, 'wt')
        elif output_fastq.endswith("bz2"):
            fout = bz2.open(output_fastq, 'wt')
        else:
            fout = open(output_fastq, 'w')
        opened.append(fout)

        while True:
            cur_r1_name = fr1.readline().strip()[1:]
            if cur_r1_name == "":
                break
            cur_r1_read = fr1.readline().strip()
            fr1.readline()  # '+' separator line, not needed
            cur_r1_qual = fr1.readline().strip()

            cur_idex_list = []
            for fix in index_files:
                cur_name = fix.readline().strip()[1:]
                cur_read = fix.readline().strip()
                fix.readline()  # '+' separator line
                fix.readline()  # quality line
                # Every index file is checked (not only the last one); the
                # [:1] slices also flag an index file that ended early.
                if cur_name.split()[:1] != cur_r1_name.split()[:1]:
                    sys.exit("read name does not match")
                cur_idex_list.append(cur_read)
            cur_barcode = "".join(cur_idex_list)

            fout.write('@' + cur_barcode + ':' + cur_r1_name + "\n")
            fout.write(cur_r1_read + "\n")
            fout.write("+\n")
            fout.write(cur_r1_qual + "\n")
    finally:
        for handle in opened:
            handle.close()
def run_align_se(input_reference, input_fastq1, output_bam, aligner,
                 path_to_aligner, read_fastq_command, num_threads, min_cov,
                 aligner_options, if_sort, tmp_folder, overwrite):
    """
    Map single-cell ATAC-seq reads in single-end mode

    Required
    --------
    input_reference:
        reference genome file generated by index_reference

    input_fastq1:
        a fastq file contains R1 reads, supports .fq, .fastq, .gz, .bz2

    output_bam:
        a bam file contains alignments

    Optional
    --------
    path_to_aligner:
        directory path access to the aligner; None means the aligner is
        looked up on PATH

    aligner:
        aligner name "bwa", "bowtie", "bowtie2" or "minimap2"
        (case-insensitive)

    aligner_options:
        a list of strings indicating options you'd like passed to aligner;
        the list is copied, never modified. When None the defaults are
        (bowtie2: "--phred33-quals")
        (bowtie: "-S -k 1 -m 1 --best --strata --chunkmbs 3072 -n 1 -e 100 --phred33-quals")
        (bwa: "mem")
        (minimap2: "-ax sr --secondary=no")
        A thread option ("-p" for bowtie/bowtie2, "-t" for bwa/minimap2)
        is appended unless the options already contain that flag.

    num_threads:
        number of mapping threads;

    if_sort:
        if sort the alignment based on read name;

    tmp_folder:
        where to store the temporary files, None for the system default;

    min_cov:
        when > 0, only reads of barcodes with strictly more than min_cov
        reads are kept before alignment;

    read_fastq_command:
        command to uncompress a compressed fastq file i.e. 'zcat', 'bzcat';
        None to choose it from the file type;

    overwrite:
        whether to overwrite the output file if it already exists;

    Returns
    -------
    0 on success; every validation or alignment failure ends in sys.exit.
    """
    # if the aligner path given, need to check the existance of the aligner
    if path_to_aligner is not None:
        path_to_aligner += "/"
        if not os.path.isdir(path_to_aligner):
            print('Error: path_to_aligner is not a folder')
            sys.exit(1)
        if not os.path.exists(path_to_aligner + aligner):
            print('Error: aligner does not exist')
            sys.exit(1)
    else:
        try:
            # pipe output to /dev/null for silence
            with open("/dev/null", "w") as null:
                subprocess.Popen(aligner, stdout=null, stderr=null)
        except OSError:
            print(('Error: ' + aligner + ' does not exist!'))
            sys.exit(1)
        path_to_aligner = ""

    if tmp_folder is not None:
        if not os.path.isdir(tmp_folder):
            print('Error: tmp_folder is not a folder or does not exist')
            sys.exit(1)

    # check the existance of input and output files
    if not os.path.exists(input_fastq1):
        sys.exit('Error: ' + input_fastq1 + ' does not exist!')

    if os.path.isfile(output_bam):
        if overwrite:
            os.remove(output_bam)
        else:
            sys.exit("error: '%s' already exists, remove it first" %
                     output_bam)

    # check if can create the output_bam file
    try:
        with open(output_bam, "w") as outfile:
            outfile.write('')
        os.remove(output_bam)
    except IOError:
        print(("error: could not create %s, check if the folder exists." %
               output_bam))
        sys.exit(1)

    # check validity of aligner before any expensive work is started
    aligner = aligner.lower()
    if aligner not in ["bwa", "bowtie", "bowtie2", "minimap2"]:
        sys.exit('Error: only support bwa, bowtie, bowtie2, minimap2')

    filtered_fastq = None  # temp fastq produced by the min_cov filter
    ftmp_name = None  # temp file receiving the aligner's stdout
    try:
        if min_cov > 0:
            barcode_dict = count_barcode_cov_from_fastq(input_fastq1)
            barcode_sel = set(
                key for key in barcode_dict if barcode_dict[key] > min_cov)
            if len(barcode_sel) == 0:
                print(
                    "error: no barcode contains fragments more than --min-cov, lower --min-cov and try it again!"
                )
                sys.exit(1)
            input_fastq1 = filter_fastq(input_fastq1, barcode_sel, tmp_folder)
            filtered_fastq = input_fastq1
            # the filtered file is always plain text
            read_fastq_command = "cat"

        # default aligner option; a caller-supplied list is copied so the
        # thread option appended below never leaks into the caller's list
        if aligner_options is None:
            if aligner == "minimap2":
                options = ["-ax", "sr", "--secondary=no"]
            elif aligner == "bowtie":
                options = [
                    "-S", "-k 1", "-m 1", "--best", "--strata",
                    "--chunkmbs 3072", "-n 1", "-e 100", "--phred33-quals"
                ]
            elif aligner == "bowtie2":
                options = ["--phred33-quals"]
            else:  # bwa
                options = ["mem"]
        else:
            options = list(aligner_options)

        # add num_threads unless the thread flag is already present; the
        # options are tokenized so a flag in first or last position is
        # detected as well
        thread_flag = "-p" if aligner in ["bowtie", "bowtie2"] else "-t"
        if thread_flag not in " ".join(options).split():
            options.append(thread_flag + " " + str(num_threads))

        # if cat_cmd is not given, automatically detect file type and choose cat_cmd
        if read_fastq_command is None:
            ftype = file_type(input_fastq1)
            if ftype == "gz":
                read_fastq_command = "zcat"
            elif ftype == "bz2":
                read_fastq_command = "bzcat"
            elif ftype == "txt":  # .fq or fastq file
                read_fastq_command = "cat"
            else:
                sys.exit(
                    'Error: unrecoganized fastq file, supports .fq, .fastq, .gz, .bz2 file'
                )

        # NOTE: paths are interpolated unquoted into a bash command line
        # (bash is required for the <(...) process substitution), so paths
        # containing spaces or shell metacharacters are not supported.
        reads = "<(" + read_fastq_command + " " + input_fastq1 + ")"
        args = [path_to_aligner + aligner]
        args.extend(options)
        if aligner == "bowtie2":
            # unpaired reads go through -U; -1 is only valid together with -2
            args.append("-x " + input_reference)
            args.append("-U " + reads)
        else:
            # bowtie, bwa and minimap2 take <reference> <reads> positionally
            args.append(input_reference)
            args.append(reads)

        # mapping and write the alignments into a temporary file
        ftmp = tempfile.NamedTemporaryFile(delete=False, dir=tmp_folder)
        ftmp_name = ftmp.name
        try:
            subprocess.check_call(" ".join(args),
                                  stdout=ftmp,
                                  shell=True,
                                  executable='/bin/bash')
        except subprocess.CalledProcessError:
            sys.exit(
                'Error: failed to run alignment, check if aligner and reference genome is correct!'
            )
        finally:
            ftmp.close()

        if if_sort:
            pysam.sort("-n", "-@", str(num_threads), "-o", output_bam,
                       ftmp_name)
        else:
            samfile = pysam.AlignmentFile(ftmp_name, "r")
            fout = pysam.AlignmentFile(output_bam, "wb", template=samfile)
            for read in samfile.fetch():
                fout.write(read)
            fout.close()
            samfile.close()
    finally:
        # remove the temporary alignment and the filtered fastq, whether
        # the alignment succeeded or not
        for leftover in (ftmp_name, filtered_fastq):
            if leftover is not None and os.path.exists(leftover):
                os.remove(leftover)
    return 0
def dex_fastq(input_fastq, output_fastq, index1_fastq, index2_fastq,
              index_list):
    """
    Decomplex fastq file by adding barcode to the beginning of the read name.

    For every read, r7 is taken from the start and i7 from the end of the
    I1 sequence, i5 from the start and r5 from the end of the I2 sequence.
    When all four are listed in `index_list` the read is written as
    "@<r7><i7><i5><r5>:<original name>"; other reads are dropped. A usage
    report (share of usable reads per index) is printed at the end.

    Args:
        input_fastq: fastq file contains sequencing reads (demo.R1.fastq.gz), support .fastq, .gz, .bz2
        output_fastq: output fastq file, must not exist yet; compressed
            when the name ends with "gz" or "bz2"
        index1_fastq: fastq file contains r7, i7 barcode (demo.I1.fastq.gz), support .fastq, .gz, .bz2
        index2_fastq: fastq file contains r5, i5 barcode (demo.I2.fastq.gz), support .fastq, .gz, .bz2
        index_list: txt file contains pre-designed r7, i7, r5, i5 barcodes (barcodes.txt);
            lines with two columns "<sequence> <r7|i7|r5|i5>" are used,
            lines with another number of columns are ignored

    Returns:
        0 on success; every validation failure ends in sys.exit with an
        error message.
    """
    index_types = ("r7", "i7", "r5", "i5")

    def _open_fastq_text(path):
        """Open a fastq file for reading in text mode, whatever its format."""
        ftype = file_type(path)
        if ftype == "gz":
            return gzip.open(path, 'rt')
        if ftype == "bz2":
            return bz2.open(path, 'rt')
        if ftype == "txt":
            return open(path, 'r')
        sys.exit("error: '%s' is in an unrecognized file format" % path)

    def _next_record(handle):
        """Read one 4-line record; return (name without '@', sequence, quality)."""
        name = handle.readline().strip()[1:]
        seq = handle.readline().strip()
        handle.readline()  # '+' separator line, not needed
        qual = handle.readline().strip()
        return name, seq, qual

    # check if those files exist
    for path in (input_fastq, index1_fastq, index2_fastq, index_list):
        if not os.path.isfile(path):
            sys.exit("error: '%s' not exist" % path)
    if os.path.isfile(output_fastq):
        sys.exit("error: '%s' already exists, remove it first" % output_fastq)

    # check barcodes: one counter per index type, keyed by upper-cased sequence
    counts = dict(
        (name, collections.defaultdict(int)) for name in index_types)
    with open(index_list) as fin:
        for line in fin:
            elems = line.split()
            if len(elems) != 2:
                continue
            if elems[1] not in counts:
                sys.exit(
                    "error: unorganized index '%s', only support r7, i7, r5, i5"
                    % elems[1])
            counts[elems[1]][elems[0].upper()] = 0

    # check if index is the same length (an index type without any entry
    # fails this check too)
    lengths = {}
    for name in index_types:
        key_lengths = set(map(len, counts[name]))
        if len(key_lengths) != 1:
            sys.exit("error: %s index has different length" % name)
        lengths[name] = key_lengths.pop()

    total_reads = 0  # number of totally sequenced reads
    quali_reads = 0  # number of usable reads

    opened = []  # every handle created so far, closed in the finally below
    try:
        fi1 = _open_fastq_text(index1_fastq)
        opened.append(fi1)
        fi2 = _open_fastq_text(index2_fastq)
        opened.append(fi2)
        fr1 = _open_fastq_text(input_fastq)
        opened.append(fr1)

        # text mode on the output side as well, since str is written
        if output_fastq.endswith("gz"):
            fout = gzip.open(output_fastq, 'wt')
        elif output_fastq.endswith("bz2"):
            fout = bz2.open(output_fastq, 'wt')
        else:
            fout = open(output_fastq, 'w')
        opened.append(fout)

        while True:
            cur_i1_name, cur_i1_read, _ = _next_record(fi1)
            cur_i2_name, cur_i2_read, _ = _next_record(fi2)
            cur_r1_name, cur_r1_read, cur_r1_qual = _next_record(fr1)

            # stop as soon as any of the three files is exhausted
            if cur_i1_name == "" or cur_i2_name == "" or cur_r1_name == "":
                break
            if not (cur_i1_name.split()[0] == cur_i2_name.split()[0] ==
                    cur_r1_name.split()[0]):
                sys.exit("error: read name does not match")

            total_reads += 1
            # each slice uses the length of the index type it extracts
            cur_r7 = cur_i1_read[:lengths["r7"]].upper()
            cur_i7 = cur_i1_read[-lengths["i7"]:].upper()
            cur_i5 = cur_i2_read[:lengths["i5"]].upper()
            cur_r5 = cur_i2_read[-lengths["r5"]:].upper()

            if (cur_i5 in counts["i5"]) and (cur_i7 in counts["i7"]) and (
                    cur_r5 in counts["r5"]) and (cur_r7 in counts["r7"]):
                quali_reads += 1
                cur_barcode_cur = cur_r7 + cur_i7 + cur_i5 + cur_r5
                fout.write('@' + cur_barcode_cur + ':' + cur_r1_name + "\n")
                fout.write(cur_r1_read + "\n")
                fout.write("+\n")
                fout.write(cur_r1_qual + "\n")
                counts["r7"][cur_r7] += 1
                counts["r5"][cur_r5] += 1
                counts["i7"][cur_i7] += 1
                counts["i5"][cur_i5] += 1
    finally:
        for handle in opened:
            handle.close()

    #### generate a report
    print("Total number of sequencing reads: ", total_reads)
    print("Total number of usable reads: ", quali_reads)
    print("==========================================")
    for name in index_types:
        for key in counts[name]:
            # no usable read at all -> report 0% instead of dividing by zero
            if quali_reads:
                share = float(counts[name][key]) / quali_reads * 100
            else:
                share = 0.0
            print('%s\t%s\t%.2f%%' % (key, name, round(share, 2)))
    return 0