def move_barcode_to_name_in_fastq(filename, out_dir):
    """Strip the 9-base 5' barcode from each read and record it in the read name.

    Reads ``filename`` with ``HTSeq.FastqReader`` and writes a FASTQ file with
    the same basename into ``out_dir``.  For every read that is kept, the
    first nine bases and their qualities are removed, and bases 0-2 and 7-8
    of the original sequence are appended, after a ``#``, to the part of the
    read name that precedes the first space.

    Reads shorter than 14 bases, or with any of the first nine quality scores
    below 30, are dropped.  An existing output file is never overwritten.

    Args:
        filename: Path of the input FASTQ file.
        out_dir: Directory to write into; created (including parents) if it
            does not exist.

    Returns:
        None.
    """
    # Create the directory through the os module rather than a shell command,
    # so paths containing spaces or shell metacharacters are handled correctly.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    outfilename = os.path.join(out_dir, os.path.basename(filename))
    if os.path.exists(outfilename):
        print("Output file {} exists... Not overwriting...".format(outfilename))
        return
    print("Writing {}".format(outfilename))

    fastq = HTSeq.FastqReader(filename)
    # The context manager guarantees the output is flushed and closed even if
    # a malformed record makes the reader raise part-way through.
    with open(outfilename, 'w') as outf:
        for read in fastq:
            if len(read.seq) < 14:
                continue
            if min(read.qual[:9]) < 30:
                continue
            # read.seq is decoded so it can be concatenated with the name.
            _seq = read.seq.decode()
            n_read = HTSeq.SequenceWithQualities(
                read.seq[9:],
                read.name.partition(' ')[0] + '#' + _seq[0:3] + _seq[7:9],
                read.qualstr[9:])
            n_read.write_to_fastq_file(outf)
def create_alnmt(start, end, chrom, name):
    '''
    Build an HTSeq Alignment for a 1-based blast hit.

    The interval comes from create_genomic_interval(start, end, chrom); the
    read attached to the alignment carries only str(name), with an empty
    sequence and an empty quality string.
    '''
    interval = create_genomic_interval(start, end, chrom)
    placeholder_read = HTSeq.SequenceWithQualities(b"", str(name), b"")
    return HTSeq.Alignment(placeholder_read, interval)
def process_file(filename, output_filename):
    """Shorten the barcode at the end of every read name in a FASTQ file.

    The last nine characters of each read name are treated as a barcode and
    replaced by its characters 0-2 followed by characters 7-8; sequence and
    qualities are copied unchanged.

    Args:
        filename: Path of the input FASTQ file.
        output_filename: Path of the FASTQ file to write (overwritten).

    Returns:
        None.
    """
    # ``with`` closes the output even when reading or writing raises, which
    # the previous explicit close() at the end of the function did not.
    with open(output_filename, 'w') as outf:
        fastq = HTSeq.FastqReader(filename)
        for read in fastq:
            barcode = read.name[-9:]
            new_barcode = barcode[:3] + barcode[7:]
            n_read = HTSeq.SequenceWithQualities(
                read.seq, read.name[:-9] + new_barcode, read.qualstr)
            n_read.write_to_fastq_file(outf)
def move_barcode_to_name_in_fastq(filename, out_dir):
    """Move the 9-base 5' barcode of each read into the read name.

    Reads ``filename`` with ``HTSeq.FastqReader`` and writes a FASTQ file with
    the same basename into ``out_dir`` (overwriting any existing file).  For
    every read that is kept, the first nine bases and their qualities are
    removed and the nine removed bases are appended, after a ``#``, to the
    part of the read name that precedes the first space.

    Reads shorter than 14 bases, or whose first nine quality characters
    include one below 53, are dropped.

    Args:
        filename: Path of the input FASTQ file.
        out_dir: Directory to write into; created (including parents) if it
            does not exist.

    Returns:
        None.
    """
    # Create the directory through the os module rather than a shell command,
    # so paths containing spaces or shell metacharacters are handled correctly.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    out_path = os.path.join(out_dir, os.path.basename(filename))
    fastq = HTSeq.FastqReader(filename)
    # The context manager closes (and flushes) the output; the handle was
    # previously never closed.
    with open(out_path, 'w') as outf:
        for read in fastq:
            if len(read.seq) < 14:
                continue
            # bytearray() yields integer character codes for both str
            # (Python 2) and bytes (Python 3) quality strings, so the
            # comparison with 53 is numeric on either version.
            # NOTE(review): with phred+33 encoding 53 would correspond to
            # Q20 -- confirm the encoding of the input files.
            if min(bytearray(read.qualstr[:9])) < 53:
                continue
            barcode = read.seq[0:9]
            if not isinstance(barcode, str):
                # bytes on Python 3: decode before joining with the name.
                barcode = barcode.decode()
            n_read = HTSeq.SequenceWithQualities(
                read.seq[9:],
                read.name.partition(' ')[0] + '#' + barcode,
                read.qualstr[9:])
            n_read.write_to_fastq_file(outf)
def filter_n_split_read(intuple):
    """Classify one read by quality and length and format it as a FASTQ record.

    Args:
        intuple: Indexable holding ``(read, min_base_qual, mean_quality,
            max_length)``, where ``read`` is ``(sequence, name, quality
            string)`` with the sequence and quality string given as text.

    Returns:
        ``(filter_code, record)``.  ``record`` is the tuple
        ``('@' + name, sequence, '+', quality string)``.  ``filter_code`` is
        ``0`` when the read passes every filter, ``1`` when it passes both
        quality filters but is longer than ``max_length``, and ``-1`` when a
        quality filter rejects it.

    Side effects:
        Increments the module-level shared counters (objects exposing
        ``get_lock()`` and ``value``) for processed reads and for the
        rejection reason, each under its own lock.
    """
    global processed_read_count
    global low_mean_base_qual_read
    global long_read_count
    global low_base_qual_read_count

    raw_read = intuple[0]
    min_base_qual = intuple[1]
    mean_quality = intuple[2]
    max_length = intuple[3]
    sequence, name, quality_string = raw_read[0], raw_read[1], raw_read[2]

    with processed_read_count.get_lock():
        processed_read_count.value += 1

    htread = HTSeq.SequenceWithQualities(
        sequence.encode(), name, quality_string.encode())
    n_bases = len(htread.qual)
    mean_base_quality = sum(htread.qual) / float(n_bases)
    n_low_quality_bases = sum(htread.qual < min_base_qual)

    ## FASTQ entry as the four lines to print
    record = ('@' + name, sequence, '+', quality_string)

    ## -1 unless a branch below says otherwise
    filter_code = -1
    if not (n_low_quality_bases < 2):
        ## two or more bases under the per-base threshold
        with low_base_qual_read_count.get_lock():
            low_base_qual_read_count.value += 1
    elif not (mean_base_quality >= mean_quality):
        with low_mean_base_qual_read.get_lock():
            low_mean_base_qual_read.value += 1
    elif not (n_bases <= max_length):
        with long_read_count.get_lock():
            long_read_count.value += 1
        ## length-discarded read file code
        filter_code = 1
    else:
        ## reads-passing-filters file code
        filter_code = 0

    return (filter_code, record)
def trim_filter_read(input_fastq_read1, input_fastq_read2, trimmed_read2_output, tagged_read1_output, **kwargs):
    """Trim read 2 down to UMI + barcodes and keep only high-quality pairs.

    Read 1 and read 2 are iterated in lockstep.  A pair is considered only
    when bases 80-85 of read 2 equal the linker ``CATTCG``.  From read 2 the
    UMI (bases 0-9) and three barcodes (bases 10-17, 48-55 and 86-93) are
    extracted; the pair is kept when at most one UMI base and at most one
    barcode base (over all three barcodes) have quality below 20.  Kept reads
    are written to ``trimmed_read2_output`` as the concatenation
    UMI + bases 10-17 + bases 48-55 + bases 86-93, under the name of read 2.

    Args:
        input_fastq_read1: Path of the read-1 FASTQ file (only used to pair
            up records; its content is not inspected).
        input_fastq_read2: Path of the read-2 FASTQ file.
        trimmed_read2_output: Path of the FASTQ file to write (overwritten).
        tagged_read1_output: Currently unused.
        **kwargs: Currently unused.

    Returns:
        List of the 0-based pair indices that were written.
    """
    fastq_1_file = HTSeq.FastqReader(input_fastq_read1)
    fastq_2_file = HTSeq.FastqReader(input_fastq_read2)
    output_indices = []
    # 'w' replaces the former 'wa', which Python 3 rejects as an invalid
    # mode; the context manager also closes the file, which was never done.
    with open(trimmed_read2_output, 'w') as output_file:
        for index, reads in tqdm(enumerate(zip(fastq_1_file, fastq_2_file))):
            read = reads[1]
            # Compare the raw bases, not the sliced read object: the slice
            # object never compared equal to a plain string.  Both literal
            # types are listed because the bases are bytes on Python 3.
            # NOTE(review): confirm against the HTSeq version in use.
            if read.seq[80:86] not in (b'CATTCG', 'CATTCG'):
                continue
            # check for UMI/barcode quality
            read_umi = read[:10]
            bc3 = read[10:18]
            bc2 = read[48:56]
            bc1 = read[86:94]
            low_umi_bases = sum(read_umi.qual < 20)
            if low_umi_bases > 1:
                continue
            low_barcode_bases = (sum(bc3.qual < 20) + sum(bc2.qual < 20) +
                                 sum(bc1.qual < 20))
            if low_barcode_bases > 1:
                continue
            # TODO: tag read1 with the barcode + UMI?
            new_read = HTSeq.SequenceWithQualities(
                read_umi.seq + bc3.seq + bc2.seq + bc1.seq,
                read.name,
                read_umi.qualstr + bc3.qualstr + bc2.qualstr + bc1.qualstr,
                'phred')
            output_indices.append(index)
            # write_to_fastq_file is the HTSeq writer method (the former
            # write_to_fastq does not exist on SequenceWithQualities).
            new_read.write_to_fastq_file(output_file)
    return output_indices
# Extract long unaligned reads from a SAM file.
#
# Usage: <script> input.sam output.fastq
#
# Every unaligned read longer than 200 bases is written to the FASTQ file,
# and a 10-bin histogram of the lengths of all unaligned reads is saved as
# "<output.fastq>.png".
import sys
import matplotlib.pyplot as plt

if len(sys.argv) < 3:
    print("Please enter input file (.sam) and output file (.fastq)!")
    sys.exit()

input_file = sys.argv[1]
output_file = sys.argv[2]
if not (input_file.endswith(".sam") and output_file.endswith(".fastq")):
    print("Please enter input file (.sam) and output file (.fastq)!")
    sys.exit()

# Imported only after the arguments are validated, as before.
import HTSeq
import numpy as np

alignment_file = HTSeq.SAM_Reader(input_file)
len_reads = []

# The context manager closes the FASTQ output even if the SAM file turns out
# to be malformed part-way through.
with open(output_file, "w") as my_fastq_file:
    for aln in alignment_file:
        if aln.aligned:
            continue
        read_length = len(aln.read.seq)
        len_reads.append(read_length)
        if read_length > 200:
            myread = HTSeq.SequenceWithQualities(
                aln.read.seq, aln.read.name, aln.read.qualstr)
            myread.write_to_fastq_file(my_fastq_file)

# The IPython-only "%matplotlib inline" magic was removed: it is a syntax
# error when this file is run as a plain Python script.
plt.hist(len_reads, bins=10)
plt.savefig(output_file + ".png")
sam_file_name = samfile if not intersected_reads: intersected_reads = sam_file_to_dict[sam_file_name][ "idNotAlignedReads"] intersected_reads = list( set(intersected_reads).intersection( sam_file_to_dict[sam_file_name]["idNotAlignedReads"])) import HTSeq fastq_file = HTSeq.FastqReader(read_file) my_fastq_file = open( os.path.join(output_dir, extract_prefix + "_not_aligned_reads.fastq"), "w") for read in fastq_file: if any(read.name.split(" ")[0] in s for s in intersected_reads): myread = HTSeq.SequenceWithQualities(read.seq, read.name, read.qualstr) myread.write_to_fastq_file(my_fastq_file) my_fastq_file.close() if extracted_aligned: intersected_reads = [] for fqfile, samfile in sam_fasta_pairs: sam_file_name = samfile if not intersected_reads: intersected_reads = sam_file_to_dict[sam_file_name][ "idAlignedReads"] intersected_reads = list( set(intersected_reads).intersection( sam_file_to_dict[sam_file_name]["idAlignedReads"])) import HTSeq fastq_file = HTSeq.FastqReader(read_file)
"SRR2078287": "ACTCAGCAG", "SRR2078288": "ACGCAGCAG", "SRR2078289": "AGACAGCAG", "SRR2078290": "ATCCAGCAG", "SRR2078291": "ATGCAGCAG", "SRR2078292": "CTTCAGCAG" } inadapter = sample2inadapter.get(input_id) adapter = inadapter + "......" sequence_lthreshold = int(len(adapter)) + 32 sequence_qthreshold = 39 idread = 0 for seq, name, quals in fqr.reads(quality_values=True): allread = HTSeq.SequenceWithQualities(seq, name.decode("utf-8"), quals) if mean_quals(allread.qual) > sequence_qthreshold and len( seq) >= sequence_lthreshold: pass_quality += 1 pos_tag = search_adapter(adapter, seq) if len(pos_tag) == 2: has_adapter += 1 cage_tag_seq = seq[pos_tag[0]:pos_tag[1]] cage_tag_quals = quals[pos_tag[0]:pos_tag[1]] old_length = "length=" + str(len(seq)) if len(cage_tag_seq) >= 21: pass_length += 1 new_length = "length=" + str(len(cage_tag_seq)) str_name = name.decode("utf-8").replace(old_length, new_length) cutread = HTSeq.SequenceWithQualities(cage_tag_seq, str_name, cage_tag_quals)
for read in fastq: if min(read.qual[0:10]) < 20: continue else: read.write_to_fastq_file(fastq_out) if os.path.exists(output_filename): continue cmd = 'perl ../clip/CIMS/stripBarcode.pl -len 9 -format fastq' cmd += ' %s %s' % (filtered_filename, output_filename) print cmd os.system(cmd) else: cmd = 'perl ../clip/CIMS/stripBarcode.pl -len 9 -format fastq' cmd += ' %s %s' % (input_fastq, output_filename) print cmd os.system(cmd) else: fastq_out = open(output_filename, 'w') fastq = HTSeq.FastqReader(input_fastq) for read in fastq: # read is a SequenceWithQualities object. if min(read.qual[0:10]) < 20: continue else: # Trim and write out. read_out = HTSeq.SequenceWithQualities(read.seq[10:], read.name, read.qualstr[10:]) # read.seq = read.seq[10:] # read.qualstr = read.qualstr[10:] # read.qual = read.qual[10:] read_out.write_to_fastq_file(fastq_out) fastq_out.close()
def _demultiplex_file(ifile_name, barcodes, seq_ofile_handles,
                      index_ofile_handles, excluded_ofile):
    """Split one FASTQ file by barcode; return (total, accepted, barcode counts)."""
    is_gzipped = ifile_name.endswith(".gz")
    ifile = gzip.GzipFile(ifile_name, "r") if is_gzipped else ifile_name

    total_read_cnt = 0
    accepted_read_cnt = 0
    all_bcs = {}
    try:
        for r in HTSeq.FastqReader(ifile):
            total_read_cnt += 1
            # Bases 0-7 are the UMI, bases 8-15 the sample barcode.
            bc = r.seq[8:16]
            all_bcs[bc] = all_bcs.get(bc, 0) + 1
            if bc in barcodes:
                accepted_read_cnt += 1
                new_genomic_item = HTSeq.SequenceWithQualities(
                    r.seq[16:], r.name, r.qualstr[16:])
                new_index_item = HTSeq.SequenceWithQualities(
                    r.seq[0:16], r.name, r.qualstr[0:16])
                new_genomic_item.write_to_fastq_file(seq_ofile_handles[bc])
                new_index_item.write_to_fastq_file(index_ofile_handles[bc])
            else:
                r.write_to_fastq_file(excluded_ofile)
    finally:
        # Only the gzip handle opened here needs closing; a plain path is
        # opened and closed by the reader itself.
        if is_gzipped:
            ifile.close()
    return total_read_cnt, accepted_read_cnt, all_bcs


def _report_barcode_stats(total_read_cnt, accepted_read_cnt, all_bcs, barcodes):
    """Print assignment totals and the 100 most frequent observed barcodes."""
    undetermined_cnt = total_read_cnt - accepted_read_cnt
    if total_read_cnt:
        accepted_pct = round(float(accepted_read_cnt) / total_read_cnt * 100, 2)
        undetermined_pct = round(
            float(undetermined_cnt) / total_read_cnt * 100, 2)
    else:
        # An empty input used to raise ZeroDivisionError here.
        accepted_pct = undetermined_pct = 0.0
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("\nTotal reads seen: %d\tAssigned to known barcodes: %d (%g %%)\tUndetermined: %d (%g %%)" % (
        total_read_cnt, accepted_read_cnt, accepted_pct,
        undetermined_cnt, undetermined_pct))
    print("\nTop 100 observed barcodes with frequencies:")
    for k, v in sorted(all_bcs.items(), key=itemgetter(1),
                       reverse=True)[0:100]:
        found = "?" if k not in barcodes else barcodes[k]
        print("%s\t%d\t%s" % (k, v, found))


def extract_bc_umi_seq(input_files, barcodes):
    """Demultiplex FASTQ files by the barcode found at bases 8-15 of each read.

    For every barcode in ``barcodes`` two gzip files are created in the
    current directory: ``<sample>_genomic.fastq.gz`` receives each matching
    read with its first 16 bases removed, and ``<sample>_BC_UMI.fastq.gz``
    receives those first 16 bases.  Reads whose barcode is not in
    ``barcodes`` are written unchanged to ``undetermined.fastq.gz``.
    After each input file a summary and the 100 most frequent observed
    barcodes are printed.

    Args:
        input_files: Iterable of FASTQ paths; names ending in ``.gz`` are
            opened with gzip.
        barcodes: Mapping from barcode sequence to sample name.

    Returns:
        None.
    """
    seq_ofile_handles = {}
    index_ofile_handles = {}
    excluded_ofile = None
    try:
        for bc in barcodes.keys():
            sample = barcodes[bc]
            seq_ofile_handles[bc] = gzip.GzipFile(
                "%s_genomic.fastq.gz" % sample, "w")
            index_ofile_handles[bc] = gzip.GzipFile(
                "%s_BC_UMI.fastq.gz" % sample, "w")
        excluded_ofile = gzip.GzipFile("undetermined.fastq.gz", "w")

        for ifile_name in input_files:
            total_read_cnt, accepted_read_cnt, all_bcs = _demultiplex_file(
                ifile_name, barcodes, seq_ofile_handles,
                index_ofile_handles, excluded_ofile)
            _report_barcode_stats(
                total_read_cnt, accepted_read_cnt, all_bcs, barcodes)
    finally:
        # Close every output that was opened, even if an error interrupted
        # processing, so the gzip trailers are written.
        for handle in seq_ofile_handles.values():
            handle.close()
        for handle in index_ofile_handles.values():
            handle.close()
        if excluded_ofile is not None:
            excluded_ofile.close()