def calculate_total_num_mismatches(self):
    """Compute ``self.total_num_edit_bases`` from the alignment CIGAR.

    ``self.cigar`` is split by ``tk.analysis_cigar_string`` into two parallel
    lists (operations and their lengths).  The lengths of mismatch ('X'),
    insertion ('I') and deletion ('D') operations are summed; match ('=')
    and 'S' (presumably soft-clip) operations add nothing.  The target bases
    left uncovered before ``self.target_start`` and after ``self.target_end``
    are then added as well.

    On inconsistent list lengths or an unsupported operation an error is
    printed with ``tk.eprint`` and the program exits with status 1.
    """
    cigar_opr_list, cigar_opr_len_list = tk.analysis_cigar_string(self.cigar)
    if len(cigar_opr_list) != len(cigar_opr_len_list):
        tk.eprint('ERROR: len(cigar_opr_list) != len(cigar_opr_len_list)')
        # A bare sys.exit() would report success (status 0) to the caller.
        sys.exit(1)

    self.total_num_edit_bases = 0
    for cigar_opr, cigar_opr_len in zip(cigar_opr_list, cigar_opr_len_list):
        if cigar_opr == '=' or cigar_opr == 'S':
            # match / clipped bases: no edit
            continue
        elif cigar_opr == 'X' or cigar_opr == 'I' or cigar_opr == 'D':
            # mismatch, insertion and deletion all count with full length
            self.total_num_edit_bases += cigar_opr_len
        else:
            tk.eprint('ERROR: unsupported cigar operation: %s' % cigar_opr)
            sys.exit(1)

    # Target bases not covered by the alignment count as edits too.
    self.total_num_edit_bases += self.target_start
    self.total_num_edit_bases += self.target_len - self.target_end
    return
def init_from_file(self, barcode_fasta_file, amplicon_seq_fasta_file,
                   barcode_side):
    """Populate this object from a barcode FASTA and an amplicon FASTA.

    Args:
        barcode_fasta_file: FASTA file read with ``tk.read_fasta_file``;
            its names and sequences are stored in ``self.barcode_name_list``
            and ``self.barcode_seq_list``.
        amplicon_seq_fasta_file: FASTA file that must contain exactly one
            sequence (stored in ``self.amplicon_name`` / ``self.amplicon_seq``).
        barcode_side: ``'fwd'`` or ``'rev'``.  For ``'fwd'`` the first
            ``self.anchor_seq_len`` bases of the amplicon become
            ``self.downstream_seq``; for ``'rev'`` it is ``tk.rev_comp`` of
            the last ``self.anchor_seq_len`` bases.

    Prints an error and exits with status 1 when the amplicon file does not
    hold exactly one sequence or when ``barcode_side`` is not recognised.
    """
    self.barcode_fasta_file = barcode_fasta_file
    self.amplicon_seq_fasta_file = amplicon_seq_fasta_file
    self.side = barcode_side
    self.barcode_name_list, self.barcode_seq_list = tk.read_fasta_file(
        barcode_fasta_file)
    fasta_name_list, fasta_seq_list = tk.read_fasta_file(
        amplicon_seq_fasta_file)

    if len(fasta_name_list) > 1:
        tk.eprint(
            'ERROR: There are more than 1 sequence in the amp_seq_fasta file: %s'
            % amplicon_seq_fasta_file)
        sys.exit(1)
    if len(fasta_name_list) != 1 or len(fasta_seq_list) != 1:
        tk.eprint(
            'ERROR: No sequence was found in the amp_seq_fasta file: %s' %
            amplicon_seq_fasta_file)
        sys.exit(1)

    self.amplicon_name = fasta_name_list[0]
    self.amplicon_seq = fasta_seq_list[0]

    if barcode_side == 'fwd':
        self.downstream_seq = self.amplicon_seq[0:self.anchor_seq_len]
    elif barcode_side == 'rev':
        self.downstream_seq = tk.rev_comp(
            self.amplicon_seq[-self.anchor_seq_len:])
    else:
        # Previously an unknown side silently left downstream_seq untouched.
        tk.eprint("ERROR: barcode_side should be 'fwd' or 'rev', got: %s" %
                  barcode_side)
        sys.exit(1)
    return
def extract_fastq_tail_seq(in_fastq_file, read_tail_length, min_read_length,
                           max_read_length, left_tail_fastq_file,
                           right_tail_fastq_file):
    """Write the two ends of every read to separate FASTQ files.

    Records are read four lines at a time from ``in_fastq_file`` (opened with
    ``tk.gzopen``).  Reads whose sequence length is outside
    ``[min_read_length, max_read_length]`` are skipped and counted.  For every
    other read the first ``read_tail_length`` bases/qualities go to
    ``left_tail_fastq_file``; the last ``read_tail_length`` bases are
    reverse-complemented with ``tk.rev_comp`` (qualities reversed) and go to
    ``right_tail_fastq_file``.  Header and separator lines are copied as-is.

    Reading stops at the first incomplete record.  All three files are closed
    even if an exception is raised while processing.
    """
    num_skipped_reads = 0
    num_processd_reads = 0
    in_fastq_fp = tk.gzopen(in_fastq_file)
    try:
        with open(left_tail_fastq_file, 'w') as fq_left_tail_fp, \
                open(right_tail_fastq_file, 'w') as fq_right_tail_fp:
            while True:
                line1 = in_fastq_fp.readline()
                line2 = in_fastq_fp.readline()
                line3 = in_fastq_fp.readline()
                line4 = in_fastq_fp.readline()
                # End of file or truncated final record.
                if not (line1 and line2 and line3 and line4):
                    break

                read_seq = line2.strip()
                if len(read_seq) < min_read_length or len(
                        read_seq) > max_read_length:
                    num_skipped_reads += 1
                    continue
                read_qual = line4.strip()

                left_tail_seq = read_seq[0:read_tail_length]
                left_tail_qual = read_qual[0:read_tail_length]
                # The right tail is flipped so that it reads in the same
                # orientation as a left tail.
                right_tail_seq = tk.rev_comp(read_seq[-read_tail_length:])
                right_tail_qual = read_qual[-read_tail_length:][::-1]

                fq_left_tail_fp.write(line1)
                fq_left_tail_fp.write(left_tail_seq + '\n')
                fq_left_tail_fp.write(line3)
                fq_left_tail_fp.write(left_tail_qual + '\n')

                fq_right_tail_fp.write(line1)
                fq_right_tail_fp.write(right_tail_seq + '\n')
                fq_right_tail_fp.write(line3)
                fq_right_tail_fp.write(right_tail_qual + '\n')

                num_processd_reads += 1
                if num_processd_reads % 100000 == 0:
                    tk.eprint('processed %d reads' % num_processd_reads)
    finally:
        # tk.gzopen's return type is not visible here, so close explicitly
        # instead of relying on context-manager support.
        in_fastq_fp.close()

    tk.eprint(
        'NOTICE: finished extracting tail sequences from fastq. number of skipped reads = %d'
        % num_skipped_reads)
    return
def extract_confident_reads_from_sam(in_sam_file, barcode_length,
                                     barcode_plus_seq_to_barcode_idx_dict,
                                     read_barcode_idx_dict):
    """Record the barcode index of confidently aligned reads from a SAM file.

    Args:
        in_sam_file: path of the SAM file to scan.
        barcode_length: alignments whose position column is
            ``>= barcode_length`` are ignored.
        barcode_plus_seq_to_barcode_idx_dict: maps the SAM reference name to
            a barcode index.
        read_barcode_idx_dict: updated in place with
            ``readname -> barcode index``; a later qualifying alignment of
            the same read overwrites an earlier one.

    Header lines ('@') are skipped.  Alignments are dropped when they have
    fewer than 6 columns, carry flag bit 4 (counted as unmapped), carry any of
    the flag bits 256/1024/2048 (secondary / duplicate / supplementary in the
    SAM flag definitions), have MAPQ below 20, or name an unknown reference.
    A statistics line is printed with ``tk.eprint`` at the end.
    """
    min_mapq = 20
    skipped_flag_mask = 256 | 1024 | 2048
    num_error_alignments = 0  # counted for parity with the original; not reported
    num_aligned_reads = 0
    num_unmapped_reads = 0

    # 'with' guarantees the handle is released if a malformed field raises.
    with open(in_sam_file, 'r') as in_sam_fp:
        for line in in_sam_fp:
            if line.startswith('@'):
                continue
            col_list = line.strip().split('\t')
            if len(col_list) < 6:
                num_error_alignments += 1
                continue
            readname, flag, contig, left_pos, mapq = col_list[0:5]

            flag = int(flag)
            if flag & 4:
                num_unmapped_reads += 1
                continue
            if flag & skipped_flag_mask:
                continue
            num_aligned_reads += 1

            if int(mapq) < min_mapq:
                continue
            if int(left_pos) >= barcode_length:
                continue

            if contig not in barcode_plus_seq_to_barcode_idx_dict:
                tk.eprint('ERROR!! unknown template name in sam: %s' % contig)
                num_error_alignments += 1
                continue
            read_barcode_idx_dict[
                readname] = barcode_plus_seq_to_barcode_idx_dict[contig]

    tk.eprint(
        'STATISTICS: sam_file = %s, num_aligned_reads = %d, num_unmapped_reads = %d, num_of_confident_reads = %d'
        % (in_sam_file, num_aligned_reads, num_unmapped_reads,
           len(read_barcode_idx_dict)))
    return
def preprocessing_input_files(in_fq, in_fq_list, tmp_out_prefix):
    """Collect the input FASTQ path(s) and return one preprocessed FASTQ file.

    ``in_fq`` (a single path) is made absolute when non-empty.  When
    ``in_fq_list`` is non-empty the paths read from it with
    ``tk.read_list_file`` replace that list.  The inputs are handed to
    ``tk.split_fastq`` with a split count of 1 and the first resulting file
    name is returned.
    """
    tk.eprint('NOTICE: preprocessing the input fastq file')

    if in_fq != '':
        source_fastq_files = [os.path.abspath(in_fq)]
    else:
        source_fastq_files = []
    if in_fq_list != '':
        source_fastq_files = tk.read_list_file(in_fq_list, abspath=True)

    # Original note on tk.split_fastq: "1. split 2. remove duplicates".
    split_files = tk.split_fastq(source_fastq_files, 1, tmp_out_prefix)
    return split_files[0]
def demultiplex1barcode(thread_id, in_fastq_file_list, barcode_info, minimap2,
                        tmp_out_prefix):
    """Demultiplex the FASTQ chunk assigned to one worker.

    Args:
        thread_id: index of this worker; selects the input file from
            ``in_fastq_file_list`` and is appended to ``tmp_out_prefix``.
        in_fastq_file_list: per-worker FASTQ files.
        barcode_info: object providing ``upstream_seq``, ``downstream_seq``,
            ``anchor_seq_len`` and ``barcode_list``.
        minimap2: minimap2 executable, passed through to the aligners.
        tmp_out_prefix: prefix for all temporary files of this run.

    The read tails are extracted, aligned to the upstream anchor, and the
    average number of anchor alignments in the LEFT tail file decides which
    demultiplexing method is used.
    """
    in_fastq_file = in_fastq_file_list[thread_id]
    # Tail length: upstream seq + first barcode + (at most anchor_seq_len of
    # the downstream seq), enlarged by 50% below.
    read_tail_length = len(barcode_info.upstream_seq +
                           barcode_info.barcode_list[0]) + min(
                               barcode_info.anchor_seq_len,
                               len(barcode_info.downstream_seq))
    read_tail_length = int(read_tail_length * 1.5)
    tmp_out_prefix += '.thread%d' % thread_id
    left_tail_fastq_file = tmp_out_prefix + '.left%dbp_tail.fastq' % (
        read_tail_length)
    right_tail_fastq_file = tmp_out_prefix + '.right%dbp_tail.fastq' % (
        read_tail_length)
    tk.eprint('NOTICE: (process %d) extracting tails from fastq reads' %
              thread_id)
    tk.extract_fastq_tail_seq(in_fastq_file, read_tail_length,
                              left_tail_fastq_file, right_tail_fastq_file)
    tk.eprint('NOTICE: (process %d) locating anchors' % thread_id)
    left_tail_upstream_anchor_paf_file = align_reads_to_anchors(
        thread_id, minimap2, 1, barcode_info, left_tail_fastq_file,
        tmp_out_prefix)
    right_tail_upstream_anchor_paf_file = align_reads_to_anchors(
        thread_id, minimap2, 1, barcode_info, right_tail_fastq_file,
        tmp_out_prefix)
    # Only the left-tail PAF is used to judge whether the anchor is unique.
    upstream_anchor_avg_alignments = count_average_num_alignments(
        left_tail_upstream_anchor_paf_file)
    anchor_loc = 'none'
    # The anchor is used only when it aligns (almost) uniquely and is longer
    # than 4 bases.
    if upstream_anchor_avg_alignments < 1.5 and len(
            barcode_info.upstream_seq) > 4:
        anchor_loc = 'upstream'
    if anchor_loc == 'upstream':
        demultiplex1barcode_method2(thread_id, left_tail_fastq_file,
                                    right_tail_fastq_file, minimap2,
                                    barcode_info,
                                    left_tail_upstream_anchor_paf_file,
                                    right_tail_upstream_anchor_paf_file,
                                    anchor_loc, tmp_out_prefix)
    else:
        # NOTE(review): the source was whitespace-collapsed; both warnings are
        # assumed to belong to this inner `if` -- confirm against the
        # original file.
        if upstream_anchor_avg_alignments > 1.5 and len(
                barcode_info.upstream_seq) > 0:
            tk.eprint(
                'WARNING: The UPSTREAM_SEQ (%s) have multiple alignments in reads! Try to supply a longer sequence!'
                % barcode_info.upstream_seq)
            tk.eprint(
                'WARNING: AmpRepeat will try to demultiplex the reads without unique anchor sequence'
            )
        demultiplex1barcode_method1(thread_id, left_tail_fastq_file,
                                    right_tail_fastq_file, minimap2,
                                    barcode_info, tmp_out_prefix)
    return
def calculate_barcode_mismatch(self, barcode_start_pos, barcode_end_pos):
    """Count the edits that fall inside the barcode region of the target.

    Walks the CIGAR in ``self.cigar`` (parsed by ``tk.analysis_cigar_string``)
    starting at ``self.target_start`` and adds to ``self.num_mismatch``,
    ``self.num_ins`` and ``self.num_del``:

    * 'X' / 'D': the overlap (``tk.compute_overlap_len``) between the
      operation and ``[barcode_start_pos, barcode_end_pos)``;
    * 'I': the full insertion length when the current target position lies
      strictly between ``barcode_start_pos`` and ``barcode_end_pos - 1``;
    * '=' only advances the target position; 'S' is ignored.

    Barcode bases not covered by the alignment on either side are added to
    ``self.num_mismatch``.  Finally ``self.num_edit_bases`` is set and
    ``self.calculate_total_num_mismatches()`` is called.

    The counters are incremented, not reset; they are presumably initialised
    by the class constructor -- TODO confirm.

    Prints an error and exits with status 1 on inconsistent CIGAR lists or an
    unsupported operation.
    """
    cigar_opr_list, cigar_opr_len_list = tk.analysis_cigar_string(self.cigar)
    if len(cigar_opr_list) != len(cigar_opr_len_list):
        tk.eprint('ERROR: len(cigar_opr_list) != len(cigar_opr_len_list)')
        # A bare sys.exit() would report success (status 0).
        sys.exit(1)

    current_ref_pos = self.target_start
    for cigar_opr, cigar_opr_len in zip(cigar_opr_list, cigar_opr_len_list):
        if cigar_opr == '=':  # match
            current_ref_pos += cigar_opr_len
        elif cigar_opr == 'X':  # mismatch
            overlap_len = tk.compute_overlap_len(
                current_ref_pos, current_ref_pos + cigar_opr_len,
                barcode_start_pos, barcode_end_pos)
            if overlap_len > 0:
                self.num_mismatch += overlap_len
            current_ref_pos += cigar_opr_len
        elif cigar_opr == 'I':  # insertion: consumes no target bases
            if barcode_start_pos < current_ref_pos < barcode_end_pos - 1:
                self.num_ins += cigar_opr_len
        elif cigar_opr == 'D':  # deletion
            overlap_len = tk.compute_overlap_len(
                current_ref_pos, current_ref_pos + cigar_opr_len,
                barcode_start_pos, barcode_end_pos)
            if overlap_len > 0:
                self.num_del += overlap_len
            current_ref_pos += cigar_opr_len
        elif cigar_opr == 'S':
            continue
        else:
            tk.eprint('ERROR: unsupported cigar operation: %s' % cigar_opr)
            sys.exit(1)

    # Barcode bases outside the aligned interval count as mismatches.
    if self.target_end < barcode_end_pos:
        self.num_mismatch += barcode_end_pos - self.target_end
    if self.target_start > barcode_start_pos:
        self.num_mismatch += self.target_start - barcode_start_pos

    self.num_edit_bases = self.num_ins + self.num_del + self.num_mismatch
    self.calculate_total_num_mismatches()
    return
def analysis_of_anchor_paf(anchor_paf_file, barcode_len, anchor_loc,
                           flank_len):
    """Derive the candidate barcode interval of each read from anchor hits.

    Args:
        anchor_paf_file: PAF file of anchor alignments (>= 12 columns/line).
        barcode_len: length of the barcode.
        anchor_loc: ``'upstream'`` or ``'downstream'`` -- where the anchor
            sits relative to the barcode.
        flank_len: number of extra bases kept on each side of the barcode.

    Returns:
        dict mapping read name to ``(barcode_start, barcode_end)`` in read
        coordinates, clamped to ``[0, read_len]``.  Only the first accepted
        alignment of each read is used.

    For an upstream anchor, alignments ending more than 2 bases before the
    anchor end are skipped; for a downstream anchor, alignments starting
    after anchor position 2 are skipped.  The unaligned anchor bases next to
    the barcode widen the interval accordingly.

    Raises:
        ValueError: if ``anchor_loc`` is neither value and the file has at
            least one line.
    Exits with status 1 if a line has fewer than 12 columns.
    """
    barcode_position_dict = dict()
    with open(anchor_paf_file, 'r') as anchor_paf_fp:
        for line in anchor_paf_fp:
            col_list = line.strip().split('\t')
            if len(col_list) < 12:
                tk.eprint(
                    'ERROR: There should be at least 12 columns in the PAF file: %s'
                    % anchor_paf_file)
                sys.exit(1)

            readname = col_list[0]
            read_len = int(col_list[1])
            read_start = int(col_list[2])
            read_end = int(col_list[3])
            target_len = int(col_list[6])
            target_start = int(col_list[7])
            target_end = int(col_list[8])

            if anchor_loc == 'upstream':
                # The anchor must be aligned (almost) up to its last base.
                if target_end < target_len - 2:
                    continue
                barcode_start = read_end - flank_len
                barcode_end = (barcode_start + barcode_len + flank_len * 2 +
                               target_len - target_end)
            elif anchor_loc == 'downstream':
                # The anchor must be aligned (almost) from its first base.
                if target_start > 2:
                    continue
                barcode_end = read_start + flank_len
                barcode_start = (barcode_end - barcode_len - flank_len * 2 -
                                 target_start)
            else:
                # Was an UnboundLocalError on barcode_start before.
                raise ValueError(
                    "anchor_loc should be 'upstream' or 'downstream', got: %r"
                    % (anchor_loc, ))

            barcode_start = max(barcode_start, 0)
            barcode_end = min(barcode_end, read_len)
            if readname not in barcode_position_dict:
                barcode_position_dict[readname] = (barcode_start, barcode_end)
    return barcode_position_dict
def read_barcode_list_file(self):
    """Load the unique barcodes of ``self.barcode_list_file``.

    Lines starting with '>' are skipped, as are blank lines (which used to
    raise ``IndexError``).  The first whitespace-separated token of every
    other line is taken as a barcode.  Duplicates are removed while keeping
    first-seen order, so ``self.barcode_list`` is the same on every run.

    Prints an error and exits with status 1 when no barcode is found.
    """
    tk.eprint('NOTICE: reading barcodes from BARCODE_LIST file: %s' %
              (self.barcode_list_file))
    self.barcode_list = list()
    with open(self.barcode_list_file, 'r') as barcode_fp:
        for line in barcode_fp:
            if line.startswith('>'):
                continue
            fields = line.split()
            if not fields:
                # blank or whitespace-only line
                continue
            self.barcode_list.append(fields[0])

    if len(self.barcode_list) == 0:
        tk.eprint(
            'ERROR: No barcodes were found in the BARCODE_LIST file: %s' %
            self.barcode_list_file)
        sys.exit(1)

    # Order-preserving de-duplication (set() gave an arbitrary order).
    self.barcode_list = list(dict.fromkeys(self.barcode_list))
    tk.eprint(
        'NOTICE: %d barcodes were found in the BARCODE_LIST file: %s' %
        (len(self.barcode_list), self.barcode_list_file))
    return
def main():
    """Validate the command-line arguments and run ``AmpliconBinner_10X``.

    Checks that ``num_threads`` is positive and that exactly one of
    ``in_fq`` / ``in_fq_list`` is given.  A non-default ``minimap2`` path and
    the barcode list are checked with ``tk.check_input_file_exists``; these
    paths and ``out_prefix`` are converted to absolute paths.  Every
    validation failure exits with status 1.
    """
    input_args = parse_user_arguments()

    if input_args.num_threads < 1:
        tk.eprint('ERROR: --num_threads should be a positive number.')
        sys.exit(1)

    if input_args.in_fq == '' and input_args.in_fq_list == '':
        tk.eprint(
            'ERROR! No input file! Both --in_fq and in_fq_list were not supplied. '
        )
        sys.exit(1)

    if input_args.in_fq != '' and input_args.in_fq_list != '':
        tk.eprint(
            'ERROR! --in_fq and --in_fq_list should not be supplied at the same time.'
        )
        sys.exit(1)

    # The bare name 'minimap2' is left as-is (presumably resolved via PATH).
    if input_args.minimap2 != 'minimap2':
        tk.check_input_file_exists(input_args.minimap2)
        input_args.minimap2 = os.path.abspath(input_args.minimap2)

    tk.check_input_file_exists(input_args.barcode_list)
    # Fixed: this was `==`, a comparison whose result was discarded, so the
    # barcode list path was never made absolute.
    input_args.barcode_list = os.path.abspath(input_args.barcode_list)
    input_args.out_prefix = os.path.abspath(input_args.out_prefix)

    AmpliconBinner_10X(input_args)
def demultiplex1barcode_method2(thread_id, left_tail_fastq_file,
                                right_tail_fastq_file, minimap2, barcode_info,
                                left_tail_anchor_paf_file,
                                right_tail_anchor_paf_file, anchor_loc,
                                tmp_out_prefix):
    """Demultiplex read tails using the anchor alignments.

    Steps, each done for the left and then the right tail file:
      1. turn the anchor PAF into per-read barcode intervals
         (``analysis_of_anchor_paf``);
      2. cut those intervals out of the tail FASTQ
         (``extract_region_from_fastq``);
      3. align the cut-outs to the barcode templates with minimap2;
    then assign reads to barcodes and write the summary with
    ``output_summary``.
    """
    tk.eprint(
        'NOTICE: (process %d) demultiplexing using with anchor sequences' %
        thread_id)

    flank_len = 4
    barcode_len = len(barcode_info.barcode_list[0])

    # 1. candidate barcode intervals per read
    left_positions = analysis_of_anchor_paf(left_tail_anchor_paf_file,
                                            barcode_len, anchor_loc,
                                            flank_len)
    right_positions = analysis_of_anchor_paf(right_tail_anchor_paf_file,
                                             barcode_len, anchor_loc,
                                             flank_len)

    # 2. extract the candidate regions
    left_candidate_fastq = tmp_out_prefix + '.left_tail_barcode_candidate.fastq'
    right_candidate_fastq = tmp_out_prefix + '.right_tail_barcode_candidate.fastq'
    extract_region_from_fastq(left_tail_fastq_file, left_positions,
                              left_candidate_fastq)
    extract_region_from_fastq(right_tail_fastq_file, right_positions,
                              right_candidate_fastq)

    # 3. align candidates against the barcode templates
    template_fasta = tmp_out_prefix + '.barcode_with_anchor.fasta'
    generate_barcode_template_file(barcode_info, template_fasta, anchor_loc,
                                   flank_len)

    left_compare_paf = tmp_out_prefix + '.left_tail_barcode_compare.paf'
    right_compare_paf = tmp_out_prefix + '.right_tail_barcode_compare.paf'
    tk.eprint('NOTICE: (process %d) aligning barcodes' % thread_id)
    minimap2_options = ' -t 1 --for-only --eqx -c --cs -N 200 -k 5 -w 3 -n 1 -m 10 -s 40 -A 4 -x map-ont '
    for candidate_fastq, compare_paf in ((left_candidate_fastq,
                                          left_compare_paf),
                                         (right_candidate_fastq,
                                          right_compare_paf)):
        tk.minimap2_align(candidate_fastq, template_fasta, minimap2,
                          minimap2_options, compare_paf)

    tk.eprint('NOTICE: (process %d) assigning reads to barcodes' % thread_id)
    # Offset of the barcode inside the template; left undefined for any other
    # anchor_loc, exactly as before.
    if anchor_loc == 'upstream':
        barcode_start_pos = flank_len
    elif anchor_loc == 'downstream':
        barcode_start_pos = 0

    assignments = assign_reads_to_barcodes(thread_id, barcode_start_pos,
                                           barcode_start_pos + barcode_len,
                                           left_compare_paf,
                                           right_compare_paf)
    output_summary(barcode_info, assignments, tmp_out_prefix)
    return
def main():
    """Validate the command-line arguments and run ``AmpliconBinner``.

    Requires a positive ``num_threads``, exactly one of ``in_fq`` /
    ``in_fq_list``, at least one of the fwd/rev barcode FASTA files, and an
    ``exp_name`` without path separators.  A non-default ``minimap2`` path
    and ``out_dir`` are converted to absolute paths.  Every validation
    failure exits with status 1 (a bare ``sys.exit()`` reported success).
    """
    input_args = parse_user_arguments()

    if input_args.num_threads < 1:
        tk.eprint('ERROR: `--num_threads` should be a positive number.')
        sys.exit(1)

    if input_args.in_fq == '' and input_args.in_fq_list == '':
        tk.eprint(
            'ERROR! No input file! Both `--in_fq` and in_fq_list were not supplied. '
        )
        sys.exit(1)

    if input_args.in_fq != '' and input_args.in_fq_list != '':
        tk.eprint(
            'ERROR! `--in_fq` and `--in_fq_list` should not be supplied at the same time.'
        )
        sys.exit(1)

    if input_args.fwd_barcode_fasta == '' and input_args.rev_barcode_fasta == '':
        tk.eprint(
            'ERROR! Both `--fwd_barcode_fasta` and `--rev_barcode_fasta` are not supplied.'
        )
        sys.exit(1)

    if input_args.minimap2 != 'minimap2':
        input_args.minimap2 = os.path.abspath(input_args.minimap2)

    input_args.out_dir = os.path.abspath(input_args.out_dir)

    # exp_name is joined onto out_dir to build file names, so it must not
    # contain a path separator.
    if '/' in input_args.exp_name or '\\' in input_args.exp_name:
        tk.eprint(
            '''ERROR! `--exp_name` should not have special characters such as '/' or '\\'.'''
        )
        sys.exit(1)

    AmpliconBinner(input_args)
    return
def demultiplex1barcode_method1(thread_id, left_tail_fastq_file,
                                right_tail_fastq_file, minimap2, barcode_info,
                                tmp_out_prefix):
    """Demultiplex read tails without relying on a unique anchor.

    The whole tails are aligned to templates generated by
    ``generate_barcode_template_file`` (anchor location 'none', flank 0).
    The minimap2 seeding options depend on the template length
    (upstream seq + barcode + downstream seq): one set below 25 bases,
    another below 50 bases, and none otherwise.  Reads are then
    assigned to barcodes and summarised with ``output_summary``.
    """
    template_fasta = tmp_out_prefix + '.barcode_with_anchor.fasta'
    left_compare_paf = tmp_out_prefix + '.left_tail_barcode_compare.paf'
    right_compare_paf = tmp_out_prefix + '.right_tail_barcode_compare.paf'

    barcode_len = len(barcode_info.barcode_list[0])
    generate_barcode_template_file(barcode_info, template_fasta, 'none', 0)

    target_seq_len = (len(barcode_info.upstream_seq) + barcode_len +
                      len(barcode_info.downstream_seq))

    # Length-dependent seeding options, followed by the shared options.
    if target_seq_len < 25:
        seeding_para = ' -k 3 -w 2 -n 1 -m 10 -s 40 '
    elif target_seq_len < 50:
        seeding_para = ' -k 5 -w 3 -n 1 -m 10 -s 40 '
    else:
        seeding_para = ' '
    para = seeding_para + ' -x map-ont -t 1 --for-only --eqx -c --cs -N 200 -K 1M '

    tk.eprint('NOTICE: aligning reads to barcodes with anchors')
    for tail_fastq, compare_paf in ((left_tail_fastq_file, left_compare_paf),
                                    (right_tail_fastq_file,
                                     right_compare_paf)):
        tk.minimap2_align(tail_fastq, template_fasta, minimap2, para,
                          compare_paf)

    tk.eprint('NOTICE: (process %d) assigning reads to barcodes' % thread_id)
    # In these templates the barcode starts right after the upstream seq.
    barcode_start_pos = len(barcode_info.upstream_seq)
    tk.eprint('DEBUG: barcode_start_pos = %d' % barcode_start_pos)

    assignments = assign_reads_to_barcodes(thread_id, barcode_start_pos,
                                           barcode_start_pos + barcode_len,
                                           left_compare_paf,
                                           right_compare_paf)
    output_summary(barcode_info, assignments, tmp_out_prefix)
    return
def AmpliconBinner_10X(input_args):
    """Run the multi-process demultiplexing pipeline.

    The input FASTQ file(s) are split into ``num_threads`` chunks with
    ``tk.split_fastq``; one ``Process`` per chunk runs
    ``demultiplex1barcode``.  The per-process results are merged with
    ``merge_thread_summary_file`` and all files starting with
    ``<out_prefix>.tmp`` are removed.

    Exits with status 1 if a worker process ends with a non-zero exit code
    or if a temporary file cannot be removed.
    """
    import glob  # local import: only needed for the cleanup below

    tmp_out_prefix = input_args.out_prefix + '.tmp'
    num_threads = input_args.num_threads
    minimap2 = input_args.minimap2

    barcode_info = BarcodeInfo()
    barcode_info.init(input_args.barcode_list,
                      input_args.barcode_upstream_seq)

    raw_input_fq_list = list()
    if input_args.in_fq != '':
        input_args.in_fq = os.path.abspath(input_args.in_fq)
        raw_input_fq_list.append(input_args.in_fq)
    if input_args.in_fq_list != '':
        raw_input_fq_list = tk.read_list_file(input_args.in_fq_list,
                                              abspath=True)

    tk.eprint('NOTICE: preprocessing the input fastq file')
    # Original note on tk.split_fastq: "1. split 2. remove duplicates".
    fastq_file_list = tk.split_fastq(raw_input_fq_list, num_threads,
                                     tmp_out_prefix)

    process_list = [
        Process(target=demultiplex1barcode,
                args=(i, fastq_file_list, barcode_info, minimap2,
                      tmp_out_prefix)) for i in range(num_threads)
    ]
    for p in process_list:
        p.start()
    for p in process_list:
        p.join()

    # A crashed worker used to go unnoticed until the merge step failed on
    # its missing output files.
    failed_id_list = [
        str(i) for i, p in enumerate(process_list) if p.exitcode != 0
    ]
    if failed_id_list:
        tk.eprint('ERROR: demultiplexing failed in process(es): %s' %
                  ', '.join(failed_id_list))
        sys.exit(1)

    merge_thread_summary_file(num_threads, input_args.out_prefix)

    # Remove temporary files without a shell, so that prefixes containing
    # spaces or shell metacharacters are handled correctly.
    for tmp_file in glob.glob(glob.escape(tmp_out_prefix) + '*'):
        try:
            os.remove(tmp_file)
        except OSError as err:
            tk.eprint('ERROR: Failed to remove temporary file: %s (%s)' %
                      (tmp_file, err))
            sys.exit(1)
    return
def _concatenate_files_with_header(header, in_file_list, out_file):
    """Write ``header`` followed by the raw bytes of every input file."""
    import shutil  # local import keeps this block self-contained

    with open(out_file, 'wb') as out_fp:
        out_fp.write(header.encode('utf-8'))
        for in_file in in_file_list:
            with open(in_file, 'rb') as in_fp:
                shutil.copyfileobj(in_fp, out_fp)


def merge_thread_summary_file(num_threads, out_prefix):
    """Merge the per-process result files into the final output files.

    For every process ``i`` in ``range(num_threads)`` the files
    ``<out_prefix>.tmp.thread<i>.demultiplexing.reads.txt``,
    ``...demultiplexing.statistics.txt`` and ``...all_reads.txt`` are read.

    Outputs:
        ``<out_prefix>.demultiplexing.PASS.reads.txt`` and
        ``<out_prefix>.all_reads.txt``: a header line followed by the
        concatenated per-process files.
        ``<out_prefix>.demultiplexing.statistics.txt``: per-barcode read
        counts summed over all processes, sorted by count (descending).

    The all-reads file used to lose its header because the shell command
    redirected with ``>`` instead of ``>>``; both files now keep it.  Files
    are concatenated in Python rather than through ``cat``, so paths with
    spaces work.  Blank and '#' lines in the statistics files are skipped.

    Exits with status 1 if a per-process file cannot be read.
    """
    tmp_out_prefix = out_prefix + '.tmp'
    thread_prefix_list = [
        tmp_out_prefix + '.thread%d' % i for i in range(0, num_threads)
    ]
    out_file_list = [
        p + '.demultiplexing.reads.txt' for p in thread_prefix_list
    ]
    out_stat_file_list = [
        p + '.demultiplexing.statistics.txt' for p in thread_prefix_list
    ]
    out_all_read_barcode_file_list = [
        p + '.all_reads.txt' for p in thread_prefix_list
    ]

    final_out_file = out_prefix + '.demultiplexing.PASS.reads.txt'
    final_out_stat_file = out_prefix + '.demultiplexing.statistics.txt'
    final_all_read_barcode_file = out_prefix + '.all_reads.txt'

    header = '#readname\tbest_matched_barcode\tnum_edit_bases\tmismatch|insertion|deletion\tstrand\tsecond_best_matched_barcode\tnum_edit_bases\tmismatch|insertion|deletion\tstrand\n'

    merge_job_list = (
        (out_file_list, final_out_file),
        (out_all_read_barcode_file_list, final_all_read_barcode_file),
    )
    for in_file_list, merged_file in merge_job_list:
        try:
            _concatenate_files_with_header(header, in_file_list, merged_file)
        except (IOError, OSError) as err:
            tk.eprint('ERROR: Failed to merge files into %s: %s' %
                      (merged_file, err))
            sys.exit(1)

    barcode_count_dict = dict()
    for stat_file in out_stat_file_list:
        try:
            with open(stat_file, 'r') as stat_fp:
                lines = list(stat_fp)
        except (IOError, OSError) as err:
            tk.eprint('ERROR: Failed to read file %s: %s' % (stat_file, err))
            sys.exit(1)
        for line in lines:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            col_list = line.split('\t')
            barcode = col_list[0]
            barcode_count_dict[barcode] = barcode_count_dict.get(
                barcode, 0) + int(col_list[1])

    barcode_count_sorted_list = sorted(barcode_count_dict.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
    with open(final_out_stat_file, 'w') as final_out_stat_fp:
        final_out_stat_fp.write('#cellular_barcode_seq\tnum_reads\n')
        for barcode, count in barcode_count_sorted_list:
            final_out_stat_fp.write('%s\t%d\n' % (barcode, count))
    return
def _collect_tail_alignments(paf_file, strand, barcode_start_pos,
                             barcode_end_pos, max_num_align_retain,
                             read_barcode_info_dict):
    """Add the best alignments of one PAF file to ``read_barcode_info_dict``."""
    with open(paf_file, 'r') as paf_fp:
        for line in paf_fp:
            col_list = line.strip().split('\t')
            align_info = AlignmentInfo()
            try:
                readname = col_list[0]
                align_info.target_len = int(col_list[6])
                align_info.target_start = int(col_list[7])
                align_info.target_end = int(col_list[8])
                align_info.barcode = col_list[5]
                align_info.strand = strand
                align_info.mapq = int(col_list[11])
            except (IndexError, ValueError):
                # Truncated or non-numeric mandatory PAF column.
                tk.eprint('ERROR! file is: %s' % paf_file)
                sys.exit(1)

            # Optional tags: alignment score and CIGAR; stop once both are set.
            for col in col_list[12:]:
                if col[0:5] == 'AS:i:':
                    align_info.score = int(col[5:])
                if col[0:5] == 'cg:Z:':
                    align_info.cigar = col[5:]
                if align_info.score > 0 and len(align_info.cigar) > 0:
                    break

            align_info.calculate_barcode_mismatch(barcode_start_pos,
                                                  barcode_end_pos)

            align_info_list = read_barcode_info_dict.setdefault(
                readname, list())
            align_info_list.append(align_info)
            # Keep only the few best alignments per read to bound memory.
            align_info_list.sort(key=lambda info: info.total_num_edit_bases)
            read_barcode_info_dict[readname] = align_info_list[
                0:max_num_align_retain]


def assign_reads_to_barcodes(thread_id, barcode_start_pos, barcode_end_pos,
                             left_tail_paf_file, right_tail_paf_file):
    """Collect the best barcode alignments of every read from two PAF files.

    Args:
        thread_id: unused; kept for interface compatibility.
        barcode_start_pos, barcode_end_pos: barcode interval on the template,
            forwarded to ``AlignmentInfo.calculate_barcode_mismatch``.
        left_tail_paf_file: alignments of left tails (stored with strand 1).
        right_tail_paf_file: alignments of right tails (stored with
            strand -1).

    Returns:
        dict mapping read name to a list of at most 3 ``AlignmentInfo``
        objects sorted by ascending ``total_num_edit_bases``.

    A line whose mandatory columns are missing or non-numeric prints the file
    name and exits with status 1.  Only those parse errors are caught; the
    former bare ``except`` also swallowed unrelated exceptions.
    """
    read_barcode_info_dict = dict()
    max_num_align_retain = 3

    _collect_tail_alignments(left_tail_paf_file, 1, barcode_start_pos,
                             barcode_end_pos, max_num_align_retain,
                             read_barcode_info_dict)
    _collect_tail_alignments(right_tail_paf_file, -1, barcode_start_pos,
                             barcode_end_pos, max_num_align_retain,
                             read_barcode_info_dict)

    # Every list is re-sorted after each insertion above, so no final sort
    # pass is needed.
    return read_barcode_info_dict
def demultiplex1side(barcode_info, in_fastq_file, minimap2, n_threads,
                     out_prefix):
    """Assign reads to the barcodes of one primer side.

    A "barcode plus anchor" FASTA is generated, both read tails are extracted
    and aligned to it with minimap2 (SAM output), and confidently aligned
    reads are stored in ``barcode_info.read_barcode_idx_dict``
    (``readname -> barcode index``).  Reads much shorter or longer than the
    amplicon are skipped during tail extraction.  All temporary files of this
    side are removed at the end.
    """
    side_tmp_prefix = out_prefix + '.tmp.%s' % barcode_info.side
    read_tail_add_length = 256
    barcode_length = len(barcode_info.barcode_seq_list[0])

    barcode_plus_seq_file = side_tmp_prefix + (
        '.barcode_plus%dbp.fasta' % barcode_info.anchor_seq_len)
    generate_barcode_plus_tail_file(barcode_info, barcode_plus_seq_file)

    # Accepted read lengths: from 0.667x the amplicon up to
    # 1.5x (amplicon + two barcodes).
    amplicon_length = len(barcode_info.amplicon_seq)
    min_read_length = int(amplicon_length * 0.667)
    max_read_length = int((amplicon_length + barcode_length * 2) * 1.5)
    tk.eprint('NOTICE: length of amplicon is %d' % amplicon_length)
    tk.eprint('NOTICE: reads shorter than %d bp would be skipped' %
              min_read_length)
    tk.eprint('NOTICE: reads longer than %d bp would be skipped' %
              max_read_length)

    read_tail_length = (barcode_length + barcode_info.anchor_seq_len +
                        read_tail_add_length)
    left_tail_fastq_file = '%s.left%dbp_tail.fastq' % (side_tmp_prefix,
                                                       read_tail_length)
    right_tail_fastq_file = '%s.right%dbp_tail.fastq' % (side_tmp_prefix,
                                                         read_tail_length)
    extract_fastq_tail_seq(in_fastq_file, read_tail_length, min_read_length,
                           max_read_length, left_tail_fastq_file,
                           right_tail_fastq_file)

    left_tail_sam_file = '%s.left%dbp_tail.sam' % (side_tmp_prefix,
                                                   read_tail_length)
    right_tail_sam_file = '%s.right%dbp_tail.sam' % (side_tmp_prefix,
                                                     read_tail_length)
    alignment_job_list = (
        (left_tail_fastq_file, left_tail_sam_file),
        (right_tail_fastq_file, right_tail_sam_file),
    )
    for tail_fastq_file, tail_sam_file in alignment_job_list:
        cmd = '%s -N 400 --cs -t %d -a -x map-ont %s %s > %s 2> /dev/null' % (
            minimap2, n_threads, barcode_plus_seq_file, tail_fastq_file,
            tail_sam_file)
        tk.run_system_cmd(cmd)

    # read_barcode_idx_dict[readname] = barcode_idx
    barcode_info.read_barcode_idx_dict = dict()
    for tail_sam_file in (left_tail_sam_file, right_tail_sam_file):
        extract_confident_reads_from_sam(
            tail_sam_file, barcode_length,
            barcode_info.barcode_plus_seq_to_barcode_idx_dict,
            barcode_info.read_barcode_idx_dict)

    cmd = 'rm %s*' % side_tmp_prefix
    tk.run_system_cmd(cmd)
    return
def output_binned_reads_both1(in_fastq_file, fwd_barcode_info,
                              rev_barcode_info, out_prefix):
    """Bin reads that carry a barcode on at least one end.

    A read is assigned to the barcode index found on its fwd end, or on its
    rev end when no fwd barcode was found.  Reads whose two ends disagree are
    counted as discordant and left unassigned.  Barcode names are taken from
    ``fwd_barcode_info.barcode_name_list``.

    Writes ``<out_prefix>.readname_to_barcode.txt`` (ordered by barcode
    index) and ``<out_prefix>.summary.txt`` (read count per barcode,
    descending), then calls ``output_binned_fastq`` with one output FASTQ
    name per barcode.

    Returns:
        list of the per-barcode FASTQ file names.
    """
    readname_to_barcode_file = out_prefix + '.readname_to_barcode.txt'
    summary_file = out_prefix + '.summary.txt'

    fwd_read_barcode_idx_dict = fwd_barcode_info.read_barcode_idx_dict
    rev_read_barcode_idx_dict = rev_barcode_info.read_barcode_idx_dict

    readname_to_sample_idx_dict = dict()
    discordant_readname_set = set()

    ## fwd_barcode ##
    # Dict keys are unique, so a read can never already be present in
    # readname_to_sample_idx_dict here; the former "already assigned to a
    # different barcode" checks were unreachable and have been dropped.
    for readname, fwd_barcode_idx in fwd_read_barcode_idx_dict.items():
        if (readname in rev_read_barcode_idx_dict
                and rev_read_barcode_idx_dict[readname] != fwd_barcode_idx):
            discordant_readname_set.add(readname)
            continue
        readname_to_sample_idx_dict[readname] = fwd_barcode_idx

    ## rev_barcode ## (only reads without a fwd barcode)
    for readname, rev_barcode_idx in rev_read_barcode_idx_dict.items():
        if readname in fwd_read_barcode_idx_dict:
            continue
        readname_to_sample_idx_dict[readname] = rev_barcode_idx

    tk.eprint('WARNING: %d reads have discordant barcodes on the two ends' %
              len(discordant_readname_set))

    barcode_read_count_dict = dict()
    readname_to_barcode_name_list = list()
    for readname, barcode_idx in readname_to_sample_idx_dict.items():
        barcode_name = fwd_barcode_info.barcode_name_list[barcode_idx]
        barcode_read_count_dict[barcode_name] = barcode_read_count_dict.get(
            barcode_name, 0) + 1
        readname_to_barcode_name_list.append(
            (barcode_idx, readname, barcode_name))

    readname_to_barcode_name_list.sort(key=lambda x: x[0])
    sorted_barcode_read_count_list = sorted(barcode_read_count_dict.items(),
                                            key=lambda x: x[1],
                                            reverse=True)

    # 'with' closes the outputs even if a write fails.
    with open(readname_to_barcode_file, 'w') as readname_to_barcode_fp:
        readname_to_barcode_fp.write('#readname\tbarcode_name\n')
        for _, readname, barcode_name in readname_to_barcode_name_list:
            readname_to_barcode_fp.write('%s\t%s\n' %
                                         (readname, barcode_name))

    with open(summary_file, 'w') as summary_fp:
        summary_fp.write('#barcode_name\tnum_reads\n')
        for barcode_name, num_reads in sorted_barcode_read_count_list:
            summary_fp.write('%s\t%d\n' % (barcode_name, num_reads))

    out_fastq_file_list = [
        out_prefix + '.%s.fastq' % barcode_name
        for barcode_name in fwd_barcode_info.barcode_name_list
    ]
    output_binned_fastq(in_fastq_file, readname_to_sample_idx_dict,
                        out_fastq_file_list)
    return out_fastq_file_list
def AmpliconBinner(input_args):
    """Bin the reads of an amplicon sequencing run by barcode.

    The mode depends on which barcode FASTA files were supplied:

    * ``fwd_only`` / ``rev_only``: only that side is demultiplexed and binned;
    * ``both1``: both sides are demultiplexed and a barcode on one end is
      enough (the fwd and rev barcode sets must match, as checked by
      ``check_fwd_rev_barcodes``);
    * ``both2``: both files given together with ``require_two_barcodes``.

    Output files are written under ``out_dir`` with the ``exp_name`` prefix;
    empty per-barcode FASTQ files are removed and temporary files starting
    with ``<out_dir>/<exp_name>.tmp`` are deleted at the end.

    Exits with status 1 when no barcode file was supplied or when the two
    barcode sets differ in ``both1`` mode.
    """
    minimap2 = input_args.minimap2
    amplicon_seq_fasta_file = input_args.amp_seq_fasta
    n_threads = input_args.num_threads
    out_prefix = os.path.join(input_args.out_dir, input_args.exp_name)
    tmp_out_prefix = out_prefix + '.tmp'

    # Decide the mode first, so that missing barcode files fail before any
    # work is done (`mode` used to stay unbound in that case).
    has_fwd_barcode = input_args.fwd_barcode_fasta != ''
    has_rev_barcode = input_args.rev_barcode_fasta != ''
    if has_fwd_barcode and has_rev_barcode:
        if input_args.require_two_barcodes:
            mode = 'both2'
        else:
            mode = 'both1'
    elif has_fwd_barcode:
        mode = 'fwd_only'
    elif has_rev_barcode:
        mode = 'rev_only'
    else:
        tk.eprint(
            'ERROR! Both `--fwd_barcode_fasta` and `--rev_barcode_fasta` are not supplied.'
        )
        sys.exit(1)

    tk.create_dir(input_args.out_dir)
    in_fastq_file = preprocessing_input_files(input_args.in_fq,
                                              input_args.in_fq_list,
                                              tmp_out_prefix)

    fwd_barcode_info = BarcodeInfo()
    rev_barcode_info = BarcodeInfo()
    fwd_barcode_out_prefix = out_prefix + '.fwd'
    rev_barcode_out_prefix = out_prefix + '.rev'

    if has_fwd_barcode:
        fwd_barcode_info.init_from_file(input_args.fwd_barcode_fasta,
                                        amplicon_seq_fasta_file, 'fwd')
        demultiplex1side(fwd_barcode_info, in_fastq_file, minimap2, n_threads,
                         out_prefix)
        if mode == 'fwd_only':
            out_fastq_file_list = output_binned_reads_for1side(
                in_fastq_file, fwd_barcode_info, fwd_barcode_out_prefix)
            remove_empty_out_fastq_file(out_fastq_file_list)

    if has_rev_barcode:
        rev_barcode_info.init_from_file(input_args.rev_barcode_fasta,
                                        amplicon_seq_fasta_file, 'rev')
        demultiplex1side(rev_barcode_info, in_fastq_file, minimap2, n_threads,
                         out_prefix)
        if mode == 'rev_only':
            out_fastq_file_list = output_binned_reads_for1side(
                in_fastq_file, rev_barcode_info, rev_barcode_out_prefix)
            remove_empty_out_fastq_file(out_fastq_file_list)

    if mode == 'both1':
        fwd_rev_unmatch = check_fwd_rev_barcodes(fwd_barcode_info,
                                                 rev_barcode_info)
        if fwd_rev_unmatch:
            tk.eprint(
                '''ERROR! fwd barcodes and rev barcodes are different. Please supply the SAME barcode.fasta file if you have barcodes on both fwd and rev primers but only require barcode matching on one end.'''
            )
            # Fixed: the message named '--fwd_barcode_fasta' twice.
            tk.eprint(
                '''NOTICE: If you do have DIFFERENT barcodes on both ends and want to bin the reads for each side separately, you can run ampliconBinner twice and supply either '--fwd_barcode_fasta' or '--rev_barcode_fasta' once a time.'''
            )
            tk.eprint(
                '''NOTICE: If you have barcodes on both ends and want to require barcode matching on both ends, please supply '--require_two_barcodes'. '''
            )
            sys.exit(1)
        both1_out_prefix = out_prefix + '.require1barcode'
        out_fastq_file_list = output_binned_reads_both1(
            in_fastq_file, fwd_barcode_info, rev_barcode_info,
            both1_out_prefix)
        remove_empty_out_fastq_file(out_fastq_file_list)
    elif mode == 'both2':
        both2_out_prefix = out_prefix + '.require2barcodes'
        out_fastq_file_list = output_binned_reads_both2(
            in_fastq_file, fwd_barcode_info, rev_barcode_info,
            both2_out_prefix)
        remove_empty_out_fastq_file(out_fastq_file_list)

    cmd = 'rm %s*' % tmp_out_prefix
    tk.run_system_cmd(cmd)
    return