コード例 #1
0
    def calculate_total_num_mismatches(self):
        """Compute `self.total_num_edit_bases` for this alignment.

        The total is the summed length of all mismatch ('X'), insertion
        ('I') and deletion ('D') CIGAR operations, plus the target bases that
        lie outside the aligned interval on either side
        (`target_start` and `target_len - target_end`).  Match ('=') and
        'S' operations contribute nothing.  Any other operation aborts the
        program.
        """
        operations, lengths = tk.analysis_cigar_string(self.cigar)
        if len(operations) != len(lengths):
            tk.eprint('ERROR: len(cigar_opr_list) != len(cigar_opr_len_list)')
            sys.exit()

        edit_operations = ('X', 'I', 'D')
        neutral_operations = ('=', 'S')

        self.total_num_edit_bases = 0
        for operation, length in zip(operations, lengths):
            if operation in edit_operations:
                self.total_num_edit_bases += length
            elif operation not in neutral_operations:
                tk.eprint('ERROR: unsupported cigar operation: %s' % operation)
                sys.exit()

        # Target bases not covered by the alignment count as edits too.
        self.total_num_edit_bases += self.target_start
        self.total_num_edit_bases += self.target_len - self.target_end
コード例 #2
0
ファイル: ampBinner.py プロジェクト: WGLab/AmpBinner
    def init_from_file(self, barcode_fasta_file, amplicon_seq_fasta_file,
                       barcode_side):
        """Populate this object from a barcode FASTA and an amplicon FASTA.

        Stores the input paths and the barcode side, loads barcode names and
        sequences, and loads the single amplicon sequence (the program exits
        unless the amplicon FASTA yields exactly one record).  For side 'fwd'
        `downstream_seq` is the first `anchor_seq_len` bases of the amplicon;
        for side 'rev' it is the reverse complement of the last
        `anchor_seq_len` bases.  Other values of `barcode_side` leave
        `downstream_seq` untouched.
        """
        self.barcode_fasta_file = barcode_fasta_file
        self.amplicon_seq_fasta_file = amplicon_seq_fasta_file
        self.side = barcode_side

        barcode_records = tk.read_fasta_file(barcode_fasta_file)
        self.barcode_name_list, self.barcode_seq_list = barcode_records

        amplicon_names, amplicon_seqs = tk.read_fasta_file(
            amplicon_seq_fasta_file)

        if len(amplicon_names) > 1:
            tk.eprint(
                'ERROR: There are more than 1 sequence in the amp_seq_fasta file: %s'
                % amplicon_seq_fasta_file)
            sys.exit()

        if len(amplicon_names) != 1 or len(amplicon_seqs) != 1:
            tk.eprint(
                'ERROR: No sequence was found in the amp_seq_fasta file: %s' %
                amplicon_seq_fasta_file)
            sys.exit()

        self.amplicon_name = amplicon_names[0]
        self.amplicon_seq = amplicon_seqs[0]

        if barcode_side == 'fwd':
            self.downstream_seq = self.amplicon_seq[:self.anchor_seq_len]
        elif barcode_side == 'rev':
            amplicon_tail = self.amplicon_seq[-self.anchor_seq_len:]
            self.downstream_seq = tk.rev_comp(amplicon_tail)
コード例 #3
0
ファイル: ampBinner.py プロジェクト: WGLab/AmpBinner
def extract_fastq_tail_seq(in_fastq_file, read_tail_length, min_read_length,
                           max_read_length, left_tail_fastq_file,
                           right_tail_fastq_file):
    """Write the two ends of every read in a FASTQ file to separate files.

    Reads whose sequence length is below `min_read_length` or above
    `max_read_length` are skipped.  For every remaining read, the first
    `read_tail_length` bases (and qualities) go to `left_tail_fastq_file`;
    the reverse complement of the last `read_tail_length` bases (with the
    quality string reversed) goes to `right_tail_fastq_file`.  The header and
    separator lines of each record are copied unchanged.

    Reading stops at the first incomplete 4-line record.  All three file
    handles are closed even if an error occurs while processing.
    """
    num_skipped_reads = 0
    num_processed_reads = 0

    in_fastq_fp = tk.gzopen(in_fastq_file)
    try:
        with open(left_tail_fastq_file, 'w') as fq_left_tail_fp, \
                open(right_tail_fastq_file, 'w') as fq_right_tail_fp:
            while True:
                line1 = in_fastq_fp.readline()
                line2 = in_fastq_fp.readline()
                line3 = in_fastq_fp.readline()
                line4 = in_fastq_fp.readline()

                # An empty string from readline() means end of file.
                if not (line1 and line2 and line3 and line4):
                    break

                read_seq = line2.strip()
                if len(read_seq) < min_read_length or len(
                        read_seq) > max_read_length:
                    num_skipped_reads += 1
                    continue

                read_qual = line4.strip()

                left_tail_seq = read_seq[0:read_tail_length]
                left_tail_qual = read_qual[0:read_tail_length]

                # The right tail is reverse-complemented, so its qualities
                # are reversed to stay aligned with the bases.
                right_tail_seq = tk.rev_comp(read_seq[-read_tail_length:])
                right_tail_qual = ''.join(
                    reversed(read_qual[-read_tail_length:]))

                fq_left_tail_fp.write(line1)
                fq_left_tail_fp.write(left_tail_seq + '\n')
                fq_left_tail_fp.write(line3)
                fq_left_tail_fp.write(left_tail_qual + '\n')

                fq_right_tail_fp.write(line1)
                fq_right_tail_fp.write(right_tail_seq + '\n')
                fq_right_tail_fp.write(line3)
                fq_right_tail_fp.write(right_tail_qual + '\n')

                num_processed_reads += 1

                if num_processed_reads % 100000 == 0:
                    tk.eprint('processed %d reads' % num_processed_reads)
    finally:
        in_fastq_fp.close()

    tk.eprint(
        'NOTICE: finished extracting tail sequences from fastq. number of skipped reads = %d'
        % num_skipped_reads)
    return
コード例 #4
0
ファイル: ampBinner.py プロジェクト: WGLab/AmpBinner
def extract_confident_reads_from_sam(in_sam_file, barcode_length,
                                     barcode_plus_seq_to_barcode_idx_dict,
                                     read_barcode_idx_dict):
    """Record the barcode index of every confidently aligned read in a SAM file.

    A record is accepted when it is mapped, is not flagged 256, 1024 or 2048
    (secondary / duplicate / supplementary in the SAM specification), has
    MAPQ >= 20, starts at a position below `barcode_length`, and its
    reference name is a key of `barcode_plus_seq_to_barcode_idx_dict`.
    Accepted reads are stored in `read_barcode_idx_dict` (modified in place)
    as `readname -> barcode_idx`.

    Records with fewer than 6 columns or with an unknown reference name are
    counted as errors and skipped.  The SAM file is closed even if a record
    cannot be parsed.
    """
    min_mapq = 20
    unmapped_flag = 4
    excluded_flags = 256 | 1024 | 2048

    num_error_alignments = 0
    num_aligned_reads = 0
    num_unmapped_reads = 0

    with open(in_sam_file, 'r') as in_sam_fp:
        for line in in_sam_fp:
            if line[0] == '@':  # header line
                continue

            col_list = line.strip().split('\t')
            if len(col_list) < 6:
                num_error_alignments += 1
                continue
            readname, flag, contig, left_pos, mapq = col_list[0:5]

            flag = int(flag)
            if flag & unmapped_flag:
                num_unmapped_reads += 1
                continue

            if flag & excluded_flags:
                continue

            num_aligned_reads += 1

            if int(mapq) < min_mapq:
                continue

            # Only alignments starting inside the barcode are trusted.
            if int(left_pos) >= barcode_length:
                continue

            if contig not in barcode_plus_seq_to_barcode_idx_dict:
                tk.eprint('ERROR!! unknown template name in sam: %s' % contig)
                num_error_alignments += 1
                continue

            read_barcode_idx_dict[
                readname] = barcode_plus_seq_to_barcode_idx_dict[contig]

    tk.eprint(
        'STATISTICS: sam_file = %s, num_aligned_reads = %d, num_unmapped_reads = %d, num_of_confident_reads = %d'
        % (in_sam_file, num_aligned_reads, num_unmapped_reads,
           len(read_barcode_idx_dict)))

    # Previously counted but never surfaced to the user.
    if num_error_alignments > 0:
        tk.eprint('WARNING: sam_file = %s, num_error_alignments = %d' %
                  (in_sam_file, num_error_alignments))

    return
コード例 #5
0
ファイル: ampBinner.py プロジェクト: WGLab/AmpBinner
def preprocessing_input_files(in_fq, in_fq_list, tmp_out_prefix):
    """Preprocess the input FASTQ file(s) and return the resulting file.

    The input is either a single FASTQ path (`in_fq`) or a list file
    (`in_fq_list`); when both are non-empty the list file wins.  The inputs
    are passed to `tk.split_fastq` with a split count of 1 and the first
    (only) output file is returned.
    """
    tk.eprint('NOTICE: preprocessing the input fastq file')

    raw_input_fq_list = [os.path.abspath(in_fq)] if in_fq != '' else []
    if in_fq_list != '':
        raw_input_fq_list = tk.read_list_file(in_fq_list, abspath=True)

    # 1. split 2. remove duplicates
    split_files = tk.split_fastq(raw_input_fq_list, 1, tmp_out_prefix)
    return split_files[0]
コード例 #6
0
def demultiplex1barcode(thread_id, in_fastq_file_list, barcode_info, minimap2,
                        tmp_out_prefix):
    """Demultiplex the FASTQ chunk assigned to one worker process.

    Extracts both read tails, aligns them to the anchor sequence, and then
    chooses a strategy: when the upstream anchor is longer than 4 bp and
    averages fewer than 1.5 alignments in the left tails, the anchor-based
    method (`demultiplex1barcode_method2`) is used; otherwise the reads are
    demultiplexed without a unique anchor (`demultiplex1barcode_method1`).
    All temporary files are prefixed with `tmp_out_prefix` plus the thread id.
    """
    in_fastq_file = in_fastq_file_list[thread_id]

    # Tail length: 1.5x (upstream + barcode + usable anchor length).
    barcode_region_len = len(barcode_info.upstream_seq +
                             barcode_info.barcode_list[0])
    usable_anchor_len = min(barcode_info.anchor_seq_len,
                            len(barcode_info.downstream_seq))
    read_tail_length = int((barcode_region_len + usable_anchor_len) * 1.5)

    thread_prefix = '%s.thread%d' % (tmp_out_prefix, thread_id)
    left_tail_fastq_file = '%s.left%dbp_tail.fastq' % (thread_prefix,
                                                       read_tail_length)
    right_tail_fastq_file = '%s.right%dbp_tail.fastq' % (thread_prefix,
                                                         read_tail_length)

    tk.eprint('NOTICE: (process %d) extracting tails from fastq reads' %
              thread_id)
    tk.extract_fastq_tail_seq(in_fastq_file, read_tail_length,
                              left_tail_fastq_file, right_tail_fastq_file)

    tk.eprint('NOTICE: (process %d) locating anchors' % thread_id)
    left_anchor_paf, right_anchor_paf = [
        align_reads_to_anchors(thread_id, minimap2, 1, barcode_info,
                               tail_fastq, thread_prefix)
        for tail_fastq in (left_tail_fastq_file, right_tail_fastq_file)
    ]

    avg_alignments = count_average_num_alignments(left_anchor_paf)
    upstream_len = len(barcode_info.upstream_seq)

    if avg_alignments < 1.5 and upstream_len > 4:
        demultiplex1barcode_method2(thread_id, left_tail_fastq_file,
                                    right_tail_fastq_file, minimap2,
                                    barcode_info, left_anchor_paf,
                                    right_anchor_paf, 'upstream',
                                    thread_prefix)
        return

    if avg_alignments > 1.5 and upstream_len > 0:
        tk.eprint(
            'WARNING: The UPSTREAM_SEQ (%s) have multiple alignments in reads! Try to supply a longer sequence!'
            % barcode_info.upstream_seq)
        tk.eprint(
            'WARNING: AmpRepeat will try to demultiplex the reads without unique anchor sequence'
        )

    demultiplex1barcode_method1(thread_id, left_tail_fastq_file,
                                right_tail_fastq_file, minimap2,
                                barcode_info, thread_prefix)
    return
コード例 #7
0
    def calculate_barcode_mismatch(self, barcode_start_pos, barcode_end_pos):
        """Count the edits of this alignment that fall inside the barcode.

        Walks the CIGAR string along the target, starting at `target_start`,
        and adds to `self.num_mismatch`, `self.num_ins` and `self.num_del`
        the portions of 'X', 'I' and 'D' operations that touch the barcode
        interval given by `barcode_start_pos` and `barcode_end_pos`.  Barcode
        positions not covered by the alignment are added to
        `self.num_mismatch`.  Finally sets `self.num_edit_bases` and calls
        `calculate_total_num_mismatches()`.

        The counters are incremented, not reset, by this method.
        """
        operations, lengths = tk.analysis_cigar_string(self.cigar)
        if len(operations) != len(lengths):
            tk.eprint('ERROR: len(cigar_opr_list) != len(cigar_opr_len_list)')
            sys.exit()

        ref_pos = self.target_start

        for operation, length in zip(operations, lengths):
            if operation == 'S':
                continue

            if operation == '=':  # match: only advances along the target
                ref_pos += length
                continue

            if operation == 'I':  # insertion: does not consume target bases
                if barcode_start_pos < ref_pos < barcode_end_pos - 1:
                    self.num_ins += length
                continue

            if operation not in ('X', 'D'):
                tk.eprint('ERROR: unsupported cigar operation: %s' % operation)
                sys.exit()

            # Mismatch or deletion: count the part overlapping the barcode.
            overlap = tk.compute_overlap_len(ref_pos, ref_pos + length,
                                             barcode_start_pos,
                                             barcode_end_pos)
            if overlap > 0:
                if operation == 'X':
                    self.num_mismatch += overlap
                else:
                    self.num_del += overlap
            ref_pos += length

        # Barcode bases beyond the aligned interval count as mismatches.
        if self.target_end < barcode_end_pos:
            self.num_mismatch += barcode_end_pos - self.target_end
        if self.target_start > barcode_start_pos:
            self.num_mismatch += self.target_start - barcode_start_pos

        self.num_edit_bases = sum(
            (self.num_ins, self.num_del, self.num_mismatch))

        self.calculate_total_num_mismatches()
コード例 #8
0
def analysis_of_anchor_paf(anchor_paf_file, barcode_len, anchor_loc,
                           flank_len):
    """Locate the candidate barcode region of each read from anchor alignments.

    Parses a PAF file of anchor-to-read alignments and, for the first usable
    alignment of each read, derives the read interval expected to hold the
    barcode plus `flank_len` bases on both sides.

    Args:
        anchor_paf_file: path of the PAF file (at least 12 columns per line).
        barcode_len: length of the barcode.
        anchor_loc: 'upstream' (barcode follows the anchor) or 'downstream'
            (barcode precedes the anchor).
        flank_len: number of extra bases kept on each side of the barcode.

    Returns:
        dict mapping readname -> (barcode_start, barcode_end), clamped to
        [0, read_len].

    Raises:
        ValueError: if `anchor_loc` is neither 'upstream' nor 'downstream'.
        SystemExit: if a line has fewer than 12 columns.
    """
    if anchor_loc not in ('upstream', 'downstream'):
        # Previously this surfaced as an UnboundLocalError on the first record.
        raise ValueError(
            "anchor_loc must be 'upstream' or 'downstream', got: %r" %
            (anchor_loc, ))

    barcode_position_dict = dict()

    with open(anchor_paf_file, 'r') as anchor_paf_fp:
        for line in anchor_paf_fp:
            col_list = line.strip().split('\t')
            if len(col_list) < 12:
                tk.eprint(
                    'ERROR: There should be at least 12 columns in the PAF file: %s'
                    % anchor_paf_file)
                sys.exit()

            readname = col_list[0]
            read_len = int(col_list[1])
            read_start = int(col_list[2])
            read_end = int(col_list[3])

            target_len = int(col_list[6])
            target_start = int(col_list[7])
            target_end = int(col_list[8])

            if anchor_loc == 'upstream':
                # Skip alignments missing more than 2 bases at the anchor end
                # adjacent to the barcode.
                if target_end < target_len - 2:
                    continue
                barcode_start = read_end - flank_len
                barcode_end = (barcode_start + barcode_len + flank_len * 2 +
                               target_len - target_end)
            else:  # 'downstream'
                if target_start > 2:
                    continue
                barcode_end = read_start + flank_len
                barcode_start = (barcode_end - barcode_len - flank_len * 2 -
                                 target_start)

            barcode_start = max(barcode_start, 0)
            barcode_end = min(barcode_end, read_len)

            # Only the first usable alignment of each read is kept.
            if readname not in barcode_position_dict:
                barcode_position_dict[readname] = (barcode_start, barcode_end)

    return barcode_position_dict
コード例 #9
0
    def read_barcode_list_file(self):
        """Load the unique barcodes from `self.barcode_list_file`.

        Lines starting with '>' and blank lines are ignored; on every other
        line the first whitespace-separated token is taken as a barcode.
        Duplicates are dropped while keeping the order of first appearance,
        so `self.barcode_list` is identical from run to run.  The program
        exits if no barcode is found.
        """
        tk.eprint('NOTICE: reading barcodes from BARCODE_LIST file: %s' %
                  (self.barcode_list_file))
        self.barcode_list = list()
        seen_barcodes = set()

        with open(self.barcode_list_file, 'r') as barcode_fp:
            for line in barcode_fp:
                if line.startswith('>'):
                    continue
                fields = line.split()
                if not fields:
                    # Blank line: previously crashed with IndexError.
                    continue
                barcode = fields[0]
                if barcode in seen_barcodes:
                    continue
                seen_barcodes.add(barcode)
                self.barcode_list.append(barcode)

        if len(self.barcode_list) == 0:
            tk.eprint(
                'ERROR: No barcodes were found in the BARCODE_LIST file: %s' %
                self.barcode_list_file)
            sys.exit()

        tk.eprint(
            'NOTICE: %d barcodes were found in the BARCODE_LIST file: %s' %
            (len(self.barcode_list), self.barcode_list_file))
        return
コード例 #10
0
def main():
    """Validate the command-line arguments and run `AmpliconBinner_10X`.

    Exits when `--num_threads` is below 1, or when not exactly one of
    `--in_fq` / `--in_fq_list` is supplied.  The minimap2 path (unless it is
    the bare command name), the barcode list and the output prefix are
    converted to absolute paths before the pipeline starts.
    """
    input_args = parse_user_arguments()

    if input_args.num_threads < 1:
        tk.eprint('ERROR: --num_threads should be a positive number.')
        sys.exit()
    if input_args.in_fq == '' and input_args.in_fq_list == '':
        tk.eprint(
            'ERROR! No input file! Both --in_fq and in_fq_list were not supplied. '
        )
        sys.exit()
    if input_args.in_fq != '' and input_args.in_fq_list != '':
        tk.eprint(
            'ERROR! --in_fq and --in_fq_list should not be supplied at the same time.'
        )
        sys.exit()

    # A bare 'minimap2' is presumably resolved through PATH, so only an
    # explicit path is checked and made absolute.
    if input_args.minimap2 != 'minimap2':
        tk.check_input_file_exists(input_args.minimap2)
        input_args.minimap2 = os.path.abspath(input_args.minimap2)

    tk.check_input_file_exists(input_args.barcode_list)
    # Was `==` (a discarded comparison), so the path was never made absolute.
    input_args.barcode_list = os.path.abspath(input_args.barcode_list)
    input_args.out_prefix = os.path.abspath(input_args.out_prefix)

    AmpliconBinner_10X(input_args)
コード例 #11
0
def demultiplex1barcode_method2(thread_id, left_tail_fastq_file,
                                right_tail_fastq_file, minimap2, barcode_info,
                                left_tail_anchor_paf_file,
                                right_tail_anchor_paf_file, anchor_loc,
                                tmp_out_prefix):
    """Demultiplex reads using the anchor alignments to locate the barcode.

    For both read tails: derive the candidate barcode region from the anchor
    PAF file, extract that region into a new FASTQ file, align it to the
    barcode template sequences with minimap2, and finally assign reads to
    barcodes and write the summary.  Intermediate files are named from
    `tmp_out_prefix`.
    """
    tk.eprint(
        'NOTICE: (process %d) demultiplexing using with anchor sequences' %
        thread_id)
    barcode_len = len(barcode_info.barcode_list[0])
    flank_len = 4

    tail_inputs = (
        ('left', left_tail_fastq_file, left_tail_anchor_paf_file),
        ('right', right_tail_fastq_file, right_tail_anchor_paf_file),
    )

    # Step 1: barcode positions in each tail, from the anchor alignments.
    position_dicts = [
        analysis_of_anchor_paf(anchor_paf, barcode_len, anchor_loc, flank_len)
        for _, _, anchor_paf in tail_inputs
    ]

    # Step 2: cut the candidate barcode regions out of the tails.
    candidate_fastq_files = [
        '%s.%s_tail_barcode_candidate.fastq' % (tmp_out_prefix, side)
        for side, _, _ in tail_inputs
    ]
    for (_, tail_fastq, _), positions, candidate_fastq in zip(
            tail_inputs, position_dicts, candidate_fastq_files):
        extract_region_from_fastq(tail_fastq, positions, candidate_fastq)

    # Step 3: build the barcode templates and align the candidates to them.
    barcode_template_file = '%s.barcode_with_anchor.fasta' % tmp_out_prefix
    generate_barcode_template_file(barcode_info, barcode_template_file,
                                   anchor_loc, flank_len)

    compare_paf_files = [
        '%s.%s_tail_barcode_compare.paf' % (tmp_out_prefix, side)
        for side, _, _ in tail_inputs
    ]

    tk.eprint('NOTICE: (process %d) aligning barcodes' % thread_id)

    barcode_compare_para = ' -t 1 --for-only --eqx -c --cs -N 200 -k 5 -w 3 -n 1 -m 10 -s 40 -A 4 -x map-ont '

    for candidate_fastq, compare_paf in zip(candidate_fastq_files,
                                            compare_paf_files):
        tk.minimap2_align(candidate_fastq, barcode_template_file, minimap2,
                          barcode_compare_para, compare_paf)

    # Step 4: pick the best barcode for every read.
    tk.eprint('NOTICE: (process %d) assigning reads to barcodes' % thread_id)
    if anchor_loc == 'upstream':
        barcode_start_pos = flank_len
    elif anchor_loc == 'downstream':
        barcode_start_pos = 0

    barcode_end_pos = barcode_start_pos + barcode_len
    left_compare_paf, right_compare_paf = compare_paf_files
    read_barcode_info_dict = assign_reads_to_barcodes(
        thread_id, barcode_start_pos, barcode_end_pos, left_compare_paf,
        right_compare_paf)
    output_summary(barcode_info, read_barcode_info_dict, tmp_out_prefix)

    return
コード例 #12
0
ファイル: ampBinner.py プロジェクト: WGLab/AmpBinner
def main():
    """Validate the command-line arguments and run `AmpliconBinner`.

    Exits with an error message when the thread count is below 1, when not
    exactly one of `--in_fq` / `--in_fq_list` is given, when neither barcode
    FASTA is given, or when `--exp_name` contains a path separator.  The
    minimap2 path (unless it is the bare command name) and the output
    directory are converted to absolute paths.
    """
    args = parse_user_arguments()

    def abort(message):
        # Report the problem and terminate the program.
        tk.eprint(message)
        sys.exit()

    if args.num_threads < 1:
        abort('ERROR: `--num_threads` should be a positive number.')

    no_fastq = args.in_fq == ''
    no_fastq_list = args.in_fq_list == ''
    if no_fastq and no_fastq_list:
        abort(
            'ERROR! No input file! Both `--in_fq` and in_fq_list were not supplied. '
        )
    if not no_fastq and not no_fastq_list:
        abort(
            'ERROR! `--in_fq` and `--in_fq_list` should not be supplied at the same time.'
        )

    if args.fwd_barcode_fasta == '' and args.rev_barcode_fasta == '':
        abort(
            'ERROR! Both `--fwd_barcode_fasta` and `--rev_barcode_fasta` are not supplied.'
        )

    if args.minimap2 != 'minimap2':
        args.minimap2 = os.path.abspath(args.minimap2)

    args.out_dir = os.path.abspath(args.out_dir)

    if any(separator in args.exp_name for separator in ('/', '\\')):
        abort(
            '''ERROR! `--exp_name` should not have special characters such as '/' or '\\'.'''
        )

    AmpliconBinner(args)

    return
コード例 #13
0
def demultiplex1barcode_method1(thread_id, left_tail_fastq_file,
                                right_tail_fastq_file, minimap2, barcode_info,
                                tmp_out_prefix):
    """Demultiplex reads by aligning whole read tails to barcode templates.

    Used when no unique anchor is available.  Builds the template FASTA
    (upstream sequence + barcode + downstream sequence), aligns both read
    tails to it with minimap2 using seed parameters chosen from the template
    length, then assigns reads to barcodes and writes the summary.
    """
    barcode_template_file = '%s.barcode_with_anchor.fasta' % tmp_out_prefix
    left_compare_paf = '%s.left_tail_barcode_compare.paf' % tmp_out_prefix
    right_compare_paf = '%s.right_tail_barcode_compare.paf' % tmp_out_prefix

    barcode_len = len(barcode_info.barcode_list[0])
    generate_barcode_template_file(barcode_info, barcode_template_file, 'none',
                                   0)

    upstream_len = len(barcode_info.upstream_seq)
    target_seq_len = upstream_len + barcode_len + len(
        barcode_info.downstream_seq)

    # Shorter templates get smaller k-mer / window settings.
    if target_seq_len < 25:
        seed_para = ' -k 3 -w 2 -n 1 -m 10 -s 40 '
    elif target_seq_len < 50:
        seed_para = ' -k 5 -w 3 -n 1 -m 10 -s 40 '
    else:
        seed_para = ' '
    para = seed_para + ' -x map-ont -t 1 --for-only --eqx -c --cs -N 200 -K 1M '

    tk.eprint('NOTICE: aligning reads to barcodes with anchors')
    alignment_jobs = (
        (left_tail_fastq_file, left_compare_paf),
        (right_tail_fastq_file, right_compare_paf),
    )
    for tail_fastq, compare_paf in alignment_jobs:
        tk.minimap2_align(tail_fastq, barcode_template_file, minimap2, para,
                          compare_paf)

    tk.eprint('NOTICE: (process %d) assigning reads to barcodes' % thread_id)
    # The barcode starts right after the upstream sequence in the template.
    barcode_start_pos = upstream_len
    barcode_end_pos = barcode_start_pos + barcode_len

    tk.eprint('DEBUG: barcode_start_pos = %d' % barcode_start_pos)
    read_barcode_info_dict = assign_reads_to_barcodes(
        thread_id, barcode_start_pos, barcode_end_pos, left_compare_paf,
        right_compare_paf)
    output_summary(barcode_info, read_barcode_info_dict, tmp_out_prefix)

    return
コード例 #14
0
def AmpliconBinner_10X(input_args):
    """Run the whole demultiplexing pipeline for the parsed arguments.

    Splits the input FASTQ file(s) into one chunk per thread, demultiplexes
    every chunk in its own process, merges the per-thread results under
    `input_args.out_prefix`, and finally removes the temporary files
    (everything matching `<out_prefix>.tmp*`).
    """
    tmp_out_prefix = input_args.out_prefix + '.tmp'
    num_threads = input_args.num_threads
    minimap2 = input_args.minimap2

    barcode_info = BarcodeInfo()
    barcode_info.init(input_args.barcode_list, input_args.barcode_upstream_seq)

    raw_input_fq_list = []
    if input_args.in_fq != '':
        input_args.in_fq = os.path.abspath(input_args.in_fq)
        raw_input_fq_list.append(input_args.in_fq)
    if input_args.in_fq_list != '':
        # A list file replaces any single input file.
        raw_input_fq_list = tk.read_list_file(input_args.in_fq_list,
                                              abspath=True)

    tk.eprint('NOTICE: preprocessing the input fastq file')
    # 1. split 2. remove duplicates
    fastq_file_list = tk.split_fastq(raw_input_fq_list, num_threads,
                                     tmp_out_prefix)

    workers = [
        Process(target=demultiplex1barcode,
                args=(worker_id, fastq_file_list, barcode_info, minimap2,
                      tmp_out_prefix)) for worker_id in range(num_threads)
    ]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()

    merge_thread_summary_file(num_threads, input_args.out_prefix)

    cleanup_cmd = 'rm %s*' % (tmp_out_prefix)
    exit_status = os.system(cleanup_cmd)
    if exit_status != 0:
        tk.eprint('ERROR: Failed to run command: %s' % cleanup_cmd)
        tk.eprint('Return value is: %d' % exit_status)
        sys.exit()

    return
コード例 #15
0
def _write_merged_file(header, part_file_list, merged_file):
    """Write `header` followed by the contents of every part file, in order."""
    import shutil

    # Binary mode keeps the copied bytes exactly as they are on disk.
    with open(merged_file, 'wb') as merged_fp:
        merged_fp.write(header.encode('utf-8'))
        for part_file in part_file_list:
            with open(part_file, 'rb') as part_fp:
                shutil.copyfileobj(part_fp, merged_fp)


def merge_thread_summary_file(num_threads, out_prefix):
    """Merge the per-thread result files into the final output files.

    For thread ids 0 .. num_threads - 1 the files
    `<out_prefix>.tmp.thread<i>.demultiplexing.reads.txt`,
    `... .demultiplexing.statistics.txt` and `... .all_reads.txt` are read.
    Three files are written:

    * `<out_prefix>.demultiplexing.PASS.reads.txt`: header + all reads files.
    * `<out_prefix>.all_reads.txt`: header + all all_reads files.  (The
      header used to be overwritten by the shell redirection.)
    * `<out_prefix>.demultiplexing.statistics.txt`: per-barcode read counts
      summed over threads, sorted by decreasing count.

    The program exits if a per-thread reads / all_reads file cannot be merged.
    """
    tmp_out_prefix = out_prefix + '.tmp'
    thread_prefix_list = [
        tmp_out_prefix + '.thread%d' % i for i in range(0, num_threads)
    ]
    out_file_list = [
        prefix + '.demultiplexing.reads.txt' for prefix in thread_prefix_list
    ]
    out_stat_file_list = [
        prefix + '.demultiplexing.statistics.txt'
        for prefix in thread_prefix_list
    ]
    out_all_read_barcode_file_list = [
        prefix + '.all_reads.txt' for prefix in thread_prefix_list
    ]

    final_out_file = out_prefix + '.demultiplexing.PASS.reads.txt'
    final_out_stat_file = out_prefix + '.demultiplexing.statistics.txt'
    final_all_read_barcode_file = out_prefix + '.all_reads.txt'

    header = '#readname\tbest_matched_barcode\tnum_edit_bases\tmismatch|insertion|deletion\tstrand\tsecond_best_matched_barcode\tnum_edit_bases\tmismatch|insertion|deletion\tstrand\n'

    merge_jobs = (
        (out_file_list, final_out_file),
        (out_all_read_barcode_file_list, final_all_read_barcode_file),
    )
    for part_file_list, merged_file in merge_jobs:
        try:
            _write_merged_file(header, part_file_list, merged_file)
        except (IOError, OSError) as error:
            tk.eprint('ERROR: Failed to merge files into %s: %s' %
                      (merged_file, error))
            sys.exit()

    barcode_count_dict = dict()
    for stat_file in out_stat_file_list:
        with open(stat_file, 'r') as stat_fp:
            for line in stat_fp:
                if not line.strip():
                    continue  # tolerate blank lines
                col_list = line.strip().split('\t')
                barcode = col_list[0]
                count = int(col_list[1])
                barcode_count_dict[barcode] = barcode_count_dict.get(
                    barcode, 0) + count

    barcode_count_sorted_list = sorted(barcode_count_dict.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
    with open(final_out_stat_file, 'w') as final_out_stat_fp:
        final_out_stat_fp.write('#cellular_barcode_seq\tnum_reads\n')
        for barcode, count in barcode_count_sorted_list:
            final_out_stat_fp.write('%s\t%d\n' % (barcode, count))

    return
コード例 #16
0
def _collect_tail_alignments(paf_file, strand, barcode_start_pos,
                             barcode_end_pos, max_num_align_retain,
                             read_barcode_info_dict):
    """Add the alignments of one tail PAF file to `read_barcode_info_dict`."""
    with open(paf_file, 'r') as paf_fp:
        for line in paf_fp:
            col_list = line.strip().split('\t')
            readname = col_list[0]
            align_info = AlignmentInfo()
            try:
                align_info.target_len = int(col_list[6])
            except (IndexError, ValueError):
                # Malformed record: report which file is broken and stop.
                tk.eprint('ERROR! file is: %s' % paf_file)
                sys.exit()

            align_info.target_start = int(col_list[7])
            align_info.target_end = int(col_list[8])
            align_info.barcode = col_list[5]
            align_info.strand = strand
            align_info.mapq = int(col_list[11])

            # Optional tags: alignment score (AS) and CIGAR string (cg).
            for col in col_list[12:]:
                if col.startswith('AS:i:'):
                    align_info.score = int(col[5:])
                if col.startswith('cg:Z:'):
                    align_info.cigar = col[5:]
                    if align_info.score > 0 and len(align_info.cigar) > 0:
                        break

            align_info.calculate_barcode_mismatch(barcode_start_pos,
                                                  barcode_end_pos)

            # Keep only the best few alignments per read, sorted by the
            # total number of edited bases.
            align_info_list = read_barcode_info_dict.setdefault(readname, [])
            align_info_list.append(align_info)
            align_info_list.sort(key=lambda info: info.total_num_edit_bases)
            del align_info_list[max_num_align_retain:]


def assign_reads_to_barcodes(thread_id, barcode_start_pos, barcode_end_pos,
                             left_tail_paf_file, right_tail_paf_file):
    """Collect the best barcode alignments of every read from two PAF files.

    Alignments from `left_tail_paf_file` get strand 1 and those from
    `right_tail_paf_file` get strand -1.  For each alignment the edits inside
    the barcode interval (`barcode_start_pos`, `barcode_end_pos`) are counted.

    Returns:
        dict mapping readname -> list of at most 3 `AlignmentInfo` objects,
        sorted by increasing `total_num_edit_bases`.

    `thread_id` is accepted for interface compatibility and is not used.
    """
    read_barcode_info_dict = dict()
    max_num_align_retain = 3

    for paf_file, strand in ((left_tail_paf_file, 1),
                             (right_tail_paf_file, -1)):
        _collect_tail_alignments(paf_file, strand, barcode_start_pos,
                                 barcode_end_pos, max_num_align_retain,
                                 read_barcode_info_dict)

    # Every list is re-sorted after each insertion, so no final sort is
    # needed here.
    return read_barcode_info_dict
コード例 #17
0
ファイル: ampBinner.py プロジェクト: WGLab/AmpBinner
def demultiplex1side(barcode_info, in_fastq_file, minimap2, n_threads,
                     out_prefix):
    """Assign reads to barcodes for one barcode side.

    Extracts both tails of every read whose length is within the accepted
    range, aligns the tails to the "barcode plus anchor" sequences with
    minimap2, and stores the confidently aligned reads in
    `barcode_info.read_barcode_idx_dict` (readname -> barcode index).
    Temporary files named `<out_prefix>.tmp.<side>*` are removed at the end.
    """
    tmp_out_prefix = out_prefix + '.tmp.%s' % barcode_info.side
    read_tail_add_length = 256
    barcode_length = len(barcode_info.barcode_seq_list[0])
    anchor_length = barcode_info.anchor_seq_len

    barcode_plus_seq_file = tmp_out_prefix + '.barcode_plus%dbp.fasta' % anchor_length
    generate_barcode_plus_tail_file(barcode_info, barcode_plus_seq_file)

    amplicon_length = len(barcode_info.amplicon_seq)
    min_read_length = int(amplicon_length * 0.667)
    max_read_length = int((amplicon_length + barcode_length * 2) * 1.5)

    tk.eprint('NOTICE: length of amplicon is %d' % amplicon_length)
    tk.eprint('NOTICE: reads shorter than %d bp would be skipped' %
              min_read_length)
    tk.eprint('NOTICE: reads longer  than %d bp would be skipped' %
              max_read_length)

    read_tail_length = barcode_length + anchor_length + read_tail_add_length

    # Shared stems of the per-tail FASTQ / SAM file names (left, then right).
    tail_stems = [
        '%s.%s%dbp_tail' % (tmp_out_prefix, side, read_tail_length)
        for side in ('left', 'right')
    ]
    left_tail_fastq_file, right_tail_fastq_file = [
        stem + '.fastq' for stem in tail_stems
    ]
    tail_sam_files = [stem + '.sam' for stem in tail_stems]

    extract_fastq_tail_seq(in_fastq_file, read_tail_length, min_read_length,
                           max_read_length, left_tail_fastq_file,
                           right_tail_fastq_file)

    align_cmd_template = '%s -N 400 --cs -t %d -a -x map-ont %s %s > %s 2> /dev/null'
    tail_fastq_files = (left_tail_fastq_file, right_tail_fastq_file)
    for tail_fastq, tail_sam in zip(tail_fastq_files, tail_sam_files):
        tk.run_system_cmd(align_cmd_template %
                          (minimap2, n_threads, barcode_plus_seq_file,
                           tail_fastq, tail_sam))

    # read_barcode_idx_dict[readname] = barcode_idx
    barcode_info.read_barcode_idx_dict = dict()
    for tail_sam in tail_sam_files:
        extract_confident_reads_from_sam(
            tail_sam, barcode_length,
            barcode_info.barcode_plus_seq_to_barcode_idx_dict,
            barcode_info.read_barcode_idx_dict)

    tk.run_system_cmd('rm %s*' % tmp_out_prefix)
    return
コード例 #18
0
ファイル: ampBinner.py プロジェクト: WGLab/AmpBinner
def output_binned_reads_both1(in_fastq_file, fwd_barcode_info,
                              rev_barcode_info, out_prefix):
    """Bin reads that have a barcode assignment on at least one end.

    A read is assigned to a barcode when either its fwd-side or its rev-side
    assignment exists. Reads whose two sides disagree are counted as
    discordant and left unassigned.

    Files written:
        <out_prefix>.readname_to_barcode.txt  one line per assigned read,
                                              ordered by barcode index
        <out_prefix>.summary.txt              read count per barcode name,
                                              most abundant first
        <out_prefix>.<barcode_name>.fastq     via output_binned_fastq()

    Args:
        in_fastq_file: FASTQ file holding all input reads.
        fwd_barcode_info: barcode info object for the fwd side; its
            read_barcode_idx_dict maps readname -> barcode_idx.
        rev_barcode_info: barcode info object for the rev side.
        out_prefix: path prefix for every output file.

    Returns:
        List of per-barcode FASTQ paths, one per entry of
        fwd_barcode_info.barcode_name_list (in the same order).

    Note:
        Barcode names are always looked up in
        fwd_barcode_info.barcode_name_list, so both sides are expected to
        use the same barcode list (AmpliconBinner checks this with
        check_fwd_rev_barcodes before calling this function).
    """
    readname_to_barcode_file = out_prefix + '.readname_to_barcode.txt'
    summary_file = out_prefix + '.summary.txt'

    fwd_idx_dict = fwd_barcode_info.read_barcode_idx_dict
    rev_idx_dict = rev_barcode_info.read_barcode_idx_dict
    barcode_name_list = fwd_barcode_info.barcode_name_list

    readname_to_sample_idx_dict = dict()
    discordant_readname_set = set()

    ## fwd_barcode ##
    # Dict keys are unique, so a readname can never already be present in
    # readname_to_sample_idx_dict here; only the fwd/rev comparison matters.
    for readname, fwd_barcode_idx in fwd_idx_dict.items():
        if rev_idx_dict.get(readname, fwd_barcode_idx) != fwd_barcode_idx:
            discordant_readname_set.add(readname)
            continue
        readname_to_sample_idx_dict[readname] = fwd_barcode_idx

    ## rev_barcode ##
    # Only reads without any fwd-side assignment are left to handle; reads
    # seen on both sides were fully resolved in the loop above.
    for readname, rev_barcode_idx in rev_idx_dict.items():
        if readname in fwd_idx_dict:
            continue
        readname_to_sample_idx_dict[readname] = rev_barcode_idx

    tk.eprint('WARNING: %d reads have discordant barcodes on the two ends' %
              len(discordant_readname_set))

    barcode_read_count_dict = dict()
    readname_to_barcode_name_list = list()
    for readname, barcode_idx in readname_to_sample_idx_dict.items():
        barcode_name = barcode_name_list[barcode_idx]
        barcode_read_count_dict[barcode_name] = (
            barcode_read_count_dict.get(barcode_name, 0) + 1)
        readname_to_barcode_name_list.append(
            (barcode_idx, readname, barcode_name))

    # Stable sorts: ties keep the order in which reads/barcodes were first
    # encountered.
    readname_to_barcode_name_list.sort(key=lambda x: x[0])
    sorted_barcode_read_count_list = sorted(barcode_read_count_dict.items(),
                                            key=lambda x: x[1],
                                            reverse=True)

    # 'with' guarantees the handles are closed even if a write fails.
    with open(readname_to_barcode_file, 'w') as readname_to_barcode_fp:
        readname_to_barcode_fp.write('#readname\tbarcode_name\n')
        for _, readname, barcode_name in readname_to_barcode_name_list:
            readname_to_barcode_fp.write('%s\t%s\n' %
                                         (readname, barcode_name))

    with open(summary_file, 'w') as summary_fp:
        summary_fp.write('#barcode_name\tnum_reads\n')
        for barcode_name, num_reads in sorted_barcode_read_count_list:
            summary_fp.write('%s\t%d\n' % (barcode_name, num_reads))

    # One output FASTQ per barcode; the list index equals the barcode index.
    out_fastq_file_list = [
        out_prefix + '.%s.fastq' % barcode_name
        for barcode_name in barcode_name_list
    ]

    output_binned_fastq(in_fastq_file, readname_to_sample_idx_dict,
                        out_fastq_file_list)

    return out_fastq_file_list
コード例 #19
0
ファイル: ampBinner.py プロジェクト: WGLab/AmpBinner
def AmpliconBinner(input_args):
    """Run the amplicon binning pipeline described by input_args.

    Steps:
        1. Decide the binning mode from which barcode fasta files were given:
           'fwd_only', 'rev_only', 'both1' (barcode required on one end) or
           'both2' (barcode required on both ends, selected by
           input_args.require_two_barcodes).
        2. Create the output directory and preprocess the input FASTQ(s).
        3. Demultiplex each side for which a barcode fasta was supplied.
        4. Write the binned FASTQ files for the selected mode and remove the
           empty ones.
        5. Remove temporary files that start with '<out_dir>/<exp_name>.tmp'.

    Args:
        input_args: parsed command-line arguments. The attributes read here
            are minimap2, amp_seq_fasta, num_threads, out_dir, exp_name,
            in_fq, in_fq_list, fwd_barcode_fasta, rev_barcode_fasta and
            require_two_barcodes.

    Exits with status 1 when no barcode fasta file was supplied, or when
    mode 'both1' is used with fwd and rev barcodes that differ.
    """
    has_fwd_barcode = input_args.fwd_barcode_fasta != ''
    has_rev_barcode = input_args.rev_barcode_fasta != ''

    # Resolve the mode before touching the filesystem so that a missing
    # barcode file is reported cleanly instead of failing later with an
    # unbound 'mode' variable.
    if has_fwd_barcode and has_rev_barcode:
        if input_args.require_two_barcodes:
            mode = 'both2'
        else:
            mode = 'both1'
    elif has_fwd_barcode:
        mode = 'fwd_only'
    elif has_rev_barcode:
        mode = 'rev_only'
    else:
        tk.eprint(
            'ERROR! No barcode fasta file was supplied. Please supply the fwd barcode fasta file and/or the rev barcode fasta file.'
        )
        sys.exit(1)

    minimap2 = input_args.minimap2
    amplicon_seq_fasta_file = input_args.amp_seq_fasta
    n_threads = input_args.num_threads

    out_prefix = os.path.join(input_args.out_dir, input_args.exp_name)
    tmp_out_prefix = out_prefix + '.tmp'

    tk.create_dir(input_args.out_dir)
    in_fastq_file = preprocessing_input_files(input_args.in_fq,
                                              input_args.in_fq_list,
                                              tmp_out_prefix)

    fwd_barcode_info = BarcodeInfo()
    rev_barcode_info = BarcodeInfo()

    if has_fwd_barcode:
        fwd_barcode_info.init_from_file(input_args.fwd_barcode_fasta,
                                        amplicon_seq_fasta_file, 'fwd')

        demultiplex1side(fwd_barcode_info, in_fastq_file, minimap2, n_threads,
                         out_prefix)
        if mode == 'fwd_only':
            out_fastq_file_list = output_binned_reads_for1side(
                in_fastq_file, fwd_barcode_info, out_prefix + '.fwd')
            remove_empty_out_fastq_file(out_fastq_file_list)

    if has_rev_barcode:
        rev_barcode_info.init_from_file(input_args.rev_barcode_fasta,
                                        amplicon_seq_fasta_file, 'rev')

        demultiplex1side(rev_barcode_info, in_fastq_file, minimap2, n_threads,
                         out_prefix)
        if mode == 'rev_only':
            out_fastq_file_list = output_binned_reads_for1side(
                in_fastq_file, rev_barcode_info, out_prefix + '.rev')
            remove_empty_out_fastq_file(out_fastq_file_list)

    if mode == 'both1':
        # 'both1' maps a barcode index from either end to one shared name
        # list, so the two barcode sets have to be the same.
        fwd_rev_unmatch = check_fwd_rev_barcodes(fwd_barcode_info,
                                                 rev_barcode_info)
        if fwd_rev_unmatch:
            tk.eprint(
                '''ERROR! fwd barcodes and rev barcodes are different. Please supply the SAME barcode.fasta file if you have barcodes on both fwd and rev primers but only require barcode matching on one end.'''
            )
            tk.eprint(
                '''NOTICE: If you do have DIFFERENT barcodes on both ends and want to bin the reads for each side separately, you can run ampliconBinner twice and supply either '--fwd_barcode_fasta' or '--rev_barcode_fasta' once a time.'''
            )
            tk.eprint(
                '''NOTICE: If you have barcodes on both ends and want to require barcode matching on both ends, please supply '--require_two_barcodes'. '''
            )
            sys.exit(1)

        out_fastq_file_list = output_binned_reads_both1(
            in_fastq_file, fwd_barcode_info, rev_barcode_info,
            out_prefix + '.require1barcode')
        remove_empty_out_fastq_file(out_fastq_file_list)

    elif mode == 'both2':
        out_fastq_file_list = output_binned_reads_both2(
            in_fastq_file, fwd_barcode_info, rev_barcode_info,
            out_prefix + '.require2barcodes')
        remove_empty_out_fastq_file(out_fastq_file_list)

    # Remove every temporary file created under the tmp prefix.
    cmd = 'rm %s*' % tmp_out_prefix
    tk.run_system_cmd(cmd)