def mping_genotyper_mp_runner(ril, bam_ref, bam_pseudo, mping2ID_0, binmap, snpmap, bamcheck_file_ref, bamcheck_file_pseudo):
    """Collect the genotype and junction flags of every mPing for one RIL.

    Both BAM files must exist on disk. When either one is missing a message
    is printed and the RIL is returned with an empty result list.

    Args:
        ril: RIL identifier; used in messages and appended to both bamcheck
            file prefixes.
        bam_ref: path of the BAM handed to ``bamcheck``.
        bam_pseudo: path of the BAM handed to ``bamcheck_ref``.
        mping2ID_0: mapping from mPing key to its ID_0 value.
        binmap: passed through unchanged to ``genotyping_SNP``.
        snpmap: passed through unchanged to ``genotyping_SNP``.
        bamcheck_file_ref: prefix of the per-RIL bamcheck file used with
            ``bam_ref``.
        bamcheck_file_pseudo: prefix of the per-RIL bamcheck file used with
            ``bam_pseudo``.

    Returns:
        ``[ril, results]`` where ``results`` holds one
        ``[mping, genotype, l_flag, r_flag, ref_flag]`` list per mPing, in
        sorted key order.
    """
    results = []

    # Guard clauses: nothing can be checked unless both BAM files are present.
    if not os.path.isfile(bam_ref):
        print('reference bam file not found for rils: RIL%s' % (ril))
        return [ril, results]
    if not os.path.isfile(bam_pseudo):
        print('pseudo bam file not found for rils: RIL%s' % (ril))
        return [ril, results]

    # Each RIL gets its own bamcheck files: '<prefix>.<ril>.txt'.
    bamcheck_file_pseudo = '%s.%s.txt' % (bamcheck_file_pseudo, ril)
    bamcheck_file_ref = '%s.%s.txt' % (bamcheck_file_ref, ril)

    for mping in sorted(mping2ID_0):
        mping_id = mping2ID_0[mping]
        # genotype: 0 ref, 1 non_ref, 3 unknown
        genotype = genotyping_SNP(ril, mping_id, binmap, snpmap)
        l_flag, r_flag = bamcheck_ref(bam_pseudo, mping, bamcheck_file_pseudo, ril)
        ref_flag = bamcheck(bam_ref, mping_id, bamcheck_file_ref, ril)
        results.append([mping, genotype, l_flag, r_flag, ref_flag])
        print('%s\t%s\t%s\t%s' % (ril, mping, l_flag, r_flag))
    return [ril, results]
def main():
    """Print the bamcheck flag at the fixed Ping locus for every BAM in a directory.

    Command-line options:
        -i/--input   directory that is globbed for ``*.bam`` (required).
        -o/--output  accepted; defaults to ``ping_coverage.matrix``.
        -v           sets ``args.verbose``.

    One ``strain<TAB>flag`` line is written to stdout per BAM, in sorted
    path order. ``usage()`` is called and the process exits with status 2
    when no input directory is supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input')
    parser.add_argument('-o', '--output')
    parser.add_argument('-v', dest='verbose', action='store_true')
    args = parser.parse_args()

    # An absent or empty directory must stop here; an empty string would
    # otherwise turn the glob below into '/*.bam'.
    if not args.input:
        usage()
        sys.exit(2)

    # NOTE(review): args.output is never read in this function -- results go
    # to stdout. The default is kept so the option keeps parsing as before.
    if not args.output:
        args.output = 'ping_coverage.matrix'

    mping = "chr01:2640500-2640502"
    bamcheck_file_ref = 'temp_bam_check_file.txt'
    # Strip the literal file-name suffix only: dots escaped, anchored at the end.
    suffix = re.compile(r'\.special_ping_locus\.bam$')

    for bam in sorted(glob.glob('%s/*.bam' % (args.input))):
        strain = suffix.sub('', os.path.split(bam)[1])
        ref_flag = bamcheck(bam, mping, bamcheck_file_ref, strain)
        # 1 is insertion, 4 is excision or no insertion, 2 and 3 are not sure
        print('{}\t{}'.format(strain, ref_flag))
def _fraction_of(count, total):
    """Return ``count / total`` as a float, or 0.0 when ``total`` is zero."""
    if not total:
        return 0.0
    return count / float(total)


def _status_triplet(covered, clipped, total):
    """Format ``covered:clipped:unknown``; unknown is ``total`` minus the other two."""
    return '%s:%s:%s' % (covered, clipped, total - covered - clipped)


def main():
    """Build per-RIL mPing status matrices from reference and pseudogenome BAMs.

    Command-line options:
        --bam_ref     directory holding ``GN<ril>.bam`` files (must be given).
        --bam_pseudo  directory holding ``RIL<ril>.bam`` files (must be given).
        --gff_ref, --gff_pseudo, --bin_map, --snp_map, --project
                      optional; empty values fall back to the defaults below.
        -v            sets ``args.verbose``.

    Files written:
        ``<project>.mping_status.matrix.txt``: one row per RIL holding the bin
        genotype and the upstream, downstream and reference status of every
        mPing, followed by Total/NB/HEG4/NA summary rows.
        ``<project>_mPing/<mping>.matrix.csv``: the same four values for one
        mPing, one RIL per row, followed by that mPing's four summary rows.

    ``usage()`` is called and the process exits with status 2 when either BAM
    directory option is absent. Progress messages are printed to stdout.
    """
    parser = argparse.ArgumentParser()
    for option in ('--bam_ref', '--bam_pseudo', '--gff_ref', '--gff_pseudo',
                   '--bin_map', '--snp_map', '--project'):
        parser.add_argument(option)
    parser.add_argument('-v', dest='verbose', action='store_true')
    args = parser.parse_args()

    # Both BAM directories have to be named on the command line.
    if args.bam_pseudo is None or args.bam_ref is None:
        usage()
        sys.exit(2)

    fallbacks = (
        ('project', 'mPing_boundary'),
        ('gff_ref', '../input/HEG4.ALL.mping.non-ref.gff'),
        ('gff_pseudo', '../input/MSU_r7.Pseudo_mPing.gff'),
        ('bam_ref', '../input/RILs_ALL_bam'),
        ('bam_pseudo', '../input/RILs_ALL_unmapped_mping_bam'),
        ('bin_map', '../input/MPR.geno.bin'),
        ('snp_map', '../input/MPR.gene.data'),
    )
    for name, value in fallbacks:
        if not getattr(args, name):
            setattr(args, name, value)

    # The mPing gff from the pseudogenome is the key everything is projected to.
    # ID_0 is the mping id from the original call in HEG4, e.g. Chr1.1132977
    mping2ID_0 = defaultdict(str)
    id_mapping(args.gff_pseudo, mping2ID_0)

    # Bin map and SNP genotype.
    # NOTE(review): the SNP map is built from args.bin_map; args.snp_map and
    # args.gff_ref are never read in this function -- confirm this is intended.
    binmap = convert_MAP(args.bin_map)
    snpmap = convert_MAP_SNP(args.bin_map)

    bamcheck_file_pseudo = '%s.bamcheck_pseudo.txt' % (args.project)
    bamcheck_file_ref = '%s.bamcheck_ref.txt' % (args.project)
    # ril -> mPing ID_0 -> 'up'/'down' -> decoded status in the pseudogenome BAM
    mping_status = defaultdict(lambda: defaultdict(lambda: defaultdict(str)))
    # ril -> mPing ID_0 -> decoded status in the reference BAM
    mping_status_ref = defaultdict(lambda: defaultdict(str))
    # ril -> mPing ID_0 -> decoded bin genotype
    mping_bin_gt = defaultdict(lambda: defaultdict(str))

    # Go through the RILs; for each one determine the status of every mPing.
    bams = glob.glob('%s/*.bam' % (args.bam_pseudo))
    for ril in sorted(get_rils(bams)):
        bam_ref = '%s/GN%s.bam' % (args.bam_ref, ril)
        bam_pseudo = '%s/RIL%s.bam' % (args.bam_pseudo, ril)
        print('ril: %s, %s' % (ril, bam_pseudo))
        if not os.path.isfile(bam_pseudo):
            print('bam file not found for rils: RIL%s' % (ril))
            continue
        for mping in sorted(mping2ID_0):
            mping_id = mping2ID_0[mping]
            # genotype: 0 ref, 1 non_ref, 3 unknown
            genotype = genotyping_SNP(ril, mping, binmap, snpmap)
            l_flag, r_flag = bamcheck_ref(bam_pseudo, mping, bamcheck_file_pseudo, ril)
            ref_flag = bamcheck(bam_ref, mping_id, bamcheck_file_ref, ril)
            mping_status[ril][mping_id]['up'] = decode(l_flag)
            mping_status[ril][mping_id]['down'] = decode(r_flag)
            mping_status_ref[ril][mping_id] = decode(ref_flag)
            mping_bin_gt[ril][mping_id] = decode_gt(genotype)

    mping_ranked = sort_mping_chr(mping2ID_0)
    ril_order = sorted(mping_status, key=int)

    # Whole-population matrix: one row per RIL, four cells per mPing.
    # NOTE(review): each mPing contributes five header cells (its name plus
    # four labels) but four data cells per row -- confirm the intended layout.
    matrix_file = '%s.mping_status.matrix.txt' % (args.project)
    with open(matrix_file, 'w') as ofile:
        header = ['mPing']
        for m in mping_ranked:
            header.append('%s,Genotype,Pseudo_up,Pseudo_down,Ref' % (m))
        ofile.write(','.join(header) + '\n')
        for ril in ril_order:
            row = ['RIL%s' % (ril)]
            for mping in mping_ranked:
                row.append(mping_bin_gt[ril][mping])
                row.append(mping_status[ril][mping]['up'])
                row.append(mping_status[ril][mping]['down'])
                row.append(mping_status_ref[ril][mping])
            ofile.write(','.join(row) + '\n')

    # One csv per mPing, plus summary cells collected for the big matrix.
    outdir = '%s_mPing' % (os.path.abspath(args.project))
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    columns = ('up', 'down', 'ref')
    # (row label, key in `summary`, index into sum_gt)
    genotype_rows = (('NB', 'nb', 0), ('HEG4', 'heg4', 1), ('NA', 'na', 2))
    summary = {'total': [], 'nb': [], 'heg4': [], 'na': []}

    for mping in mping_ranked:
        # RIL counts by genotype: NB, HEG4, anything else.
        sum_gt = [0, 0, 0]
        # (column, genotype, status) -> number of RILs; only the statuses
        # 'covered' and 'clipped' are counted, everything else is "unknown".
        tally = defaultdict(int)
        mping_name = mping.replace(':', '_').replace('-', '_')

        with open('%s/%s.matrix.csv' % (outdir, mping_name), 'w') as ofile:
            ofile.write('%s,Genotype_Bin,Pseudo_mPing_status_up,Pseudo_mPing_status_down,Ref_mPing_status\n' % (mping))
            for ril in ril_order:
                # Status in pseudo and ref genome have different meanings:
                # in the pseudogenome a covered junction indicates insertion,
                # in the reference genome a covered junction indicates no
                # insertion or excision.
                gt = mping_bin_gt[ril][mping]
                statuses = (mping_status[ril][mping]['up'],
                            mping_status[ril][mping]['down'],
                            mping_status_ref[ril][mping])
                ofile.write('RIL%s,%s,%s,%s,%s\n' % ((ril, gt) + statuses))

                if gt == 'NB':
                    sum_gt[0] += 1
                elif gt == 'HEG4':
                    sum_gt[1] += 1
                else:
                    sum_gt[2] += 1

                for column, status in zip(columns, statuses):
                    if status in ('covered', 'clipped'):
                        tally[(column, gt, status)] += 1

            # Total line: fraction of RILs genotyped as NB or HEG4, then the
            # fraction with a covered or clipped junction in each column.
            # With no RIL at all every fraction is reported as 0.0.
            n_rils = sum(sum_gt)
            total_cells = [_fraction_of(sum_gt[0] + sum_gt[1], n_rils)]
            for column in columns:
                seen = [tally[(column, name, 'covered')] + tally[(column, name, 'clipped')]
                        for name in ('NB', 'HEG4', 'NA')]
                # Added left to right on purpose so rounding stays unchanged.
                total_cells.append(_fraction_of(seen[0], n_rils)
                                   + _fraction_of(seen[1], n_rils)
                                   + _fraction_of(seen[2], n_rils))
            total_line = '%s,%s,%s,%s' % tuple(total_cells)
            ofile.write('Total,%s\n' % (total_line))
            summary['total'].append(total_line)

            # NB / HEG4 / NA lines: RIL count, then covered:clipped:unknown
            # for the upstream, downstream and reference columns.
            for label, key, index in genotype_rows:
                cells = [sum_gt[index]]
                for column in columns:
                    cells.append(_status_triplet(tally[(column, label, 'covered')],
                                                 tally[(column, label, 'clipped')],
                                                 sum_gt[index]))
                line = '%s,%s,%s,%s' % tuple(cells)
                ofile.write('%s,%s\n' % (label, line))
                summary[key].append(line)

    # Add the per-mPing summaries to the big matrix.
    with open(matrix_file, 'a') as ofile:
        for label, key in (('Total', 'total'), ('NB', 'nb'), ('HEG4', 'heg4'), ('NA', 'na')):
            ofile.write('%s,%s\n' % (label, ','.join(summary[key])))
def main():
    """Compare two mPing GFF call sets and check unique calls in the other BAM.

    Command-line options:
        --ref_gff, --qry_gff  mPing GFF files parsed with ``gff_parser``.
        --ref_bam, --qry_bam  BAM files passed to ``bamcheck``.
        --project             output prefix, defaults to ``HEG4vsEG4``.
        -v                    sets ``args.verbose``.

    Files written:
        ``<project>.mPing.Excision.table``: one tab-separated row per mPing
        with the columns mPing, present-in-ref (1/0), present-in-qry (1/0),
        bamcheck flag and decoded flag. mPings present in both sets get
        ``NA`` for the last two columns. Rows are in sorted mPing order.
        ``<project>.mPing.Excision.sum``: counts of reference, query, shared
        and unique mPings, and how many unique ones had flag 0 (counted as
        "footprint") in the other strain's BAM.

    ``usage()`` is called and the process exits with status 2 when any of
    the four GFF/BAM options is missing or empty.
    """
    parser = argparse.ArgumentParser()
    for option in ('--ref_gff', '--qry_gff', '--ref_bam', '--qry_bam', '--project'):
        parser.add_argument(option)
    parser.add_argument('-v', dest='verbose', action='store_true')
    args = parser.parse_args()

    # All four inputs are used unconditionally below, so check them up front.
    if not all((args.ref_gff, args.qry_gff, args.ref_bam, args.qry_bam)):
        usage()
        sys.exit(2)
    if not args.project:
        args.project = 'HEG4vsEG4'

    # Summary counters.
    shared = 0
    ref_sum = {'unique': 0, 'footprint': 0}
    qry_sum = {'unique': 0, 'footprint': 0}

    # Bamcheck files are named after each BAM's base name without extension.
    bamcheck_file_ref = '%s.bamcheck.txt' % (
        os.path.splitext(os.path.split(args.ref_bam)[1])[0])
    bamcheck_file_qry = '%s.bamcheck.txt' % (
        os.path.splitext(os.path.split(args.qry_bam)[1])[0])

    # Parse the mPing gff files.
    ref_gff_dict = gff_parser(args.ref_gff)
    qry_gff_dict = gff_parser(args.qry_gff)

    # Union of the mPings from both strains, sorted for reproducible output.
    mpings_all = sorted(set(ref_gff_dict) | set(qry_gff_dict))

    # Classify shared and unique mPings; each unique mPing is checked in the
    # BAM of the strain that lacks it.
    with open('%s.mPing.Excision.table' % (args.project), 'w') as table:
        for mping in mpings_all:
            in_ref = mping in ref_gff_dict
            in_qry = mping in qry_gff_dict
            if in_ref and in_qry:
                table.write('%s\t%s\t%s\tNA\tNA\n' % (mping, 1, 1))
                shared += 1
                continue

            # Every key comes from one of the two dicts, so not-in-ref means
            # the mPing is unique to the query set.
            if in_ref:
                record = ref_gff_dict[mping][0]
                other_bam, check_file, label, counts = (
                    args.qry_bam, bamcheck_file_qry, 'qry', ref_sum)
            else:
                record = qry_gff_dict[mping][0]
                other_bam, check_file, label, counts = (
                    args.ref_bam, bamcheck_file_ref, 'ref', qry_sum)

            # Fields 0, 3 and 4 of the record form the 'seq:start-end' locus.
            mping_format = '%s:%s-%s' % (record[0], record[3], record[4])
            flag = bamcheck(other_bam, mping_format, check_file, label)
            table.write('%s\t%s\t%s\t%s\t%s\n' % (
                mping, int(in_ref), int(in_qry), flag, decode(flag)))
            counts['unique'] += 1
            if int(flag) == 0:
                counts['footprint'] += 1

    # Output summary.
    with open('%s.mPing.Excision.sum' % (args.project), 'w') as report:
        report.write('Number of Reference mPing: %s\n' % (len(ref_gff_dict)))
        report.write('Number of Query mPing: %s\n' % (len(qry_gff_dict)))
        report.write('Number of Shared mPing: %s\n' % (shared))
        report.write('Number of unique Ref mPing: %s and %s with footprint in Qry\n' % (
            ref_sum['unique'], ref_sum['footprint']))
        report.write('Number of unique Qry mPing: %s and %s with footprint in Ref\n' % (
            qry_sum['unique'], qry_sum['footprint']))