def pre_process(optmap_i, optmap_file, myfile, myfile2, output_dir, min_confidence):
    """Filter the alignments of one optical map and collect candidate cut sites.

    Inputs read:
      * ``myfile + '_flip.xmap'``  -- tab-separated alignments (10 header rows).
      * ``optmap_file``            -- optical map; lines starting with '#' are
                                      skipped, label-channel "1" positions kept.
      * ``myfile2 + '_key.txt'``   -- id/length table (4 header lines).
      * ``myfile + '_r.cmap'``     -- query map (11 header lines).

    Outputs written under ``output_dir``:
      * ``opt_<optmap_i>_alms_<stage>.log`` after each filtering stage (via
        ``copy_alms`` / ``output_alms``).
      * ``chimeric_pairs_<optmap_i>.log`` with one line per candidate cut site.

    Filtering stages, in order:
      0. keep, per (ref, qry), only the alignment with the highest confidence
         (the first one wins on ties);
      1. flag alignments whose confidence is below ``min_confidence``;
      2. divide coordinates by the scaling factor (currently pinned to 1.0)
         and derive overhang / projected coordinates on each alignment;
      3. record candidate cut sites and flag alignments that overhang on both
         sides;
      4. for two alignments on the same ref overlapping by >= 20000, flag the
         less confident one.

    Args:
        optmap_i: identifier of the optical map; only used (via ``str``) in
            the names of the log files.
        optmap_file: path of the optical map file.
        myfile: path prefix of the ``_flip.xmap`` and ``_r.cmap`` files.
        myfile2: path prefix of the ``_key.txt`` file.
        output_dir: directory that receives the log files.
        min_confidence: confidence threshold for an alignment.

    Returns:
        ``(current_alms, optmap, chimeric_pairs)`` where ``current_alms`` is
        what ``copy_alms`` returns after the last stage, ``optmap`` maps each
        CMapId to its sorted list of positions, and ``chimeric_pairs`` is a
        list of ``(ref, ref_pos, qry, qry_pos)`` tuples.

    Every ``print`` call below receives one pre-formatted string, so the output
    is the same text the original print statements produced.
    """
    header_lines = 10        # header rows at the top of the .xmap file
    key_header_lines = 4     # header lines of the _key.txt file
    cmap_header_lines = 11   # header lines of the _r.cmap file
    minrefoverhang = 50000
    minqryoverhang = 50000
    overlap_limit = 20000    # overlaps of at least this size are resolved

    all_alms = {}      # ref -> list of every alignment read for that ref
    qualify_alms = {}  # ref -> {qry: best alignment of qry on ref}
    removed = {}       # removed[ref, qry] is True once that alignment is dropped

    def report(message, value):
        # Same layout as the statement form "print message, value":
        # the two items separated by a single space.
        print("%s %s" % (message, value))

    def snapshot(stage, message):
        """Log the surviving alignments of a stage and return them."""
        survivors_count = sum(
            1
            for ref in qualify_alms
            for qry in qualify_alms[ref]
            if not removed[ref, qry]
        )
        survivors = copy_alms(qualify_alms, removed)
        output_alms(
            survivors,
            output_dir + "/opt_" + str(optmap_i) + "_alms_" + stage + ".log")
        report(message, survivors_count)
        return survivors

    def record_pair(out, pair):
        """Remember one candidate cut site, echo it and append it to the log."""
        chimeric_pairs.append(pair)
        # str(tuple) followed by the text, as the print statement emitted it.
        print("%s is a pair of candidate cutting sites" % (pair,))
        out.write("%s\t%s\t%s\t%s\n" % pair)

    # ---- collect the alignments, grouped by ref -------------------------
    print('---------------read .xmap file-------------------')
    # Binary mode is what the Python 2 csv module expects.
    with open(myfile + '_flip.xmap', 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='\t')
        for _ in range(header_lines):
            next(csvreader)
        for row in csvreader:
            if not row:
                # Blank line (e.g. at the end of the file): nothing to parse.
                continue
            x = Alignment(int(row[1]), int(row[2]), float(row[3]),
                          float(row[4]), float(row[5]), float(row[6]),
                          row[7], float(row[8]), row[9], float(row[10]),
                          float(row[11]), int(row[12]), row[13])
            all_alms.setdefault(x.ref, []).append(x)

    report("In total, the number of alignments collected is ",
           sum(len(group) for group in all_alms.values()))

    # ---- stage 0: best alignment per (ref, qry) -------------------------
    for ref in all_alms:
        qry_bestx = {}
        for x in all_alms[ref]:
            best = qry_bestx.get(x.qry)
            if best is None or x.confidence > best.confidence:
                qry_bestx[x.qry] = x
        # Filled entry by entry so that the dict is built exactly as before
        # (iteration order of plain dicts can depend on how they were built).
        qualify_alms[ref] = {}
        for qry in qry_bestx:
            qualify_alms[ref][qry] = qry_bestx[qry]

    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            removed[ref, qry] = False

    current_alms = snapshot(
        "0_initial",
        "In total, the number of alignments in qualify_alms is ")

    # ---- stage 1: drop low-confidence alignments ------------------------
    print('---------------Remove low quality alignments---------------')
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if qualify_alms[ref][qry].confidence < min_confidence:
                removed[ref, qry] = True
                print("alignment ( %s , %s ) is low quality and removed"
                      % (ref, qry))
    current_alms = snapshot(
        "1_removed_low_conf",
        "After removing low confidence alignments, the number of alignments is ")
    print('---------------End---------------')

    # ---- read the optical map -------------------------------------------
    optmap = {}
    with open(optmap_file) as f_map:
        for line in f_map:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            cols = line.split('\t')
            cmap_id = int(cols[0])
            label_channel = cols[4]
            position = float(cols[5])
            # Every CMapId gets an entry, even if it has no channel-"1" label.
            positions = optmap.setdefault(cmap_id, [])
            if label_channel == "1":
                positions.append(position)
    for cmap_id in optmap:
        optmap[cmap_id].sort()

    # ---- stage 2: scaling -----------------------------------------------
    print('---------------scaling-------------------')
    qry_len = {}
    with open(myfile2 + '_key.txt') as f_key:
        for _ in range(key_header_lines):
            f_key.readline()
        for line in f_key:
            line = line.strip()
            if not line:
                continue
            cols = line.split('\t')
            qry_id = int(cols[0])
            seq_len = int(cols[2])
            qry_len[qry_id] = seq_len

    # Mean of (length in the cmap) / (length in the key file) over all rows.
    scaling = 0
    num = 0
    with open(myfile + '_r.cmap') as f_q:
        for _ in range(cmap_header_lines):
            f_q.readline()
        for line in f_q:
            line = line.strip()
            if not line:
                continue
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            scaling += appr_len / qry_len[qry_id]
            num += 1
    if num:
        # Guarded: an empty cmap used to raise ZeroDivisionError here.
        scaling /= num
    # NOTE(review): the estimate above is deliberately discarded -- the factor
    # is pinned to 1.0 (an earlier hard-coded value was 1.02258059775). The
    # estimate is still computed so the key file keeps being read and checked.
    scaling = 1.0

    # Apply the factor to the alignment coordinates ...
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            x.qrystartpos /= scaling
            x.qryendpos /= scaling
            x.qrylen /= scaling
            x.refstartpos /= scaling
            x.refendpos /= scaling
            x.reflen /= scaling
    # ... and to the optical map positions (lists are updated in place).
    for ref in optmap:
        positions = optmap[ref]
        positions[:] = [position / scaling for position in positions]
    print('---------------END-------------------')

    # Overhangs on both sides and the coordinates of each sequence projected
    # onto the other one.
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            forward = (x.orientation == '+')
            if forward:
                x.qry_left_overlen = x.qrystartpos
                x.qry_right_overlen = x.qrylen - x.qryendpos
            else:
                x.qry_left_overlen = x.qrylen - x.qrystartpos
                x.qry_right_overlen = x.qryendpos
            x.start = x.refstartpos - x.qry_left_overlen
            x.end = x.refendpos + x.qry_right_overlen
            x.ref_left_overlen = x.refstartpos
            x.ref_right_overlen = x.reflen - x.refendpos
            if forward:
                x.refstart = x.qrystartpos - x.ref_left_overlen
                x.refend = x.qryendpos + x.ref_right_overlen
            else:
                x.refstart = x.qryendpos - x.ref_right_overlen
                x.refend = x.qrystartpos + x.ref_left_overlen

    current_alms = snapshot(
        "2_scaled",
        "After scaling, the number of alignments is ")

    # ---- read the query map markers -------------------------------------
    qry_markers = {}
    with open(myfile + '_r.cmap') as f_q:
        for _ in range(cmap_header_lines):  # 11 lines of header
            f_q.readline()
        for line in f_q:
            line = line.strip()
            if not line:
                continue
            cols = line.split('\t')
            cmap_id = int(cols[0])
            label_channel = cols[4]
            position = float(cols[5])
            if label_channel == "0":
                continue
            qry_markers.setdefault(cmap_id, []).append(position / scaling)
    for cmap_id in qry_markers:
        qry_markers[cmap_id].sort()

    # ---- stage 3: candidate cutting sites -------------------------------
    print('---------------candidate cutting sites-------------------')
    chimeric_pairs = []
    pairs_path = output_dir + "/chimeric_pairs_" + str(optmap_i) + ".log"
    # "with" guarantees the log is closed even if a helper raises mid-loop.
    with open(pairs_path, 'w') as fpair:
        fpair.write("ref_id\tref_pos\tqry_id\tqry_pos\n")
        for ref in qualify_alms:
            for qry in qualify_alms[ref]:
                if removed[ref, qry]:
                    continue
                x = qualify_alms[ref][qry]
                if not (x.confidence > min_confidence):
                    continue
                ref_left_overlen = x.refstartpos
                ref_right_overlen = x.reflen - x.refendpos

                # A side is a candidate when both sequences overhang there
                # and the query overhang contains at least one marker.
                flag_left = False
                if (x.qry_left_overlen > minqryoverhang
                        and ref_left_overlen > minrefoverhang
                        and markers_in_qry_left_overhang(qry_markers, x) > 0):
                    flag_left = True
                    record_pair(
                        fpair,
                        (x.ref, x.refstartpos, x.qry, x.qrystartpos))

                flag_right = False
                if (x.qry_right_overlen > minqryoverhang
                        and ref_right_overlen > minrefoverhang
                        and markers_in_qry_right_overhang(qry_markers, x) > 0):
                    flag_right = True
                    record_pair(
                        fpair,
                        (x.ref, x.refendpos, x.qry, x.qryendpos))

                if flag_left and flag_right:
                    removed[ref, qry] = True
    print('---------------END-------------------')

    current_alms = snapshot(
        "3_removed_both_overhang",
        "After removing alignments with both overhangs, the number of alignments is ")

    # ---- stage 4: resolve overlapping alignments on the same ref --------
    for r in qualify_alms:
        for q1 in qualify_alms[r]:
            if removed[r, q1]:
                continue
            x = qualify_alms[r][q1]
            for q2 in qualify_alms[r]:
                # Each unordered pair is examined once, with q1 < q2.
                if removed[r, q2] or q1 >= q2:
                    continue
                y = qualify_alms[r][q2]
                if x.refstartpos <= y.refstartpos <= x.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - y.refstartpos
                elif y.refstartpos <= x.refstartpos <= y.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - x.refstartpos
                else:
                    overlap = 0
                if overlap >= overlap_limit:
                    # Drop the less confident alignment; q2 goes on a tie.
                    if x.confidence < y.confidence:
                        removed[r, q1] = True
                    else:
                        removed[r, q2] = True

    current_alms = snapshot(
        "4_solved_overlaps",
        "After removing one of two overlap alignments, the number of alignments is ")

    return current_alms, optmap, chimeric_pairs