Example #1
0
def _count_kept_alms(alms, removed):
    """Return how many (ref, qry) entries of *alms* are not flagged in *removed*."""
    return sum(1 for ref in alms for qry in alms[ref]
               if not removed[ref, qry])


def _snapshot_alms(alms, removed, output_dir, optmap_i, stage):
    """Log the alignments of one filtering stage and return the logged copy.

    Runs ``copy_alms(alms, removed)`` and hands the result to ``output_alms``
    together with ``<output_dir>/opt_<optmap_i>_alms_<stage>.log``.
    """
    current_alms = copy_alms(alms, removed)
    output_alms(
        current_alms,
        output_dir + "/opt_" + str(optmap_i) + "_alms_" + stage + ".log")
    return current_alms


def pre_process(optmap_i, optmap_file, myfile, myfile2, output_dir,
                min_confidence):
    """Read the alignments of one optical map and filter them in stages.

    Stages, each followed by a ``opt_<optmap_i>_alms_<stage>.log`` dump:

    0. keep, per (ref, qry), only the alignment with the highest confidence;
    1. flag alignments whose confidence is below ``min_confidence``;
    2. rescale coordinates and derive the overhang / extent attributes;
    3. record candidate cutting sites and flag alignments that have a
       qualifying overhang on both sides;
    4. for every pair on the same ref overlapping by at least 20000, flag
       the one with the lower confidence.

    Args:
        optmap_i: identifier used only to build the output file names.
        optmap_file: tab-separated optical-map file; lines starting with '#'
            are skipped, columns 0, 4 and 5 are read as map id, label
            channel and position.
        myfile: path prefix; ``<myfile>_flip.xmap`` and ``<myfile>_r.cmap``
            are read.
        myfile2: path prefix; ``<myfile2>_key.txt`` is read.
        output_dir: directory receiving the stage logs and
            ``chimeric_pairs_<optmap_i>.log``.
        min_confidence: confidence threshold for stages 1 and 3.

    Returns:
        A tuple ``(current_alms, optmap, chimeric_pairs)``: the result of
        ``copy_alms`` after the last stage, a dict mapping map id to the
        sorted channel-"1" positions, and a list of
        ``(ref, ref_pos, qry, qry_pos)`` candidate cutting sites.

    Note:
        The surrounding file is Python 2 (the csv input is opened in 'rb'
        mode). Every ``print`` is given one pre-formatted string so the
        emitted text is unchanged while the statement form stays portable.
    """
    xmap_header_lines = 10
    key_header_lines = 4
    cmap_header_lines = 11
    minrefoverhang = 50000
    minqryoverhang = 50000
    overlap_cutoff = 20000

    # all_alms[ref] lists every alignment read for molecule ref
    all_alms = {}
    # qualify_alms[ref][qry] is the single best alignment for (ref, qry)
    qualify_alms = {}
    # removed[ref, qry] is True once the alignment (ref, qry) is discarded
    removed = {}

    # ---- collect all alignments from the .xmap file ----
    print('---------------read .xmap file-------------------')
    with open(myfile + '_flip.xmap', 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='\t')
        for _ in range(xmap_header_lines):
            next(csvreader)  # header rows carry no alignment data
        for row in csvreader:
            if not row:
                continue  # tolerate blank lines instead of failing on row[1]
            x = Alignment(int(row[1]), int(row[2]), float(row[3]),
                          float(row[4]), float(row[5]), float(row[6]),
                          row[7], float(row[8]), row[9], float(row[10]),
                          float(row[11]), int(row[12]), row[13])
            all_alms.setdefault(x.ref, []).append(x)
    num_all_alms = sum(len(all_alms[ref]) for ref in all_alms)
    print("In total, the number of alignments collected is  %s" %
          num_all_alms)

    # ---- stage 0: best alignment per contig within each molecule ----
    for ref in all_alms:
        qry_bestx = {}
        for x in all_alms[ref]:
            best = qry_bestx.get(x.qry)
            if best is None or x.confidence > best.confidence:
                qry_bestx[x.qry] = x
        # Re-inserted key by key, as before, so the iteration order of
        # qualify_alms[ref] (which the order-dependent overlap pass walks)
        # stays exactly what it was.
        qualify_alms[ref] = {}
        for qry in qry_bestx:
            qualify_alms[ref][qry] = qry_bestx[qry]

    num_qualify_alms = sum(len(qualify_alms[ref]) for ref in qualify_alms)
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            removed[ref, qry] = False
    current_alms = _snapshot_alms(qualify_alms, removed, output_dir,
                                  optmap_i, "0_initial")
    print("In total, the number of alignments in qualify_alms is  %s" %
          num_qualify_alms)

    # ---- stage 1: drop low-confidence alignments ----
    print('---------------Remove low quality alignments---------------')
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if qualify_alms[ref][qry].confidence < min_confidence:
                removed[ref, qry] = True
                print('alignment ( %s , %s ) is low quality and removed' %
                      (ref, qry))
    num_alms = _count_kept_alms(qualify_alms, removed)
    current_alms = _snapshot_alms(qualify_alms, removed, output_dir,
                                  optmap_i, "1_removed_low_conf")
    print("After removing low confidence alignments, "
          "the number of alignments is  %s" % num_alms)
    print('---------------End---------------')

    # ---- read the optical map: channel-"1" positions per map id ----
    optmap = {}
    with open(optmap_file) as f_map:
        for line in f_map:
            line = line.strip()
            # blank lines used to raise IndexError on line[0]
            if not line or line[0] == '#':
                continue
            cols = line.split('\t')
            cmap_id = int(cols[0])
            label_channel = cols[4]
            position = float(cols[5])
            # the map id is registered even when the row is not channel "1"
            positions = optmap.setdefault(cmap_id, [])
            if label_channel == "1":
                positions.append(position)
    for cmap_id in optmap:
        optmap[cmap_id].sort()

    # ---- stage 2: scaling ----
    print('---------------scaling-------------------')
    qry_len = {}
    with open(myfile2 + '_key.txt') as f_key:
        for _ in range(key_header_lines):
            f_key.readline()
        for line in f_key:
            line = line.strip()
            if not line:
                continue
            cols = line.split('\t')
            qry_id = int(cols[0])
            seq_len = int(cols[2])
            qry_len[qry_id] = seq_len
    # mean of (length reported in the cmap) / (length from the key file),
    # taken over every data row of the cmap
    scaling = 0
    num = 0
    with open(myfile + '_r.cmap') as f_q:
        for _ in range(cmap_header_lines):
            f_q.readline()
        for line in f_q:
            line = line.strip()
            if not line:
                continue
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            scaling += appr_len / qry_len[qry_id]
            num += 1
    if num:  # an empty cmap used to raise ZeroDivisionError here
        scaling /= num  # a past run gave scaling=1.02258059775
    # NOTE(review): the measured value is deliberately discarded and a
    # factor of 1.0 is used, as in the original code; the divisions below
    # are kept so re-enabling the measured factor is a one-line change.
    scaling = 1.0

    # apply the factor to the alignment coordinates ...
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            x.qrystartpos /= scaling
            x.qryendpos /= scaling
            x.qrylen /= scaling
            x.refstartpos /= scaling
            x.refendpos /= scaling
            x.reflen /= scaling
    # ... and to the optical-map positions (in place)
    for ref in optmap:
        positions = optmap[ref]
        positions[:] = [pos / scaling for pos in positions]

    print('---------------END-------------------')

    # reference-based extent of each contig and the overhangs on both sides
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            forward = (x.orientation == '+')
            if forward:
                x.qry_left_overlen = x.qrystartpos
                x.qry_right_overlen = x.qrylen - x.qryendpos
            else:
                x.qry_left_overlen = x.qrylen - x.qrystartpos
                x.qry_right_overlen = x.qryendpos
            x.start = x.refstartpos - x.qry_left_overlen
            x.end = x.refendpos + x.qry_right_overlen
            x.ref_left_overlen = x.refstartpos
            x.ref_right_overlen = x.reflen - x.refendpos
            if forward:
                x.refstart = x.qrystartpos - x.ref_left_overlen
                x.refend = x.qryendpos + x.ref_right_overlen
            else:
                x.refstart = x.qryendpos - x.ref_right_overlen
                x.refend = x.qrystartpos + x.ref_left_overlen

    num_alms = _count_kept_alms(qualify_alms, removed)
    current_alms = _snapshot_alms(qualify_alms, removed, output_dir,
                                  optmap_i, "2_scaled")
    print("After scaling, the number of alignments is  %s" % num_alms)

    # ---- read the query map: scaled marker positions per contig ----
    qry_markers = {}
    with open(myfile + '_r.cmap') as f_q:
        for _ in range(cmap_header_lines):  # 11 lines of header
            f_q.readline()
        for line in f_q:
            line = line.strip()
            if not line:
                continue
            cols = line.split('\t')
            cmap_id = int(cols[0])
            label_channel = cols[4]
            position = float(cols[5])
            if label_channel == "0":
                continue
            qry_markers.setdefault(cmap_id, []).append(position / scaling)
    for cmap_id in qry_markers:
        qry_markers[cmap_id].sort()

    # ---- stage 3: candidate cutting sites ----
    print('---------------candidate cutting sites-------------------')
    chimeric_pairs = []
    pair_log = output_dir + "/chimeric_pairs_" + str(optmap_i) + ".log"
    # 'with' guarantees the log is closed even if a helper below raises
    with open(pair_log, 'w') as fpair:
        fpair.write("ref_id\tref_pos\tqry_id\tqry_pos\n")

        def record_pair(pair):
            # remember the site, announce it, and append it to the log
            chimeric_pairs.append(pair)
            print("%s is a pair of candidate cutting sites" % (pair,))
            fpair.write("\t".join(str(v) for v in pair) + "\n")

        for ref in qualify_alms:
            for qry in qualify_alms[ref]:
                if removed[ref, qry]:
                    continue
                x = qualify_alms[ref][qry]
                # NOTE(review): strict '>' here vs. '<' in stage 1 means an
                # alignment with confidence == min_confidence survives but
                # is never tested for cutting sites; kept as in the original.
                if not (x.confidence > min_confidence):
                    continue
                ref_left_overlen = x.refstartpos
                ref_right_overlen = x.reflen - x.refendpos
                flag_left = (
                    x.qry_left_overlen > minqryoverhang
                    and ref_left_overlen > minrefoverhang
                    and markers_in_qry_left_overhang(qry_markers, x) > 0)
                if flag_left:
                    record_pair(
                        (x.ref, x.refstartpos, x.qry, x.qrystartpos))
                flag_right = (
                    x.qry_right_overlen > minqryoverhang
                    and ref_right_overlen > minrefoverhang
                    and markers_in_qry_right_overhang(qry_markers, x) > 0)
                if flag_right:
                    record_pair((x.ref, x.refendpos, x.qry, x.qryendpos))
                if flag_left and flag_right:
                    removed[ref, qry] = True
    print('---------------END-------------------')
    num_alms = _count_kept_alms(qualify_alms, removed)
    current_alms = _snapshot_alms(qualify_alms, removed, output_dir,
                                  optmap_i, "3_removed_both_overhang")
    print("After removing alignments with both overhangs, "
          "the number of alignments is  %s" % num_alms)

    # ---- stage 4: resolve overlapping alignments on the same ref ----
    for r in qualify_alms:
        group = qualify_alms[r]
        for q1 in group:
            if removed[r, q1]:
                continue
            x = group[q1]
            for q2 in group:
                # each unordered pair is examined once, with q1 < q2
                if removed[r, q2] or q1 >= q2:
                    continue
                y = group[q2]
                if x.refstartpos <= y.refstartpos <= x.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - y.refstartpos
                elif y.refstartpos <= x.refstartpos <= y.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - x.refstartpos
                else:
                    overlap = 0
                if overlap >= overlap_cutoff:
                    # NOTE(review): when x itself is flagged here the inner
                    # loop keeps comparing it against later q2; this matches
                    # the original behaviour - confirm it is intended.
                    if x.confidence < y.confidence:
                        removed[r, q1] = True
                    else:
                        removed[r, q2] = True
    num_alms = _count_kept_alms(qualify_alms, removed)
    current_alms = _snapshot_alms(qualify_alms, removed, output_dir,
                                  optmap_i, "4_solved_overlaps")
    print("After removing one of two overlap alignments, "
          "the number of alignments is  %s" % num_alms)

    return current_alms, optmap, chimeric_pairs