Example #1
0
def pre_process(optmap_i, optmap_file, myfile, myfile2, output_dir,
                min_confidence):
    """Load and filter contig-vs-optical-map alignments for one optical map.

    Pipeline (each stage dumps an ``opt_<i>_alms_*.log`` file to output_dir):
      1. read ``myfile + '_flip.xmap'`` (10 header lines, tab-separated);
      2. keep only the highest-confidence alignment per (molecule, contig);
      3. drop alignments with confidence below ``min_confidence``;
      4. read the optical map markers from ``optmap_file`` (CMAP format);
      5. compute a qry/ref length scaling factor (currently forced to 1.0,
         see NOTE below) and rescale alignment and marker coordinates;
      6. derive reference-based start/end coordinates for each contig;
      7. record candidate chimeric cutting sites where an alignment has a
         large unaligned overhang on both the ref and qry side (with at
         least one qry marker in the overhang); alignments chimeric on
         BOTH ends are removed;
      8. for overlapping alignments (>= 20 kb overlap on the same ref),
         remove the lower-confidence one.

    Parameters:
        optmap_i: index of this optical map; used only in output file names.
        optmap_file: path to the optical map CMAP file.
        myfile: prefix for alignment-side files (``_flip.xmap``, ``_r.cmap``).
        myfile2: prefix for the key file (``_key.txt``) mapping qry id to
            sequence length.
        output_dir: directory for the intermediate log files.
        min_confidence: alignments below this confidence are discarded.

    Returns:
        (current_alms, optmap, chimeric_pairs) where current_alms maps
        ref id -> {qry id -> Alignment} for the surviving alignments,
        optmap maps CMapId -> sorted list of marker positions, and
        chimeric_pairs is a list of (ref, ref_pos, qry, qry_pos) tuples.
    """
    header_lines = 10
    header = []
    # overhangs (in bp) beyond these thresholds are treated as candidate
    # chimeric joins in stage 7
    minrefoverhang = 50000
    minqryoverhang = 50000

    all_alms = {
    }  # stores all the Alignments for all groups, all_groups[ref] should contain molecule ref
    qualify_alms = {
    }  # only keep one alignment(the one with highest confidence) for each contig in one molecule
    removed = {
    }  # removed[ref,qry] == True means alignment for (ref, qry) is already removed

    # collecting alignments and store in all_groups
    print '---------------read .xmap file-------------------'
    with open(myfile + '_flip.xmap', 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='\t')
        for i in range(header_lines):  # 10 lines of header
            header.append(csvreader.next())  # save them
        # read the first non-header line
        while True:
            try:
                row = csvreader.next()
                # row[1:14] feed the Alignment constructor; the column
                # meanings follow the XMAP format (ids, positions,
                # orientation, confidence, lengths, label channel, hits)
                x = Alignment(int(row[1]), int(row[2]), float(row[3]),
                              float(row[4]), float(row[5]), float(row[6]),
                              row[7], float(row[8]), row[9], float(row[10]),
                              float(row[11]), int(row[12]), row[13])
                if x.ref not in all_alms:
                    all_alms[x.ref] = [x]
                else:
                    all_alms[x.ref].append(x)
            except StopIteration:
                break
    num_all_alms = 0
    for ref in all_alms:
        num_all_alms += len(all_alms[ref])
    print "In total, the number of alignments collected is ", num_all_alms

    # only keep one alignment(the one with highest confidence) for each contig in one molecule
    for ref in all_alms:
        group = all_alms[ref]
        qry_bestx = {}  # qry id -> best Alignment seen so far for this ref
        for x in group:
            if x.qry not in qry_bestx:
                qry_bestx[x.qry] = x
            else:
                if x.confidence > qry_bestx[x.qry].confidence:
                    qry_bestx[x.qry] = x

        qualify_alms[ref] = {}
        for qry in qry_bestx:
            qualify_alms[ref][qry] = qry_bestx[qry]

    num_qualify_alms = 0
    for ref in qualify_alms:
        num_qualify_alms += len(qualify_alms[ref])
    # initialize removed array
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            removed[ref, qry] = False
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/opt_" + str(optmap_i) + "_alms_0_initial.log")
    print "In total, the number of alignments in qualify_alms is ", num_qualify_alms

    # remove low confidence alignments
    print '---------------Remove low quality alignments---------------'
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if x.confidence < min_confidence:
                removed[ref, qry] = True
                print 'alignment (', ref, ',', qry, ') is low quality and removed'
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms,
        output_dir + "/opt_" + str(optmap_i) + "_alms_1_removed_low_conf.log")
    print "After removing low confidence alignments, the number of alignments is ", num_alms
    print '---------------End---------------'

    # read optical map: keep only channel-1 label positions per CMapId
    optmap = {}
    with open(optmap_file) as f_map:
        for line in f_map:
            line = line.strip()
            # NOTE(review): raises IndexError on an empty line — assumes the
            # CMAP file contains no blank lines; verify against real inputs
            if line[0] == '#':
                continue
            cols = line.split('\t')
            CMapId = int(cols[0])
            LabelChannel = cols[4]
            Position = float(cols[5])

            if CMapId not in optmap:
                optmap[CMapId] = []
            if LabelChannel == "1":
                optmap[CMapId].append(Position)
    for CMapId in optmap:
        optmap[CMapId].sort()

    print '---------------scaling-------------------'
    # calculating scaling: mean ratio of approximate (map) length to true
    # sequence length over all qry contigs
    qry_len = {}
    with open(myfile2 + '_key.txt') as f_key:
        for i in range(0, 4):  # 4 header lines
            f_key.readline()
        for line in f_key:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            seq_len = int(cols[2])
            qry_len[qry_id] = seq_len
    scaling = 0
    num = 0
    with open(myfile + '_r.cmap') as f_q:
        for i in range(0, 11):  # 11 header lines
            f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            seq_len = qry_len[qry_id]
            scaling += appr_len / seq_len
            num += 1
    scaling /= num  # scaling=1.02258059775
    # NOTE(review): the computed scaling is immediately overridden — the
    # divisions below become no-ops; presumably scaling was deliberately
    # disabled during experimentation. Confirm before removing either line.
    scaling = 1.0
    # use scaling to adjust coordinates of alignments
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            x.qrystartpos /= scaling
            x.qryendpos /= scaling
            x.qrylen /= scaling
            x.refstartpos /= scaling
            x.refendpos /= scaling
            x.reflen /= scaling

    # use scaling to adjust coordinates of optical map
    for ref in optmap:
        for i in range(0, len(optmap[ref])):
            optmap[ref][i] /= scaling

    print '---------------END-------------------'

    # find the reference-based coordinates for each contig: extend the
    # aligned interval by the unaligned overhang on each side
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if (x.orientation == '+'):
                x.qry_left_overlen = x.qrystartpos
                x.qry_right_overlen = x.qrylen - x.qryendpos
            else:
                # '-' orientation: qry coordinates run right-to-left
                x.qry_left_overlen = x.qrylen - x.qrystartpos
                x.qry_right_overlen = x.qryendpos
            x.start = x.refstartpos - x.qry_left_overlen
            x.end = x.refendpos + x.qry_right_overlen
            x.ref_left_overlen = x.refstartpos
            x.ref_right_overlen = x.reflen - x.refendpos
            if (x.orientation == '+'):
                x.refstart = x.qrystartpos - x.ref_left_overlen
                x.refend = x.qryendpos + x.ref_right_overlen
            else:
                x.refstart = x.qryendpos - x.ref_right_overlen
                x.refend = x.qrystartpos + x.ref_left_overlen

    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/opt_" + str(optmap_i) + "_alms_2_scaled.log")
    print "After scaling, the number of alignments is ", num_alms

    # read qry map: marker positions (channel != "0") per qry contig
    qry_markers = {}
    with open(myfile + '_r.cmap') as f_q:
        for i in range(11):  # 11 lines of header
            header_line = f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            CMapId = int(cols[0])
            ContigLength = float(cols[1])
            NumSites = int(cols[2])
            SiteID = int(cols[3])
            LabelChannel = cols[4]
            Position = float(cols[5])
            if LabelChannel == "0":
                continue
            if CMapId not in qry_markers:
                qry_markers[CMapId] = []
            Position /= scaling
            qry_markers[CMapId].append(Position)
    for CMapId in qry_markers:
        qry_markers[CMapId].sort()
    # NOTE(review): redundant — the `with` block above already closed f_q
    f_q.close()

    print '---------------candidate cutting sites-------------------'
    fpair = file(output_dir + "/chimeric_pairs_" + str(optmap_i) + ".log", 'w')
    fpair.write("ref_id\tref_pos\tqry_id\tqry_pos\n")
    chimeric_pairs = []

    # an alignment end is a candidate cutting site when both the qry and the
    # ref have a long unaligned overhang there AND the qry overhang contains
    # at least one marker (i.e. the overhang should have aligned but didn't)
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == True:
                continue
            x = qualify_alms[ref][qry]

            if (x.confidence > min_confidence):
                ref_left_overlen = x.refstartpos
                ref_right_overlen = x.reflen - x.refendpos
                flag_left = False
                flag_right = False
                if (x.qry_left_overlen > minqryoverhang
                        and ref_left_overlen > minrefoverhang
                        and markers_in_qry_left_overhang(qry_markers, x) > 0):
                    flag_left = True
                    chimeric_pairs.append(
                        (x.ref, x.refstartpos, x.qry, x.qrystartpos))
                    print(
                        x.ref, x.refstartpos, x.qry,
                        x.qrystartpos), "is a pair of candidate cutting sites"
                    fpair.write(
                        str(x.ref) + "\t" + str(x.refstartpos) + "\t" +
                        str(x.qry) + "\t" + str(x.qrystartpos) + "\n")
                if (x.qry_right_overlen > minqryoverhang
                        and ref_right_overlen > minrefoverhang
                        and markers_in_qry_right_overhang(qry_markers, x) > 0):
                    flag_right = True
                    chimeric_pairs.append(
                        (x.ref, x.refendpos, x.qry, x.qryendpos))
                    print(x.ref, x.refendpos, x.qry,
                          x.qryendpos), "is a pair of candidate cutting sites"
                    fpair.write(
                        str(x.ref) + "\t" + str(x.refendpos) + "\t" +
                        str(x.qry) + "\t" + str(x.qryendpos) + "\n")
                # chimeric on both ends -> drop the alignment entirely
                if flag_left == True and flag_right == True:
                    removed[ref, qry] = True
    fpair.close()
    print '---------------END-------------------'
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms, output_dir + "/opt_" + str(optmap_i) +
        "_alms_3_removed_both_overhang.log")
    print "After removing alignments with both overhangs, the number of alignments is ", num_alms

    # check overlap between alignments on the same ref; if two aligned
    # intervals overlap by >= 20 kb, keep only the higher-confidence one
    for r in qualify_alms:
        for q1 in qualify_alms[r]:
            if removed[r, q1] == True:
                continue
            x = qualify_alms[r][q1]
            for q2 in qualify_alms[r]:
                if removed[r, q2] == True:
                    continue
                y = qualify_alms[r][q2]
                # visit each unordered pair only once
                if q1 >= q2:
                    continue
                if x.refstartpos <= y.refstartpos and y.refstartpos <= x.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - y.refstartpos
                elif y.refstartpos <= x.refstartpos and x.refstartpos <= y.refendpos:
                    overlap = min(x.refendpos, y.refendpos) - x.refstartpos
                else:
                    overlap = 0
                if overlap >= 20000:
                    if x.confidence < y.confidence:
                        removed[r, q1] = True
                    else:
                        removed[r, q2] = True
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(
        current_alms,
        output_dir + "/opt_" + str(optmap_i) + "_alms_4_solved_overlaps.log")
    print "After removing one of two overlap alignments, the number of alignments is ", num_alms

    return current_alms, optmap, chimeric_pairs
Example #2
0
def mtp(myfile, myfile2, output_dir, GLPSOL, false_alm_threshold,
        min_confidence):
    """Compute a minimum tiling path (MTP) of contigs from XMAP alignments.

    Pipeline (each stage dumps an ``alms_*.log`` file to output_dir):
      1. read ``myfile + '.xmap'`` (10 header lines, tab-separated);
      2. keep only the highest-confidence alignment per (molecule, contig);
      3. drop alignments with confidence below ``min_confidence``;
      4. remove false-positive alignments via ``false_alms`` (uses the
         GLPSOL solver binary and false_alm_threshold);
      5. remove alignments contained inside another alignment on the same
         molecule (locally), then — after unifying coordinates across the
         MST forest — contained contigs globally;
      6. build an MST over the remaining alignments, unify coordinates,
         rebuild the MST, merge into DAGs, and extract the MTP node set;
      7. rescale MTP coordinates by the map-vs-sequence length ratio and
         write ``myfile + '_mtp.xmap'`` plus ``myfile + '_list.txt'``.

    Parameters:
        myfile: prefix for the alignment-side files (``.xmap``, ``_q.cmap``,
            output ``_mtp.xmap`` / ``_list.txt`` / ``_aligned.txt``).
        myfile2: prefix for the key file (``_key.txt``) mapping qry id to
            sequence length.
        output_dir: directory for intermediate log files.
        GLPSOL: passed through to ``false_alms`` — presumably the path of
            the glpsol LP solver; verify against that helper.
        false_alm_threshold: threshold passed through to ``false_alms``.
        min_confidence: alignments below this confidence are discarded.

    Returns:
        None; all results are written to files.
    """

    # discard alignments below min_confidence
    #min_confidence = 25
    header_lines = 10
    header = []
    # alignment overhangs above this number of bps are considered chimeric
    #minrefoverhang = 100000
    #minqryoverhang = 100000

    all_alms = {
    }  # stores all the Alignments for all groups, all_groups[ref] should contain molecule ref
    qualify_alms = {
    }  # only keep one alignment(the one with highest confidence) for each contig in one molecule
    removed = {
    }  # removed[ref,qry] == True means alignment for (ref, qry) is already removed

    # collecting alignments and store in all_groups
    print '---------------read .xmap file-------------------'
    with open(myfile + '.xmap', 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='\t')
        for i in range(header_lines):  # 10 lines of header
            header.append(csvreader.next())  # save them
        # read the first non-header line
        while True:
            try:
                row = csvreader.next()
                # row[1:14] feed the Alignment constructor; the column
                # meanings follow the XMAP format
                x = Alignment(int(row[1]), int(row[2]), float(row[3]),
                              float(row[4]), float(row[5]), float(row[6]),
                              row[7], float(row[8]), row[9], float(row[10]),
                              float(row[11]), int(row[12]), row[13])
                if x.ref not in all_alms:
                    all_alms[x.ref] = [x]
                else:
                    all_alms[x.ref].append(x)
            except StopIteration:
                break
    num_all_alms = 0
    for ref in all_alms:
        #print 'collected', len(all_alms[ref]), 'alignments for molecule', ref
        num_all_alms += len(all_alms[ref])
    print "In total, the number of alignments collected is ", num_all_alms

    # only keep one alignment(the one with highest confidence) for each contig in one molecule
    for ref in all_alms:
        group = all_alms[ref]
        qry_bestx = {}  # qry id -> best Alignment seen so far for this ref
        for x in group:
            if x.qry not in qry_bestx:
                qry_bestx[x.qry] = x
            else:
                if x.confidence > qry_bestx[x.qry].confidence:
                    qry_bestx[x.qry] = x

        qualify_alms[ref] = {}
        for qry in qry_bestx:
            qualify_alms[ref][qry] = qry_bestx[qry]

    num_qualify_alms = 0
    for ref in qualify_alms:
        num_qualify_alms += len(qualify_alms[ref])
    print "In total, the number of alignments in qualify_alms is ", num_qualify_alms

    # initialize removed array
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            removed[ref, qry] = False

    # find the reference-based coordinates for each alignments: extend the
    # aligned interval by the unaligned qry overhang on each side
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if (x.orientation == '+'):
                x.qry_left_overlen = x.qrystartpos
                x.qry_right_overlen = x.qrylen - x.qryendpos
            else:
                # '-' orientation: qry coordinates run right-to-left
                x.qry_left_overlen = x.qrylen - x.qrystartpos
                x.qry_right_overlen = x.qryendpos
            x.start = x.refstartpos - x.qry_left_overlen
            x.end = x.refendpos + x.qry_right_overlen

    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms, output_dir + "/alms_0_initial.log")
    print "Initially, the number of alignments is", count_alms(current_alms)
    alms_0 = copy_alms(qualify_alms, removed)
    aligned_contigs = different_contigs(alms_0, {})
    output_contigs(aligned_contigs, myfile + '_aligned.txt')
    print '---------------END-------------------'

    # remove low confidence alignments
    print '---------------Remove low quality alignments---------------'
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            x = qualify_alms[ref][qry]
            if x.confidence < min_confidence:
                removed[ref, qry] = True
                print 'alignment (', ref, ',', qry, ') is low quality and removed'
    num_alms = 0
    for ref in qualify_alms:
        for qry in qualify_alms[ref]:
            if removed[ref, qry] == False:
                num_alms += 1

    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms, output_dir + "/alms_1_removed_lowconf.log")
    print "After removing low confidence alignments, the number of alignments is", count_alms(
        current_alms)
    alms_1 = copy_alms(qualify_alms, removed)
    # contigs present in alms_0 but absent from alms_1 were lost to the
    # low-confidence filter
    lowconf_contigs = different_contigs(alms_0, alms_1)
    output_contigs(lowconf_contigs, myfile + '_lowconf.txt')
    print '---------------End---------------'

    print '---------------removing false positive alignments-------------------'
    current_alms = copy_alms(qualify_alms, removed)
    # false_alms mutates `removed` in place to flag false positives
    false_alms(GLPSOL, false_alm_threshold, current_alms, removed, output_dir)
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms, output_dir + "/alms_2_removed_false_alms.log")
    print "After removing false positive alignments, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    print '---------------removing contained contigs locally-------------------'
    # within one molecule, drop any alignment whose [start, end] interval is
    # contained in another alignment's interval
    for ref in qualify_alms:
        for q1 in qualify_alms[ref]:
            x = qualify_alms[ref][q1]
            for q2 in qualify_alms[ref]:
                # visit each unordered pair only once
                if q2 <= q1:
                    continue
                y = qualify_alms[ref][q2]
                if (x.start >= y.start) and (x.end <= y.end):
                    removed[ref, q1] = True
                    print[
                        ref, q1
                    ], "alignment is removed becasue it's contained in alignment", [
                        ref, q2
                    ]
                elif (y.start >= x.start) and (y.end <= x.end):
                    removed[ref, q2] = True
                    print[
                        ref, q2
                    ], "alignment is removed becasue it's contained in alignment", [
                        ref, q1
                    ]
    current_alms = copy_alms(qualify_alms, removed)
    output_alms(current_alms,
                output_dir + "/alms_3_removed_contained_locally.log")
    print "After removing contained alignments locally, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    #build the mst
    print '---------------building the mst-------------------'
    fo = file(output_dir + "/ugraph_1.log", 'w')
    current_alms = copy_alms(qualify_alms, removed)
    forest, vertex_orientations = get_mst(current_alms, fo)
    fo.close()
    output_forest(forest, vertex_orientations, output_dir + "/forest_1.log")
    print '---------------END-------------------'
    # unify the coordinates
    print '---------------unifying the coordinates-------------------'
    current_alms = copy_alms(qualify_alms, removed)
    # re-key alignments by MST tree root with coordinates in a shared frame
    unify_alms = unify_coords(output_dir, current_alms, forest,
                              vertex_orientations)

    removed_unify = {}
    for root in unify_alms:
        for qry in unify_alms[root]:
            removed_unify[root, qry] = False

    # sanity check: each contig should belong to exactly one tree
    contigs = set([])
    for root in unify_alms:
        for qry in unify_alms[root]:
            if qry in contigs:
                print qry, "appears in more than 1 trees"
            contigs.add(qry)
    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms, output_dir + "/alms_4_unified.log")
    print "After unifying the coordinates, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    print '---------------removing contained contigs globally-------------------'
    # same containment test as the local pass, but in the unified coordinate
    # frame; `contained` prevents two mutually-containing contigs from
    # removing each other
    contained = set([])
    for root in unify_alms:
        for q1 in unify_alms[root]:
            x = unify_alms[root][q1]
            for q2 in unify_alms[root]:
                if q2 <= q1:
                    continue
                y = unify_alms[root][q2]
                if (q2 not in contained) and (x.start >= y.start) and (x.end <=
                                                                       y.end):
                    contained.add(q1)
                    removed_unify[root, q1] = True
                    print[
                        root, q1
                    ], "alignment is removed becasue it's contained in alignment", [
                        root, q2
                    ]
                elif (q1 not in contained) and (y.start >=
                                                x.start) and (y.end <= x.end):
                    contained.add(q2)
                    removed_unify[root, q2] = True
                    print[
                        root, q2
                    ], "alignment is removed becasue it's contained in alignment", [
                        root, q1
                    ]
    # a contained contig is removed from every tree it appears in
    for root in unify_alms:
        for qry in unify_alms[root]:
            if qry in contained and removed_unify[root, qry] == False:
                removed_unify[root, qry] = True
                print[root, qry
                      ], "alignment is removed because qry is contained contig"

    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms,
                output_dir + "/alms_5_removed_contained_globally.log")
    print "After removing contained contigs globally, the number of alignments is", count_alms(
        current_alms)
    print '---------------END-------------------'

    #build new mst
    print '---------------building new mst-------------------'
    fo = file(output_dir + "/ugraph_2.log", 'w')
    current_alms = copy_alms(unify_alms, removed_unify)
    forest_unify, vertex_orientations_unify = get_mst(current_alms, fo)
    fo.close()
    output_forest(forest_unify, vertex_orientations_unify,
                  output_dir + "/forest_2.log")
    print '---------------END-------------------'

    print '---------------merging DAGs-------------------'
    current_alms = copy_alms(unify_alms, removed_unify)
    DAGs = merge_DAGs(current_alms, forest_unify, vertex_orientations_unify)
    output_DAGs(DAGs, output_dir + "/dags.log")
    print '---------------END-------------------'

    #DAG to mtp contig set
    print '---------------mtp-------------------'
    mtp_node_set = get_subDAGs(DAGs, output_dir)
    current_alms = copy_alms(unify_alms, removed_unify)
    mtp = []
    # keep only alignments whose contig is in the MTP node set
    for ref in current_alms:
        for qry in current_alms[ref]:
            x = current_alms[ref][qry]
            if qry in mtp_node_set:
                mtp.append(x)
            else:
                removed_unify[ref, qry] = True
    mtp.sort(key=lambda x: (x.ref, x.start))
    print "In total, the number of alignments in mtp is", len(mtp)

    current_alms = copy_alms(unify_alms, removed_unify)
    output_alms(current_alms, output_dir + "/alms_6_mtp.log")
    print '---------------END-------------------'

    print '---------------scaling-------------------'
    # calculating scaling: mean ratio of approximate (map) length to true
    # sequence length over all qry contigs
    qry_len = {}
    with open(myfile2 + '_key.txt') as f_key:
        for i in range(0, 4):  # 4 header lines
            f_key.readline()
        for line in f_key:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            seq_len = int(cols[2])
            qry_len[qry_id] = seq_len
    scaling = 0
    num = 0
    with open(myfile + '_q.cmap') as f_q:
        for i in range(0, 11):  # 11 header lines
            f_q.readline()
        for line in f_q:
            line = line.strip()
            cols = line.split('\t')
            qry_id = int(cols[0])
            appr_len = float(cols[1])
            seq_len = qry_len[qry_id]
            scaling += appr_len / seq_len
            num += 1
    scaling /= num  # scaling=1.02258059775

    print '---------------outputing-------------------'
    # save the MTP in a new xmap file and count the number of unitigs in each assembly
    with open(myfile + '_list.txt', 'wb') as listfile:
        with open(myfile + '_mtp.xmap', 'wb') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter='\t')
            # copies the old xmap header
            for x in header:
                csvwriter.writerow(x)
            i = 1  # progressive number
            # for steve ->
            #	    scaling = 1.02257561752017878915 # scaling fact from opt map to BP
            previous = 0  # previous qry contig, to remove dups
            for x in mtp:
                # save the contig in listfile only if it is a new one
                if (x.qry != previous):
                    #listfile.write(str(x.qry)+'\n')
                    previous = x.qry

# for steve ->
                # coordinates are converted back from map space to bp space
                listfile.write(
                    str(x.ref) + '\t' + str(x.qry) + '\t' +
                    str(int(round(float(x.start) / scaling))) + '\t' +
                    str(int(round(float(x.end) / scaling))) + '\t' +
                    x.orientation + '\n')
                # dump the alignment
                csvwriter.writerow([i] + x.unpack())
                i += 1
    # NOTE(review): redundant — the `with` blocks above already closed both
    csvfile.close()
    listfile.close()