コード例 #1
0
def main():
    """Cut contigs at listed breakpoints and write the pieces and their lengths.

    Command-line options:
      -a/--assembly    FASTA file, read with ``FastaReader``.
      -b/--breakpoint  Text file. A line with a single token names the contig
                       that the following lines refer to; any other non-blank
                       line supplies ``start end`` as its first two tokens.
      -o/--outfile     FASTA file the pieces are written to.
      -l/--lenfile     Output file with one ``<piece id>\\t<length>`` line per
                       piece.

    Every piece is named ``<contig>_<start>_<end>`` and holds
    ``sequence[start:end]`` of the (trimmed) contig sequence.

    Raises:
        KeyError: a breakpoint file names a contig absent from the assembly.
        ValueError: coordinates appear before any contig name, or are not
            integers.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-b",
                        "--breakpoint",
                        help="file containing breakpoints")
    parser.add_argument("-a",
                        "--assembly",
                        help="fasta file containing contigs")
    parser.add_argument("-o", "--outfile", help="new assembly file")
    parser.add_argument("-l", "--lenfile", help="length of contigs")

    args = parser.parse_args()

    # Load the whole assembly first: the breakpoint file may mention contigs
    # in any order, so every sequence has to be available before parsing it.
    # NOTE(review): the last 10 characters of every sequence are dropped; the
    # reason is not visible in this file - confirm it is still wanted.
    id2seq = {}
    for record in FastaReader(args.assembly):
        id2seq[record.id] = record.sequence[0:-10]

    new_seq = {}
    lenmap = {}
    curr_contig = None
    seq = None
    with open(args.breakpoint, 'r') as breakpoints:
        for line in breakpoints:
            attrs = line.split()
            if not attrs:
                # Tolerate blank lines instead of failing on attrs[0].
                continue
            if len(attrs) == 1:
                curr_contig = attrs[0]
                seq = id2seq[curr_contig]
                continue
            if seq is None:
                raise ValueError(
                    "breakpoint coordinates found before any contig name: "
                    + line.strip())
            # int() is unbounded on Python 3 and promotes to long on
            # Python 2, so it replaces the Python-2-only long().
            start = int(attrs[0])
            end = int(attrs[1])
            new_id = curr_contig + '_' + attrs[0] + '_' + attrs[1]
            new_seq[new_id] = seq[start:end]
            # NOTE(review): the recorded length is end - start + 1 although
            # seq[start:end] holds at most end - start characters; confirm
            # which convention downstream tools expect.
            lenmap[new_id] = end - start + 1

    # The output path comes from --outfile; the parser defines no
    # "scaffold" option, so args.scaffold would raise AttributeError.
    writer = FastaWriter(args.outfile)
    for key in new_seq:
        writer.writeRecord(key, new_seq[key])

    with open(args.lenfile, 'w') as lenfile:
        for key in lenmap:
            lenfile.write(key + "\t" + str(lenmap[key]) + '\n')
コード例 #2
0
# Cut-offs and output settings used by main().
_MIN_COUNT_FOR_WEIGHTED_EDGE = 60   # link count needed to enter H / oriented
_MIN_COUNT_FOR_BACKBONE_EDGE = 150  # link count needed to enter G
_MIN_SCORE_TO_INSERT = 70           # a placement must score above this
_GAP_SIZE = 500                     # 'N' characters written between contigs


def _opposite_end(node):
    """Return the other end of the contig that *node* belongs to.

    *node* has the form ``"<contig>:B"`` or ``"<contig>:E"``; ``B`` maps to
    ``E`` and anything else maps to ``B``.
    """
    parts = node.split(":")
    return parts[0] + (":E" if parts[1] == 'B' else ":B")


def _break_cycle(nodes, edges_by_contig):
    """Turn a two-contig component without clear endpoints into a path.

    Collects every recorded edge that joins the two contigs (in either
    direction) and keeps the one with the largest third field.  The returned
    path is ``[far end, edge start, edge end, far end]``.

    Returns ``None`` when *nodes* does not span exactly two contigs or when
    no edge between them was recorded.
    """
    contig_names = list(set(node.split(":")[0] for node in nodes))
    if len(contig_names) != 2:
        return None
    first, second = contig_names

    candidates = []
    for source, target in ((first, second), (second, first)):
        if source not in edges_by_contig:
            continue
        for edge in edges_by_contig[source]:
            if edge[1].split(":")[0] == target:
                candidates.append(edge)
    if not candidates:
        return None

    # max() keeps the first of several equally heavy edges, so edges filed
    # under the first contig win ties.  Comparing numbers only avoids the
    # number-versus-"" comparison the previous sentinel relied on.
    heaviest = max(candidates, key=operator.itemgetter(2))
    start, end = heaviest[0], heaviest[1]
    return [_opposite_end(start), start, end, _opposite_end(end)]


def _best_placement(oriented, path, begin, end):
    """Score every position where the contig (*begin*, *end*) could go.

    Interior positions are the gaps after ``path[i]`` for odd ``i``; they
    need a link on both sides and score the sum of the two weights.  The two
    path extremities need a single link and score that link's weight.

    Returns ``(score, position, orientation)``.  ``position`` is ``0`` for
    "before the path", ``len(path)`` for "after the path", otherwise the odd
    index the contig follows.  ``score`` is ``-1`` when nothing fits.
    """
    def weight(a, b):
        return oriented[a][b]['weight']

    def interior(i):
        left, right = path[i], path[i + 1]
        forward = reverse = -1
        if oriented.has_edge(left, begin) and oriented.has_edge(end, right):
            forward = weight(left, begin) + weight(end, right)
        if oriented.has_edge(left, end) and oriented.has_edge(begin, right):
            reverse = weight(left, end) + weight(begin, right)
        if forward >= reverse:
            return (forward, i, 'fow')
        return (reverse, i, 'rev')

    extremities = []
    if oriented.has_edge(begin, path[0]):
        extremities.append((weight(begin, path[0]), 0, 'fow'))
    if oriented.has_edge(end, path[0]):
        extremities.append((weight(end, path[0]), 0, 'rev'))
    if oriented.has_edge(path[-1], begin):
        extremities.append((weight(path[-1], begin), len(path), 'fow'))
    if oriented.has_edge(path[-1], end):
        extremities.append((weight(path[-1], end), len(path), 'rev'))

    # Ranking order matters only for ties: the first interior gap, then the
    # extremities, then the remaining gaps.  With no interior gap at all the
    # extremities are not considered either.
    candidates = []
    for rank, i in enumerate(range(1, len(path) - 1, 2)):
        candidates.append(interior(i))
        if rank == 0:
            candidates.extend(extremities)

    best = (-1, -1, '')
    for candidate in candidates:
        if candidate[0] > best[0]:
            best = candidate
    return best


def _insert_contig(path, pos, orientation, begin, end):
    """Insert a contig's two ends into *path* (in place).

    ``'fow'`` means the link evidence touches *begin* on the left-hand side
    (or, at position 0, that *begin* is linked to ``path[0]``); ``'rev'`` is
    the same with *end*.  The linked end is always placed next to the path
    node it is linked to.
    """
    if orientation == 'fow':
        near, far = begin, end
    else:
        near, far = end, begin

    if pos == 0:
        # The linked end must sit right before the old path[0].  Both
        # orientations used to prepend [end, begin], which put the unlinked
        # end next to the path for 'rev'.
        path[0:0] = [far, near]
    elif pos == len(path):
        path.append(near)
        path.append(far)
    else:
        path.insert(pos + 1, near)
        path.insert(pos + 2, far)


def main():
    """Build scaffolds from scored contig-end links and write them as FASTA.

    Command-line options:
      -a/--cleaned   FASTA file with the contigs.
      -f/--scaffold  FASTA file the scaffolds are written to.
      -l/--links     Whitespace-separated link rows: the first two fields
                     are contig ends (``<contig>:B`` / ``<contig>:E``), the
                     fourth a count and the last a score.
      -n/--length    File whose rows start with a contig name and end with
                     its length.

    Steps: load sequences and lengths; add each link to the graphs it
    qualifies for; derive one backbone path per connected component of
    ``G``; assign every contig that did not land on a backbone to the path
    it shares the most ``H`` weight with; insert those contigs at their
    best-scoring position; finally write every path with at least four
    nodes, separating contigs by a run of ``N``.

    Relies on names defined outside this function: ``id2seq``,
    ``contig_length``, ``nodes_to_edges``, ``edgemap``, ``existing_nodes``,
    the graphs ``G``, ``H`` and ``oriented``, plus ``FastaReader``,
    ``FastaWriter``, ``revcompl`` and ``nx``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--cleaned", help="cleaned assembly")
    parser.add_argument("-f", "--scaffold", help="final scaffold file")
    parser.add_argument("-l", "--links", help="links sorted by score")
    parser.add_argument("-n", "--length", help="contig length")
    args = parser.parse_args()

    # NOTE(review): the last 10 characters of every sequence are dropped;
    # the reason is not visible in this file.
    for record in FastaReader(args.cleaned):
        print(record.id)
        id2seq[record.id] = record.sequence[0:-10]

    with open(args.length, 'r') as length_file:
        for line in length_file:
            attrs = line.split()
            if not attrs:
                continue
            contig_length[attrs[0]] = int(attrs[-1])

    contigs = set()
    with open(args.links, 'r') as link_file:
        for line in link_file:
            row = line.split()
            if not row:
                continue
            v1, v2 = row[0:2]
            score = float(row[-1])
            count = float(row[3])
            weight = int(row[-1])
            c1 = v1.split(":")[0]
            c2 = v2.split(":")[0]
            contigs.add(c1)
            contigs.add(c2)

            if c1 not in nodes_to_edges:
                nodes_to_edges[c1] = []
            if c2 not in nodes_to_edges:
                nodes_to_edges[c2] = []
            # The edge is filed under its first contig only.
            nodes_to_edges[c1].append((v1, v2, count))

            if count >= _MIN_COUNT_FOR_WEIGHTED_EDGE:
                H.add_edge(c1, c2, weight=weight)
                oriented.add_edge(v1, v2, weight=weight)

            # Accumulate the weight under both contig orders.
            for key in (c1 + '$' + c2, c2 + '$' + c1):
                if key not in edgemap:
                    edgemap[key] = weight
                else:
                    edgemap[key] += weight

            # A link joins the backbone graph only while both of its ends
            # are still unused.
            if v1 not in existing_nodes and v2 not in existing_nodes:
                if count < _MIN_COUNT_FOR_BACKBONE_EDGE:
                    continue
                G.add_edge(v1, v2, score=score, t="x")
                existing_nodes.add(v1)
                existing_nodes.add(v2)

    # Tie the two ends of every contig together.
    for ctg in contigs:
        G.add_edge(ctg + ":B", ctg + ":E", t="c", score=0)

    # nx.connected_component_subgraphs is missing from recent NetworkX
    # releases; build the same subgraphs by hand when it is unavailable.
    if hasattr(nx, 'connected_component_subgraphs'):
        components = nx.connected_component_subgraphs(G)
    else:
        components = (G.subgraph(nodes).copy()
                      for nodes in nx.connected_components(G))

    to_merge = set()
    backbone_paths = {}
    path_id = 1
    for subg in components:
        endpoints = [v for v in subg.nodes() if subg.degree(v) == 1]
        if len(endpoints) == 2:
            path = nx.shortest_path(subg, endpoints[0], endpoints[1])
            if len(path) == 2:
                # A lone contig: keep it aside for later placement.
                to_merge.add(path[0].split(':')[0])
                continue
        else:
            path = _break_cycle(subg.nodes(), nodes_to_edges)
            if path is None:
                for node in subg.nodes():
                    to_merge.add(node.split(':')[0])
                continue
        backbone_paths[path_id] = path
        path_id += 1

    # For each set-aside contig, pick the backbone path it shares the
    # largest total H weight with.
    path_to_contig = {}
    for contig in to_merge:
        best_sum = -1
        best_path = -1
        for key in backbone_paths:
            total = 0
            linked = False
            for node in backbone_paths[key]:
                other = node.split(':')[0]
                if H.has_edge(contig, other):
                    total += H[contig][other]['weight']
                    linked = True
            if linked and total > best_sum:
                best_sum = total
                best_path = key
        if best_sum != -1:
            path_to_contig.setdefault(best_path, []).append(
                (contig, best_sum))

    # Running the script has always created/emptied this file; nothing is
    # written to it here.
    open('ambigous_contigs', 'w').close()

    # Greedily place the assigned contigs, best-scoring placement first.
    for path_id in path_to_contig:
        path = backbone_paths[path_id]
        ranked = sorted(path_to_contig[path_id],
                        key=operator.itemgetter(1),
                        reverse=True)
        pending = [str(item[0]) for item in ranked]

        while pending:
            final_max = -1
            chosen = None
            # Only contigs that are still pending are scored, so values
            # left over from an already handled contig can never be picked.
            for contig in pending:
                placement = _best_placement(oriented, path,
                                            contig + ":B", contig + ":E")
                if placement[0] > final_max:
                    final_max = placement[0]
                    chosen = (contig, placement[1], placement[2])

            # The path only changes on insertion, so once the best
            # candidate fails the cut-off no later round can succeed.
            if chosen is None or final_max <= _MIN_SCORE_TO_INSERT:
                break

            contig, pos, orientation = chosen
            pending.remove(contig)
            _insert_contig(path, pos, orientation,
                           contig + ":B", contig + ":E")

    c_id = 1
    writer = FastaWriter(args.scaffold)
    gap = 'N' * _GAP_SIZE
    for key in backbone_paths:
        path = backbone_paths[key]
        if len(path) < 4:
            continue
        print(c_id)
        pieces = []
        for i in range(0, len(path) - 1, 2):
            curr = path[i].split(':')
            following = path[i + 1].split(':')
            print(curr)
            if curr[1] == 'B' and following[1] == 'E':
                pieces.append(id2seq[curr[0]])
            elif curr[1] == 'E' and following[1] == 'B':
                pieces.append(revcompl(id2seq[curr[0]]))
            if i != len(path) - 2:
                pieces.append(gap)
        print(c_id)
        writer.writeRecord('scaffold_' + str(c_id), ''.join(pieces))
        c_id += 1