Example #1
0
def InitializeObjects(
    bam_file, Contigs, Scaffolds, param, Information, G_prime, small_contigs, small_scaffolds, C_dict
):
    """Create a contig object and a one-contig scaffold for every BAM reference.

    References whose name is missing from C_dict are ignored.  A reference
    whose length is at least param.contig_threshold is stored in
    Contigs/Scaffolds; a shorter one with positive length is stored in
    small_contigs/small_scaffolds and counted as "singled out".  The sequence
    of every stored contig is moved out of C_dict.  param.tot_assembly_length,
    param.current_N50, param.current_L50 and param.scaffold_indexer are
    updated in place.  G_prime is not used here.

    Returns an empty tuple.
    """
    threshold = param.contig_threshold
    ref_lengths = [int(length) for length in bam_file.lengths]  # convert long to int object
    ref_names = bam_file.references

    # Assembly statistics are computed on all references before any filtering.
    param.tot_assembly_length = sum(ref_lengths)
    N50, L50 = CalculateStats(sorted(ref_lengths, reverse=True), [], param, Information)
    param.current_L50 = L50
    param.current_N50 = N50

    nr_singled_out = 0
    tick = time()
    for seen, ref_name in enumerate(ref_names, 1):
        # Progress timing, reported once per 100k references visited.
        if seen % 100000 == 0:
            print >> Information, "Time adding 100k keys", time() - tick
            tick = time()

        if ref_name not in C_dict:
            continue

        ref_length = ref_lengths[seen - 1]
        is_large = ref_length >= threshold
        if not is_large and ref_length <= 0:
            # Contigs of size 0 (due to some error in the fasta file) get no object.
            continue

        ctg = Contig.contig(ref_name)
        ctg.length = ref_length
        ctg.sequence = C_dict.pop(ref_name)
        ctg.direction = True  # always forward at first, False=reverse
        ctg.position = 0  # a lone contig starts its scaffold

        if is_large:
            contig_store, scaffold_store = Contigs, Scaffolds
        else:
            contig_store, scaffold_store = small_contigs, small_scaffolds

        contig_store[ctg.name] = ctg
        # Initially the scaffold consists of this contig only.
        scaf = Scaffold.scaffold(param.scaffold_indexer, [ctg], ctg.length)
        scaffold_store[scaf.name] = scaf
        ctg.scaffold = scaf.name
        param.scaffold_indexer += 1

        if not is_large:
            nr_singled_out += 1
    del C_dict

    print >> Information, "Nr of contigs that was singeled out due to length constraints " + str(nr_singled_out)
    return ()
Example #2
0
def NewContigsScaffolds(G, G_prime, Contigs, small_contigs, Scaffolds, small_scaffolds, Information, dValuesTable, param, already_visited):
    """Create one new scaffold object per connected component of G.

    For each component, param.scaffold_indexer is incremented, a node with
    exactly one neighbour in G is used as the starting point for UpdateInfo,
    and the resulting scaffold is stored in Scaffolds.  When
    param.extend_paths is set, PROWithinScaf is called on the component first,
    and afterwards the new scaffold is inserted into G_prime: its 'L'/'R'
    nodes inherit the link edges of the component's two end nodes and the
    component's nodes are removed from G_prime.

    Returns the tuple (Contigs, Scaffolds, param).
    """
    # Materialise the components: the result may be a generator, which has no
    # len() and would be consumed by the count below.
    new_scaffolds_ = list(nx.connected_component_subgraphs(G))
    print >> Information, 'Nr of new scaffolds created in this step: ' + str(len(new_scaffolds_))
    for new_scaffold_ in new_scaffolds_:
        param.scaffold_indexer += 1
        scaffold_length = 0
        contig_list = []

        ##### Here PathExtension algorithm is called if PRO is activated #####
        if param.extend_paths:
            PROWithinScaf(G, G_prime, Contigs, small_contigs, Scaffolds, small_scaffolds, param, new_scaffold_, dValuesTable, already_visited)

        # The first node with a single neighbour is one end of the component ...
        for node in new_scaffold_:
            if len(G.neighbors(node)) == 1:
                start = node
                break
        # ... and the last other node with a single neighbour is taken as the opposite end.
        for node in new_scaffold_:
            if len(G.neighbors(node)) == 1 and node != start:
                end = node

        # Create info to new scaffold object such as total length and the contig objects included
        prev_node = ('', '')
        pos = 0
        (G, contig_list, scaffold_length) = UpdateInfo(G, Contigs, small_contigs, Scaffolds, small_scaffolds, start, prev_node, pos, contig_list, scaffold_length, dValuesTable, param)
        S = Scaffold.scaffold(param.scaffold_indexer, contig_list, scaffold_length, defaultdict(constant_large), defaultdict(constant_large), defaultdict(constant_small), defaultdict(constant_small))  # Create the new scaffold object

        Scaffolds[S.name] = S        # include in scaffold library

        if param.extend_paths:
            # Find the ends of the old subgraph new_scaffold_. We want to relabel these end nodes
            # as the new sides of the new scaffold object created.
            # Only these ends are allowed to have links because they are of size mean + 4*sigma,
            # so nothing is supposed to span over.

            # add the new scaffold object to G_prime
            G_prime.add_node((S.name, 'L'))  # start node
            G_prime.add_node((S.name, 'R'))  # end node
            G_prime.add_edge((S.name, 'L'), (S.name, 'R'), nr_links=None)
            try:
                # Copy every real link edge (nr_links truthy) of the old end nodes to the new sides.
                for nbr in G_prime.neighbors(start):
                    nr_links_ = G_prime[start][nbr]['nr_links']
                    if nr_links_:
                        obs_ = G_prime[start][nbr]['obs']
                        G_prime.add_edge((S.name, 'L'), nbr, nr_links=nr_links_, obs=obs_)

                for nbr in G_prime.neighbors(end):
                    nr_links_ = G_prime[end][nbr]['nr_links']
                    if nr_links_:
                        obs_ = G_prime[end][nbr]['obs']
                        G_prime.add_edge((S.name, 'R'), nbr, nr_links=nr_links_, obs=obs_)

                # remove the old scaffold objects from G_prime
                G_prime.remove_nodes_from(new_scaffold_)
            except nx.exception.NetworkXError:
                # Deliberate best effort: graph errors while relinking are ignored
                # (presumably an end node that is absent from G_prime -- confirm).
                pass

    return(Contigs, Scaffolds, param)
 def create_scaffold(self, phrase_strings):
     """Build a Scaffold from phrase strings and hand over collected entities.

     Every phrase string is appended to the scaffold's article, turned into a
     phrase by the phrase maker and passed to find_semantics together with
     its index.  Indices of phrases flagged as numerical data, date/time or
     quote are recorded on the scaffold.  Afterwards the persons, locations,
     named entities and longest-entry value accumulated on this object are
     transferred to the scaffold and reset on this object.
     """
     scaffold = Scaffold()

     for index, text in enumerate(phrase_strings):
         scaffold.article.append(text)

         phrase = self._phrase_maker.create_phrase(text)
         self.find_semantics(phrase, index)

         if phrase.is_numerical_data:
             scaffold.numerical_data.append(index)
         if phrase.is_date_time:
             scaffold.datetimes.append(index)
         if phrase.is_quote:
             scaffold.quotes.append(index)

     # Person keys are the first element of the stored entry joined with the original key.
     keyed_persons = {
         str(entry[0]) + str(key): entry[1]
         for key, entry in self._persons.items()
     }
     scaffold.persons.update(keyed_persons)
     self._persons.clear()
     keyed_persons.clear()

     scaffold.locations.update(self._locations)
     self._locations.clear()

     scaffold.named_entities.update(self._named_entities)
     self._named_entities.clear()

     scaffold.longest_entry = self._longest_entry
     self._longest_entry = 0

     return scaffold
Example #4
0
def NewContigsScaffolds(G, Contigs, Scaffolds, F, Information, C_dict,
                        dValuesTable, param):
    ### Remaining scaffolds are true sensible scaffolds, we must now update both the library of scaffold objects and the library of contig objects
    new_scaffolds_ = list(nx.connected_component_subgraphs(G))
    print 'Nr of new scaffolds created: ' + str(len(new_scaffolds_))
    print >> Information, 'Nr of new scaffolds created in this step: ' + str(
        len(new_scaffolds_))
    for new_scaffold_ in new_scaffolds_:
        param.scaffold_indexer += 1
        #scaf_size=len(new_scaffold_)
        scaffold_length = 0
        contig_list = []

        #Store nr_of links between contigs before "destroying" the graph
        for edge in new_scaffold_.edges_iter():
            nr_links = G[edge[0]][edge[1]]['nr_links']
            side1 = edge[0][1]
            side2 = edge[1][1]
            if nr_links:
                contig_objects1 = Scaffolds[edge[0][0]].contigs
                contig_objects2 = Scaffolds[edge[1][0]].contigs
                GiveLinkConnection(Contigs, contig_objects1, contig_objects2,
                                   side1, side2, nr_links)

        for node in new_scaffold_:
            if len(G.neighbors(node)) == 1:
                break

        #Create info to new scaffold object such as total length and the contig objects included

        prev_node = ('', '')
        pos = 0
        (G, contig_list,
         scaffold_length) = UpdateInfo(G, Contigs, Scaffolds, node, prev_node,
                                       pos, contig_list, scaffold_length,
                                       C_dict, dValuesTable, param)
        S = Scaffold.scaffold(param.scaffold_indexer, contig_list,
                              scaffold_length, {},
                              {})  #Create the new scaffold object

        Scaffolds[S.name] = S  #include in scaffold library

    return (Contigs, Scaffolds, F, param)
Example #5
0
def NewContigsScaffolds(G, Contigs, Scaffolds, F, Information, C_dict, dValuesTable, param):
    ### Remaining scaffolds are true sensible scaffolds, we must now update both the library of scaffold objects and the library of contig objects
    new_scaffolds_ = list(nx.connected_component_subgraphs(G))
    print "Nr of new scaffolds created: " + str(len(new_scaffolds_))
    print >> Information, "Nr of new scaffolds created in this step: " + str(len(new_scaffolds_))
    for new_scaffold_ in new_scaffolds_:
        param.scaffold_indexer += 1
        # scaf_size=len(new_scaffold_)
        scaffold_length = 0
        contig_list = []

        # Store nr_of links between contigs before "destroying" the graph
        for edge in new_scaffold_.edges_iter():
            nr_links = G[edge[0]][edge[1]]["nr_links"]
            side1 = edge[0][1]
            side2 = edge[1][1]
            if nr_links:
                contig_objects1 = Scaffolds[edge[0][0]].contigs
                contig_objects2 = Scaffolds[edge[1][0]].contigs
                GiveLinkConnection(Contigs, contig_objects1, contig_objects2, side1, side2, nr_links)

        for node in new_scaffold_:
            if len(G.neighbors(node)) == 1:
                break

        # Create info to new scaffold object such as total length and the contig objects included

        prev_node = ("", "")
        pos = 0
        (G, contig_list, scaffold_length) = UpdateInfo(
            G, Contigs, Scaffolds, node, prev_node, pos, contig_list, scaffold_length, C_dict, dValuesTable, param
        )
        S = Scaffold.scaffold(
            param.scaffold_indexer, contig_list, scaffold_length, {}, {}
        )  # Create the new scaffold object

        Scaffolds[S.name] = S  # include in scaffold library

    return (Contigs, Scaffolds, F, param)
Example #6
0
    if len(paths) > 1:
        ScorePaths(G_prime, nodes_present_in_path, paths, all_paths_sorted_wrt_score,param)
#        for path in all_paths_sorted_wrt_score:
#            print path
        if len(all_paths_sorted_wrt_score) > 0:
            #all_paths_sorted_wrt_score = ExtendScaffolds(all_paths_sorted_wrt_score)
            return all_paths_sorted_wrt_score
            #return(all_paths_sorted_wrt_score[-1][2], all_paths_sorted_wrt_score[-1][1], all_paths_sorted_wrt_score[-1][0], all_paths_sorted_wrt_score[-1][3]) #return(all_paths_sorted_wrt_score) #
    return []
    #return([], 0, 0, 0)

# Ad hoc test harness: builds a small scaffold library and two graphs by hand.
# NOTE(review): the block appears to be cut off after the final print -- the
# call that would use G, G_prime, contigs and the timer is not visible here.
if __name__ == '__main__':
    import Scaffold
    # Six dummy scaffold objects, stored under their own .name.
    small_scaffolds_test = {}
    for i in range(1, 7):
        S = Scaffold.scaffold(i, 0, 0, {}, {})
        small_scaffolds_test[S.name] = S
    start = time()
    G_prime = nx.Graph()
    #G.add_nodes_from([(1, 'L'), (1, 'R'), (2, 'L'), (2, 'R'), (3, 'L'), (3, 'R'), (4, 'L'), (4, 'R'), (5, 'L'), (5, 'R')]) 
    # One edge between the 'L' and 'R' side of each scaffold, carrying nr_links 0.
    # The attribute dict is passed positionally (presumably the networkx 1.x add_edge signature -- confirm).
    for i in range(1, 7):
        G_prime.add_edge((i, 'L'), (i, 'R'), {'nr_links':0})
    # Link edges between sides of different scaffolds, each with its link count.
    G_prime.add_edges_from([((1, 'R'), (2, 'R'), {'nr_links':1}), ((3, 'L'), (4, 'L'), {'nr_links':1}), ((2, 'L'), (3, 'R'), {'nr_links':1}), ((1, 'R'), (5, 'L'), {'nr_links':2}),
                       ((5, 'R'), (4, 'L'), {'nr_links':3}), ((2, 'L'), (5, 'L'), {'nr_links':2}), ((1, 'R'), (4, 'L'), {'nr_links':8}), ((2, 'L'), (6, 'L'), {'nr_links':3}),
                       ((1, 'L'), (4, 'R'), {'nr_links':1}), ((1, 'L'), (4, 'L'), {'nr_links':1}), ((3, 'L'), (4, 'R'), {'nr_links':1}),
                        ((1, 'R'), (2, 'L'), {'nr_links':1}), ((1, 'R'), (5, 'R'), {'nr_links':1}), ((2, 'L'), (5, 'R'), {'nr_links':1})])
    # G holds only the side nodes of scaffolds 1, 4 and 6 and has no edges.
    G = nx.Graph()
    G.add_nodes_from([(1, 'L'), (1, 'R'), (4, 'L'), (4, 'R'), (6, 'L'), (6, 'R')])
    contigs = [1, 2, 3, 4, 5, 6]

    print 'Between'
 def AddEdges(Contigs,Scaffolds,bamfile,mean,std_dev,scaffold_indexer,F,read_len):
     """Initialise contig/scaffold objects and add link edges from aligned reads.

     Every reference of at least 300 bp gets a contig object and a
     single-contig scaffold (stored in Contigs / Scaffolds) plus an 'L' and
     an 'R' node in G.  Each scaffold's two sides are joined by an edge with
     nr_links=None.  Read pairs whose two contigs differ add, or increment,
     a link edge when the observation returned by PosDirCalculatorPE is
     below mean + 6*std_dev.  Suspicious pairs are counted in fishy_edges,
     which is finally handed to RemoveBugEdges together with G.

     NOTE(review): the body reads `bam_file` and `G`, neither of which is
     defined in this block; the parameter is spelled `bamfile`.  Presumably
     they are module-level names or were lost from this excerpt -- confirm.
     `scaffold_indexer` is incremented locally only and nothing is returned;
     `F` is not used in this block.
     """
     #Clean contig_library
     singeled_out=0
     cont_lengths= bam_file.lengths
     cont_lengths=[int(nr) for nr in cont_lengths]  #convert long to int object
     #print cont_lengths
     cont_names = bam_file.references
     ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, 
     ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
     for i in range(0,len(cont_names)):
         # Only references of at least 300 bp get objects and graph nodes.
         if cont_lengths[i] >= 300:
             C=Contig.contig(cont_names[i])   # Create object contig
             C.length = cont_lengths[i]
             C.scaf_length = C.length        # Initially, scaffold consists of only this contig
             C.direction = True              # always in same direction first, False=reverse
             C.position = 0                  #position always 0
             C.links = {}
             Contigs[C.name] = C              # Create a dict with name as key and the object container as value
             S=Scaffold.scaffold('s'+str(scaffold_indexer),[C],C.length)  # Create object scaffold
             Scaffolds[S.name]=S
             C.scaffold=S.name
             G.add_node((S.name,'L'),length=cont_lengths[i])
             G.add_node((S.name,'R'),length=cont_lengths[i])
             scaffold_indexer+=1
     
     
     #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
     #print 'Nr of contigs/scaffolds included in scaffolding: '+ str(len(Scaffolds))#,Scaffolds.keys()
     
     for scaffold_ in Scaffolds:
         G.add_edge((scaffold_,'L'),(scaffold_,'R'),nr_links=None)    #this is a scaffold object but can be both a single contig or a scaffold.
     
     
     # Create the link edges in the graph by fetching info from bam file
     
     # Counts, per directed pair of scaffold sides, the read pairs flagged as suspicious below.
     fishy_edges = defaultdict(int)
     for alignedread in bam_file:
         try: #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA
             contig1=bam_file.getrname(alignedread.rname)
             contig2=bam_file.getrname(alignedread.mrnm)
         except ValueError:
             continue  
         if contig1 in Contigs and contig2 in Contigs:
             #TODO: this if-statement is an ad hoc implementation to deal with BWA's buggy SAM-flag reporting
             #if BWA fixes this -> remove this statement. If the links in fishy edges is equal to or more than
             #the links in the graph G or G'. The edge will be removed.
             if alignedread.is_unmapped and alignedread.is_read1: # and contig1 != contig2: 
                 #Some BWA error in mappings can still slip through, these edges are characterized by very few links                 
                 cont_obj1 = Contigs[contig1]
                 scaf_obj1 = Scaffolds[cont_obj1.scaffold]
                 cont_obj2 = Contigs[contig2]
                 scaf_obj2 = Scaffolds[cont_obj2.scaffold]
                 
                 if scaf_obj2.name != scaf_obj1.name:
                     (side1,side2) = CheckDir(cont_obj1,cont_obj2,alignedread) 
                     #get scaffold name for contig
                     s1 = Contigs[contig1].scaffold #if contig1 in Contigs else small_contigs[contig1].scaffold
                     s2 = Contigs[contig2].scaffold #if contig2 in Contigs else small_contigs[contig2].scaffold   
                     # Count the pair in both directions.
                     fishy_edges[((s1,side1),(s2,side2))] +=1
                     fishy_edges[((s2,side2),(s1,side1))] +=1
             
             #if contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold:
             # Only the second read of a mapped pair with mapq above 20 contributes a link.
             if contig1 != contig2 and alignedread.is_read2 and not alignedread.is_unmapped and alignedread.mapq  > 20:
                 (read_dir,mate_dir) = (not alignedread.is_reverse,not alignedread.mate_is_reverse )
                 scaf1=Contigs[contig1].scaffold
                 scaf2=Contigs[contig2].scaffold                    
                 #Calculate actual position on scaffold here
                 #position1 cont/scaf1
                 cont_dir1 = Contigs[contig1].direction  #if pos : L if neg: R
                 cont1_pos = Contigs[contig1].position
                 readpos = alignedread.pos
                 cont1_len = Contigs[contig1].length
                 s1len = Scaffolds[scaf1].s_length
                 #position1 cont1/scaf1                        
                 cont_dir2 = Contigs[contig2].direction
                 cont2_pos = Contigs[contig2].position
                 matepos = alignedread.mpos
                 cont2_len = Contigs[contig2].length
                 s2len = Scaffolds[scaf2].s_length 
                 (obs,scaf_side1,scaf_side2)=PosDirCalculatorPE(cont_dir1,read_dir,cont1_pos,readpos,s1len,cont1_len,cont_dir2,mate_dir,cont2_pos,matepos,s2len,cont2_len,read_len) 
                 # Observations of mean + 6*std_dev or more are discarded.
                 if obs < mean+ 6*std_dev: 
                     if (scaf2,scaf_side2) not in G[(scaf1,scaf_side1)]:
                         G.add_edge((scaf2,scaf_side2),(scaf1,scaf_side1),nr_links=1,gap_dist=[obs])
                     #print 'Added edge'
                     else:
                         G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['nr_links'] += 1
                         #print 'edge'
                         G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['gap_dist'].append(obs)                         
         
         # NOTE(review): this branch cannot be taken -- its condition contains the
         # condition of the `if` above, so whenever it would hold the `if` already ran.
         elif contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold:
             ########################Use to validate scaffold herein previous step here
             pass
     RemoveBugEdges(G,fishy_edges)    
Example #8
0
def PROBetweenScaf(G_prime, Contigs, small_contigs, Scaffolds, small_scaffolds, param, dValuesTable, Information):
    """Join large scaffolds through paths of small scaffolds found in G_prime.

    Steps performed:
      1. Collect the G_prime nodes that belong to entries of Scaffolds, drop
         those whose only neighbour is their own other side, and skip the
         whole search when there are more than 10000 such scaffolds but the
         small/large scaffold ratio is below 0.1.
      2. Score paths between the remaining nodes with ELS.BetweenScaffolds,
         either in one process or split over worker processes.
      3. Walk the scored paths starting from the end of the sorted list.  A
         path is applied only if none of its interior scaffolds was used by
         an earlier path; its small scaffolds/contigs are moved into
         Scaffolds/Contigs, a new scaffold object is created and G_prime is
         relabelled accordingly.

    Contigs, small_contigs, Scaffolds, small_scaffolds, G_prime and
    param.scaffold_indexer are modified in place.  Progress is written to
    Information.

    Returns the value param.scaffold_indexer had on entry.
    """
    start_scaf_index = param.scaffold_indexer
    G = nx.Graph()
    for node in G_prime:
        if node[0] in Scaffolds:  # meets the length criteria
            G.add_node(node)

    # Filtering and heuristic here to reduce computation if needed O(n^2) in contigs on pathfinder

    # remove all isolated contigs
    for node in G.nodes():
        if node in G:
            nbr = G_prime.neighbors(node)[0]
            if len(G_prime.neighbors(node)) == 1 and len(G_prime.neighbors(nbr)) == 1:
                G.remove_nodes_from([node, nbr])

    if len(G.nodes()) / 2.0 > 10000:
        # Too few short contigs compared to long (ratio set to 0.1) and too many large contigs
        # (> 10 000) do not enter the path extension algorithm since the payoff is too low.
        if len(small_scaffolds) / float(len(Scaffolds)) < 0.1:
            print >> Information, "Did not enter path seartching algorithm between scaffolds due to too small fraction of small scaffolds, fraction were: ", len(small_scaffolds) / float(len(Scaffolds))
            return(start_scaf_index)

    ########### Find paths between scaffolds here ###############

    # Multi Processing (if available), check nr of available cores
    num_cores = multiprocessing.cpu_count()
    #TODO: If we get too many paths back and run into memory issues we could change so that only paths with score over 0 are stored in ELS module
    if param.multiprocess and num_cores > 1:
        import workerprocess
        import heapq
        print >> Information, 'Entering ELS.BetweenScaffolds parallelized with ', num_cores, ' cores.'
        start = time.time()
        # load up work queue
        work_queue = multiprocessing.Queue()
        end = set(G)
        nodes = G.nodes()
        nr_jobs = len(nodes)
        # At least one node per chunk: with fewer nodes than cores the integer
        # division yields 0 and the partition loop below would never advance.
        chunk = max(1, nr_jobs // num_cores)
        counter = 0
        nr_processes = 0
        # partition equally many nodes in G to each core
        while counter < nr_jobs:
            work_queue.put((set(nodes[counter:counter + chunk]), G_prime, end, param))
            nr_processes += 1
            print >> Information, 'node nr', counter, 'to', counter + chunk - 1, 'added'
            counter += chunk

        # create a queue to pass to workers to store the results
        result_queue = multiprocessing.Queue()

        # Spawn one worker per queued work item.  A blocking get() per item is
        # used because Queue.empty() is not reliable right after put(), and a
        # missed item would leave the result collection below waiting forever.
        for _ in range(nr_processes):
            worker = workerprocess.Worker(work_queue.get(), result_queue)
            worker.start()

        # collect the results off the queue
        results = []
        for i in range(nr_processes):
            res = result_queue.get()
            results.append(res)

        # Merge the per-worker result lists into one list.
        all_paths_sorted_wrt_score = list(heapq.merge(*results))
        elapsed = time.time() - start
        print >> Information, "Elapsed time multiprocessing: ", elapsed

    else:
        start = time.time()
        end = set(G)
        iter_nodes = end.copy()
        print >> Information, 'Entering ELS.BetweenScaffolds single core'
        all_paths_sorted_wrt_score = ELS.BetweenScaffolds(G_prime, end, iter_nodes, param)
        elapsed = time.time() - start
        print >> Information, "Elapsed time single core pathfinder: ", elapsed

    ################################################################

    # Maps an end node of an already joined scaffold to
    # [matching side node of the newest scaffold, node at the opposite end].
    start_end_node_update_storage = {}
    print >> Information, 'Total number of paths between scaffolds detected:', len(all_paths_sorted_wrt_score)
    for sublist in reversed(all_paths_sorted_wrt_score):
        path = sublist[2]
        bad_links = sublist[1]
        score = sublist[0]
        path_len = sublist[3]
        print >> Information, 'Path: path length: {0}, nr bad links: {1}, score: {2} '.format((path_len - 2) / 2.0, bad_links, score)

        ## Need something here that keeps track on which contigs that are added to Scaffolds so that a
        ## contig is only present once in each path

        # Either a small contig/scaffold has been included in a path earlier and thus has moved its
        # object to Scaffolds (and changed index) ...
        if any(scaf_[0] not in small_scaffolds for scaf_ in path[1:-1]):
            continue

        # A very special corner case (circular paths)
        if path[0][0] not in Scaffolds and path[-1][0] not in Scaffolds:
            try:
                strt = start_end_node_update_storage[path[0]][0]
                nd = start_end_node_update_storage[path[-1]][0]
                if strt[0] == nd[0]:
                    print >> Information, 'Rare case (circular paths) detected and treated. '
                    continue
            except KeyError:
                pass

        # ... or a large scaffold/contig has changed scaffold index because one of its sides is present
        # in another path (we still want to allow for paths from the other side)
        case1 = 0
        case2 = 0
        if path[0][0] not in Scaffolds:
            if path[0] in start_end_node_update_storage:
                case1 = 1
            else:
                print >> Information, 'Beginning is already in path'
                continue

        if path[-1][0] not in Scaffolds:
            if path[-1] in start_end_node_update_storage:
                case2 = 1
            else:
                print >> Information, 'End is already in path'
                continue

        original_start_node = path[0]

        if path[0][0] not in Scaffolds:
            # large scaffold has changed index before. This suggested path is however from its other side
            node_to_remove1 = path[0]
            path[0] = start_end_node_update_storage[node_to_remove1][0]
            # update the node on the other end of the end scaffold to point at the newest index
            node_to_refresh1 = start_end_node_update_storage[node_to_remove1][1]
            try:
                node_ptr = start_end_node_update_storage[path[-1]][1]
            except KeyError:
                other_side = 'L' if path[-1][1] == 'R' else 'R'
                node_ptr = (path[-1][0], other_side)
            start_end_node_update_storage[node_to_refresh1] = [(param.scaffold_indexer + 1, 'L'), node_ptr]
            # path pointer can be accessed only once, needs to be destroyed after
            del start_end_node_update_storage[node_to_remove1]

        if path[-1][0] not in Scaffolds:
            # large scaffold has changed index before. This suggested path is however from its other side
            node_to_remove2 = path[-1]
            path[-1] = start_end_node_update_storage[node_to_remove2][0]
            # update the node on the other end of the end scaffold to point at the newest index
            node_to_refresh2 = start_end_node_update_storage[node_to_remove2][1]
            try:
                node_ptr = start_end_node_update_storage[original_start_node][1]
            except KeyError:
                other_side = 'L' if original_start_node[1] == 'R' else 'R'
                node_ptr = (original_start_node[0], other_side)
            start_end_node_update_storage[node_to_refresh2] = [(param.scaffold_indexer + 1, 'R'), node_ptr]
            # path pointer can be accessed only once, needs to be destroyed after
            del start_end_node_update_storage[node_to_remove2]

        # Here we update the contigs that lie in small_contigs to Contigs. We need to do this here because
        # we update the scaffold index below.

        # Move all contig and scaffold objects from the "small" structures to the large ones to fit
        # with the UpdateInfo structure.  Every odd path index below the last is one small scaffold.
        small_scafs = [path[i] for i in range(1, len(path) - 1, 2)]
        for item in small_scafs:
            scaf_obj = small_scaffolds[item[0]]
            Scaffolds[item[0]] = scaf_obj
            cont_objects = scaf_obj.contigs
            for obj_ in cont_objects:
                ctg_name = obj_.name
                Contigs[ctg_name] = obj_
                del small_contigs[ctg_name]
            del small_scaffolds[item[0]]
        ## Here we do the "joining of two scaffolds with the new path if no contig/scaffold is present
        ## in another path, we need to update "Scaffolds" structure here along as we go in order for
        ## the above duplicate checking to work

        # make the path a small linear graph
        G_ = nx.Graph()

        # Extend the path at both ends with the opposite side of the end scaffolds.
        first_other_side = 'R' if path[0][1] == 'L' else 'L'
        path.insert(0, (path[0][0], first_other_side))
        last_other_side = 'R' if path[-1][1] == 'L' else 'L'
        path.append((path[-1][0], last_other_side))

        # Placeholder entries for the two outer nodes; the case handling at the
        # end of this iteration stores the real values.
        start_end_node_update_storage[path[0]] = 0
        start_end_node_update_storage[path[-1]] = 0
        G_.add_edges_from(zip(path, path[1:]))

        for edge in G_.edges():
            try:
                G_[edge[0]][edge[1]]['nr_links'] = G_prime[edge[0]][edge[1]]['nr_links']
            except KeyError:
                # Unexpected: a path edge is missing from G_prime.  Dump diagnostics first.
                print >> Information, path
                try:
                    Scaffolds[edge[0][0]]
                    print >> Information, edge[0][0] , 'is in Scaffolds'
                except KeyError:
                    print >> Information, edge[0][0] , 'is not in Scaffolds'
                try:
                    Scaffolds[edge[1][0]]
                    print >> Information, edge[1][0] , 'is in Scaffolds'
                except KeyError:
                    print >> Information, edge[1][0] , 'is not in Scaffolds'

                try:
                    small_scaffolds[edge[0][0]]
                    print >> Information, edge[0][0] , 'is in small_scaffolds'
                except KeyError:
                    print >> Information, edge[0][0] , 'is not in small_scaffolds'
                try:
                    small_scaffolds[edge[1][0]]
                    print >> Information, edge[1][0] , 'is in small_scaffolds'
                except KeyError:
                    print >> Information, edge[1][0] , 'is not in small_scaffolds'

                try:
                    G_prime[edge[0]]
                    print >> Information, edge[0] , 'is in G_prime'
                    print >> Information, G_prime[edge[0]]
                except KeyError:
                    print >> Information, edge[0] , 'is not in G_prime'
                try:
                    G_prime[edge[1]]
                    print >> Information, edge[1] , 'is in G_prime'
                    print >> Information, G_prime[edge[1]]
                except KeyError:
                    print >> Information, edge[1] , 'is not in G_prime'
                # Repeating the failed lookup raises the KeyError again; sys.exit()
                # is reached only if the lookup succeeds this time.
                G_[edge[0]][edge[1]]['nr_links'] = G_prime[edge[0]][edge[1]]['nr_links']
                sys.exit()

            try:
                G_[edge[0]][edge[1]]['obs'] = G_prime[edge[0]][edge[1]]['obs']
            except KeyError:
                # may be the two different sides of a contig (has no gap dist)
                pass

        start = path[0]
        end = path[-1]
        prev_node = ('', '')
        pos = 0
        scaffold_length = 0
        contig_list = []
        param.scaffold_indexer += 1
        (G, contig_list, scaffold_length) = UpdateInfo(G_, Contigs, small_contigs, Scaffolds, small_scaffolds, start, prev_node, pos, contig_list, scaffold_length, dValuesTable, param)
        S = Scaffold.scaffold(param.scaffold_indexer, contig_list, scaffold_length, defaultdict(constant_large), defaultdict(constant_large), defaultdict(constant_small), defaultdict(constant_small))  # Create the new scaffold object

        Scaffolds[S.name] = S        # include in scaffold library

        # add the new scaffold object to G_prime
        G_prime.add_node((S.name, 'L'))  # start node
        G_prime.add_node((S.name, 'R'))  # end node
        G_prime.add_edge((S.name, 'L'), (S.name, 'R'), nr_links=None)
        # Copy every real link edge (nr_links truthy) of the old outer nodes to the new sides.
        for nbr in G_prime.neighbors(start):
            nr_links_ = G_prime[start][nbr]['nr_links']
            if nr_links_:
                obs_ = G_prime[start][nbr]['obs']
                G_prime.add_edge((S.name, 'L'), nbr, nr_links=nr_links_, obs=obs_)

        for nbr in G_prime.neighbors(end):
            nr_links_ = G_prime[end][nbr]['nr_links']
            if nr_links_:
                obs_ = G_prime[end][nbr]['obs']
                G_prime.add_edge((S.name, 'R'), nbr, nr_links=nr_links_, obs=obs_)

        # remove the old scaffold objects from G_prime
        G_prime.remove_nodes_from(path)

        # Record which nodes now stand for the two ends of the new scaffold.
        if case1 and not case2:
            # updated beginning
            start_end_node_update_storage[node_to_refresh1] = [(S.name, 'L'), path[-1]]
            start_end_node_update_storage[path[-1]] = [(S.name, 'R'), node_to_refresh1]
        elif case2 and not case1:
            # updated end
            start_end_node_update_storage[path[0]] = [(S.name, 'L'), node_to_refresh2]
            start_end_node_update_storage[node_to_refresh2] = [(S.name, 'R'), path[0]]
        elif case1 and case2:
            start_end_node_update_storage[node_to_refresh1] = [(S.name, 'L'), node_to_refresh2]
            start_end_node_update_storage[node_to_refresh2] = [(S.name, 'R'), node_to_refresh1]
        else:
            start_end_node_update_storage[path[0]] = [(S.name, 'L'), path[-1]]
            start_end_node_update_storage[path[-1]] = [(S.name, 'R'), path[0]]

    return(start_scaf_index)
def PE(Contigs, Scaffolds, F, Information, output_dest, C_dict, param):
    """Build the scaffold link graph for one paired-end library.

    Opens ``param.bamfile`` with pysam, prepares the contig/scaffold
    libraries, adds one graph edge per scaffold between its ``(name, 'L')``
    and ``(name, 'R')`` end nodes, and then adds or updates link edges for
    read pairs whose two reads map to contigs in different scaffolds.
    Finally stores a per-contig coverage value and the library-wide coverage
    statistics.

    Arguments:
    Contigs      -- dict, contig name -> contig object; filled here when
                    ``param.first_lib`` is true.
    Scaffolds    -- dict, scaffold name -> scaffold object; filled here on
                    the first library, pruned by length on later ones.
    F            -- list receiving the sequences that are left out of the
                    scaffolding because they are shorter than
                    ``param.contig_threshold``.
    Information  -- stream that the ``print >>`` log lines are written to.
    output_dest  -- only forwarded to CalculateMeanCoverage.
    C_dict       -- only forwarded to GetParams.
    param        -- settings object. Read here: bamfile, first_lib,
                    contig_threshold, scaffold_indexer, map_quality,
                    read_len, detect_duplicate, ins_size_threshold.
                    Written here: tot_assembly_length (first library only),
                    current_N50, current_L50, scaffold_indexer,
                    mean_coverage, std_dev_coverage.  GetParams may set
                    more (not visible from this function).

    Returns the tuple ``(G, Contigs, Scaffolds, F, param)``.  When no
    scaffold passes the length filter the first element is ``None`` and the
    function returns before any read is parsed.
    """
    G = nx.Graph()
    print 'Parsing BAM file...'
    #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)}
    #I switched to look at mates instead since BWA can give false flag combinations for
    # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse
    #does not happen.
    #informative_pair={161:(True,False),145:(False,True),129:(True,True),177:(False,False)} #,131:(True,True),179:(False,False)} #147:(False,True),163:(True,False),
    with pysam.Samfile(
            param.bamfile, 'rb'
    ) as bam_file:  #once real data, change to 'rb', simulated files are on SAM format

        #Get parameters -r, -m, -s, -T, -t for library
        print 'Computing parameters not set by user...'
        GetParams(bam_file, param, Scaffolds, C_dict, F, Contigs)

        #Clean contig_library
        singeled_out = 0
        if param.first_lib:
            # First library: the contig/scaffold objects do not exist yet, so
            # they are created from the reference names and lengths in the
            # BAM header.
            cont_lengths = bam_file.lengths
            cont_lengths = [int(nr) for nr in cont_lengths
                            ]  #convert long to int object
            cont_names = bam_file.references

            #Calculate NG50 and LG 50
            param.tot_assembly_length = sum(cont_lengths)
            sorted_lengths = sorted(cont_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50

            ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
            ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
            for i in range(0, len(cont_names)):
                if cont_lengths[i] >= param.contig_threshold:
                    # Long enough: wrap the contig in its own one-contig scaffold.
                    C = Contig.contig(cont_names[i])  # Create object contig
                    C.length = cont_lengths[i]
                    scaf_length = C.length  # Initially, scaffold consists of only this contig
                    C.direction = True  # always in same direction first, False=reverse
                    C.position = 0  #position always 0
                    C.links = {}
                    Contigs[
                        C.
                        name] = C  # Create a dict with name as key and the object container as value
                    # NOTE(review): the two empty dicts are presumably the
                    # neighbour-observation maps read further down as
                    # left_nbrs_obs / right_nbrs_obs -- confirm against the
                    # Scaffold class.
                    S = Scaffold.scaffold(param.scaffold_indexer, [C],
                                          scaf_length, {},
                                          {})  # Create object scaffold
                    Scaffolds[S.name] = S
                    C.scaffold = S.name
                    param.scaffold_indexer += 1
                else:
                    # Too short: no objects are created; the contig is only
                    # recorded in F as a one-element list.
                    singeled_out += 1
                    F.append([
                        (cont_names[i], True, 0, cont_lengths[i], {})
                    ])  #list of (contig_name, pos_direction, position,length)
            print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(
                singeled_out)
        else:
            # Later library: Scaffolds already holds the result of earlier
            # rounds, so statistics come from scaffold lengths and scaffolds
            # below the threshold are moved out to F.
            #Clean contig_library/scaffold_library
            scaf_lengths = [
                Scaffolds[scaffold_].s_length
                for scaffold_ in Scaffolds.keys()
            ]
            sorted_lengths = sorted(scaf_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50
            for scaffold_ in Scaffolds.keys(
            ):  #iterate over keys in hash, so that we can remove keys while iterating over it
                if Scaffolds[scaffold_].s_length < param.contig_threshold:
                    ###  Go to function and print to F
                    ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs
                    S_obj = Scaffolds[scaffold_]
                    list_of_contigs = S_obj.contigs  #list of contig objects contained in scaffold object
                    Contigs, F = GO.WriteToF(
                        F, Contigs, list_of_contigs
                    )  #Don't worry, the contig objects are removed in WriteTOF function
                    del Scaffolds[scaffold_]
                    singeled_out += 1
            print >> Information, 'Nr of contigs/scaffolds that was singeled out due to length constraints ' + str(
                singeled_out)

        #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
        print 'Nr of contigs/scaffolds included in scaffolding: ' + str(
            len(Scaffolds))  #,Scaffolds.keys()
        if len(Scaffolds) == 0:
            # Nothing left to scaffold: signal this with None in place of the graph.
            return (None, Contigs, Scaffolds, F, param)
        cnt = 0
        tot_start = time()
        start1 = time()
        for scaffold_ in Scaffolds:
            # nr_links=None marks the edge joining the two ends of one
            # scaffold; link edges between scaffolds carry an integer count.
            G.add_edge(
                (scaffold_, 'L'), (scaffold_, 'R'), nr_links=None
            )  #this is a scaffold object but can be both a single contig or a scaffold.
            Scaffolds[scaffold_].scaffold_left_nbrs = {}
            Scaffolds[scaffold_].scaffold_right_nbrs = {}
            if cnt % 100000 == 0 and cnt > 0:
                elapsed = time() - start1
                print >> Information, 'Total nr of keys added: ', cnt, 'Time for adding last 100 000 keys: ', elapsed
                start1 = time()
            cnt += 1
        print 'Total time elapsed: ', time() - tot_start
        # Create the link edges in the graph by fetching info from bam file

        # contig name -> [sum of rlen over reads mapped to it, contig length]
        cont_aligned_len = {}
        for contig in Contigs:
            cont_aligned_len[contig] = [0, Contigs[contig].length]

        count = 0
        non_unique = 0
        non_unique_for_scaf = 0
        nr_of_duplicates = 0
        # Observations of the most recently processed (not skipped) pair,
        # used for the duplicate check below; -1 means "none yet".
        prev_obs1 = -1
        prev_obs2 = -1
        reads_with_too_long_insert = 0
        #fishy_reads = {}
        for alignedread in bam_file:
            try:  #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA
                contig1 = bam_file.getrname(alignedread.rname)
                contig2 = bam_file.getrname(alignedread.mrnm)
            except ValueError:
                continue
            #contig1=bam_file.getrname(alignedread.rname)
            ## add to coverage computation if contig is still in the list of considered contigs
            try:
                cont_aligned_len[contig1][0] += alignedread.rlen
            except KeyError:
                pass
########## CREATE EDGES IN SCAFFOLD GRAPH ##########

            # Only the second read of a pair is inspected, so each pair is
            # handled once.
            if contig1 != contig2 and alignedread.is_read2:
                #check how many non unique reads out of the useful ones (mapping to two different contigs)
                #This only works for BWA!! implement for other aligners as well
                if alignedread.mapq == 0:
                    non_unique += 1
                    #print contig1,contig2
                if contig1 in Contigs and contig2 in Contigs and Contigs[
                        contig2].scaffold != Contigs[
                            contig1].scaffold and alignedread.mapq > param.map_quality:  # and alignedread.tags[0][1] == 'U':
                    #if alignedread.tags[0][1] != 'U':
                    #    non_unique_for_scaf += 1
                    # NOTE(review): this branch requires
                    # mapq > param.map_quality, so the counter below can only
                    # grow when param.map_quality is negative.
                    if alignedread.mapq == 0:
                        non_unique_for_scaf += 1
                    count += 1
                    #(read_dir,mate_dir)=informative_pair[flag_type]
                    (read_dir, mate_dir) = (not alignedread.is_reverse,
                                            not alignedread.mate_is_reverse)
                    scaf1 = Contigs[contig1].scaffold
                    scaf2 = Contigs[contig2].scaffold
                    #Calculate actual position on scaffold here
                    #position1 cont/scaf1
                    cont_dir1 = Contigs[
                        contig1].direction  #if pos : L if neg: R
                    cont1_pos = Contigs[contig1].position
                    readpos = alignedread.pos
                    cont1_len = Contigs[contig1].length
                    s1len = Scaffolds[scaf1].s_length
                    #position1 cont1/scaf1
                    cont_dir2 = Contigs[contig2].direction
                    cont2_pos = Contigs[contig2].position
                    matepos = alignedread.mpos
                    cont2_len = Contigs[contig2].length
                    s2len = Scaffolds[scaf2].s_length
                    # obs1/obs2 are per-scaffold observations, scaf_side1/2
                    # the scaffold ends ('L' or 'R') the pair connects.
                    # NOTE(review): presumably obsN is the distance from the
                    # read to that scaffold end -- confirm in
                    # PosDirCalculatorPE.
                    (obs1, obs2, scaf_side1, scaf_side2) = PosDirCalculatorPE(
                        cont_dir1, read_dir, cont1_pos, readpos, s1len,
                        cont1_len, cont_dir2, mate_dir, cont2_pos, matepos,
                        s2len, cont2_len, param.read_len)
                    # A pair whose two observations equal those of the
                    # previously processed pair is counted as a duplicate; it
                    # is skipped only when duplicate detection is switched
                    # on.  A skipped pair does not update prev_obs1/2.
                    if obs1 == prev_obs1 and obs2 == prev_obs2:
                        nr_of_duplicates += 1
                        if param.detect_duplicate:
                            continue

                    if obs1 + obs2 < param.ins_size_threshold:
                        #                        if obs1 == 3 or obs2 ==3:
                        #                            print alignedread.pos,alignedread.mpos, contig1, contig2, scaf1, scaf2, s1len,s2len
                        # For each of the two scaffolds, remember the
                        # smallest observation seen towards the neighbouring
                        # scaffold end, in the map of the side the link
                        # leaves from.
                        if scaf_side1 == 'R':
                            if (scaf2, scaf_side2
                                ) in Scaffolds[scaf1].right_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].right_nbrs_obs[(
                                        scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].right_nbrs_obs[(
                                        scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].right_nbrs_obs[(
                                    scaf2, scaf_side2)] = obs1
                        if scaf_side1 == 'L':
                            if (scaf2, scaf_side2
                                ) in Scaffolds[scaf1].left_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].left_nbrs_obs[(
                                        scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].left_nbrs_obs[(
                                        scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].left_nbrs_obs[(
                                    scaf2, scaf_side2)] = obs1
                        if scaf_side2 == 'R':
                            if (scaf1, scaf_side1
                                ) in Scaffolds[scaf2].right_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].right_nbrs_obs[(
                                        scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].right_nbrs_obs[(
                                        scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].right_nbrs_obs[(
                                    scaf1, scaf_side1)] = obs2
                        if scaf_side2 == 'L':
                            if (scaf1, scaf_side1
                                ) in Scaffolds[scaf2].left_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].left_nbrs_obs[(
                                        scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].left_nbrs_obs[(
                                        scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].left_nbrs_obs[(
                                    scaf1, scaf_side1)] = obs2

                        # Link edge between the two scaffold ends: nr_links
                        # counts the supporting pairs, gap_dist accumulates
                        # the sum of obs1 + obs2 over them.
                        if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]:
                            G.add_edge((scaf2, scaf_side2),
                                       (scaf1, scaf_side1),
                                       nr_links=1,
                                       gap_dist=obs1 + obs2)
                        else:
                            G.edge[(scaf1,
                                    scaf_side1)][(scaf2,
                                                  scaf_side2)]['nr_links'] += 1
                            G.edge[(scaf1, scaf_side1)][(
                                scaf2, scaf_side2)]['gap_dist'] += obs1 + obs2
                    else:
                        reads_with_too_long_insert += 1
                        #fishy_reads[alignedread.qname[:-1]]=[contig2,alignedread.is_read2]
                        ## add to haplotype graph here!!

                    prev_obs1 = obs1
                    prev_obs2 = obs2

                elif contig1 in Contigs and contig2 in Contigs and Contigs[
                        contig2].scaffold != Contigs[contig1].scaffold:
                    # Reached for pairs that link two scaffolds but fail the
                    # mapping-quality filter above; currently a no-op.
                    ########################Use to validate scaffold in previous step here ############
                    pass


#        print 'NR OF FISHY EDGES: ', len(fishy_reads)
        print 'USEFUL READS (reads mapping to different contigs): ', count
        #print 'Non unique portion out of "USEFUL READS"  (filtered out from scaffolding): ', non_unique
        #print 'Non unique used for scaf: ', non_unique_for_scaf
        print 'Reads with too large insert size from "USEFUL READS" (filtered out): ', reads_with_too_long_insert
        if param.detect_duplicate:
            print 'Number of duplicated reads indicated and removed: ', nr_of_duplicates

    ##### Calc coverage for all contigs with current lib here #####
        # NOTE(review): sum_x, sum_x_sq and n are accumulated below but never
        # read again in this function; the statistics stored on param come
        # from CalculateMeanCoverage instead.
        sum_x = 0
        sum_x_sq = 0
        n = 0
        for contig in cont_aligned_len:
            # coverage = summed read lengths on the contig / contig length
            cont_coverage = cont_aligned_len[contig][0] / float(
                cont_aligned_len[contig][1])
            #print key, cont_aligned_len[key]/float(cont_lengths[i])
            try:
                Contigs[contig].coverage = cont_coverage
            except KeyError:
                pass
            sum_x += cont_coverage
            sum_x_sq += cont_coverage**2
            n += 1

        mean_cov, std_dev_cov = CalculateMeanCoverage(Contigs, param.first_lib,
                                                      output_dest,
                                                      param.bamfile)
        param.mean_coverage = mean_cov
        param.std_dev_coverage = std_dev_cov

    return (G, Contigs, Scaffolds, F, param)
Example #10
0
    #print paths
    if len(paths) > 1:
        ScorePaths(G_prime, paths, all_paths, param)
        all_paths.sort(key=lambda list_: list_[0])

        if len(all_paths) > 0:
            return all_paths

    return []


if __name__ == '__main__':
    import Scaffold
    small_scaffolds_test = {}
    for i in range(1, 7):
        S = Scaffold.scaffold(i, 0, 0, {}, {})
        small_scaffolds_test[S.name] = S
    start = time()
    G_prime = nx.Graph()
    #G.add_nodes_from([(1, 'L'), (1, 'R'), (2, 'L'), (2, 'R'), (3, 'L'), (3, 'R'), (4, 'L'), (4, 'R'), (5, 'L'), (5, 'R')])
    for i in range(1, 7):
        G_prime.add_edge((i, 'L'), (i, 'R'), {'nr_links': 0})
    G_prime.add_edges_from([((1, 'R'), (2, 'R'), {
        'nr_links': 1
    }), ((3, 'L'), (4, 'L'), {
        'nr_links': 1
    }), ((2, 'L'), (3, 'R'), {
        'nr_links': 1
    }), ((1, 'R'), (5, 'L'), {
        'nr_links': 2
    }), ((5, 'R'), (4, 'L'), {
Example #11
0
def InitializeObjects(bam_file, Contigs, Scaffolds, param, Information,
                      G_prime, small_contigs, small_scaffolds, C_dict):
    """Create contig and scaffold objects for the references of a BAM file.

    Every reference name that is also a key of C_dict gets a contig object
    wrapped in its own one-contig scaffold.  References with a length of at
    least ``param.contig_threshold`` are stored in Contigs/Scaffolds; shorter
    ones with a positive length are stored in small_contigs/small_scaffolds
    and counted as singled out.  The sequence of every stored contig is moved
    out of C_dict.  References missing from C_dict, or short ones with a
    non-positive length, are skipped.

    Also sets ``param.tot_assembly_length``, ``param.current_N50`` and
    ``param.current_L50`` from all reference lengths in the header, and
    advances ``param.scaffold_indexer`` by one per created scaffold.

    Arguments:
    bam_file        -- object exposing ``lengths`` and ``references``.
    Contigs         -- dict filled with name -> contig object.
    Scaffolds       -- dict filled with name -> scaffold object.
    param           -- settings object (see above for fields used).
    Information     -- stream receiving the ``print >>`` log lines.
    G_prime         -- not used by this function.
    small_contigs   -- dict filled with the contigs below the threshold.
    small_scaffolds -- dict filled with the scaffolds of those contigs.
    C_dict          -- dict, reference name -> sequence; consumed entries
                       are removed.

    Returns an empty tuple.
    """
    nr_below_threshold = 0
    min_length = param.contig_threshold
    ref_lengths = [int(raw_length)
                   for raw_length in bam_file.lengths]  #convert long to int object
    ref_names = bam_file.references

    #Calculate NG50 and LG 50
    param.tot_assembly_length = sum(ref_lengths)
    N50, L50 = CalculateStats(
        sorted(ref_lengths, reverse=True), [], param, Information)
    param.current_L50 = L50
    param.current_N50 = N50

    batch_start = time()
    for processed, ref_name in enumerate(ref_names, 1):
        # Progress/timing line for every 100 000 references looked at.
        if processed % 100000 == 0:
            print >> Information, 'Time adding 100k keys', time() - batch_start
            batch_start = time()
        if ref_name not in C_dict:
            continue

        # Pick the pair of libraries this reference belongs to.
        ref_length = ref_lengths[processed - 1]
        if ref_length >= min_length:
            contig_store, scaffold_store = Contigs, Scaffolds
            below_threshold = False
        elif ref_length > 0:
            contig_store, scaffold_store = small_contigs, small_scaffolds
            below_threshold = True
        else:
            #In case of contigs with size 0 (due to some error in fasta file)
            continue

        new_contig = Contig.contig(ref_name)
        new_contig.length = ref_length
        scaffold_length = new_contig.length  # the scaffold starts out as just this contig
        new_contig.sequence = C_dict.pop(ref_name)
        new_contig.direction = True  # always forward at first, False=reverse
        new_contig.position = 0  # a lone contig starts its scaffold
        contig_store[new_contig.name] = new_contig

        new_scaffold = Scaffold.scaffold(param.scaffold_indexer, [new_contig],
                                         scaffold_length)
        scaffold_store[new_scaffold.name] = new_scaffold
        new_contig.scaffold = new_scaffold.name
        param.scaffold_indexer += 1
        if below_threshold:
            nr_below_threshold += 1
    # Only unbinds the local name; the caller's dictionary object is untouched.
    del C_dict

    print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(
        nr_below_threshold)
    return ()
Example #12
0
def PE(Contigs, Scaffolds, bamfile, mean, scaffold_indexer, F, read_len):
    """Build the scaffold graph from read pairs that join two scaffolds.

    Every reference in the header of ``bamfile`` becomes a contig object
    wrapped in its own one-contig scaffold named ``'s' + str(index)``.  Each
    scaffold contributes two graph nodes, ``(name, 'L')`` and ``(name, 'R')``,
    joined by an edge with ``nr_links=None``.  Each second-in-pair record
    whose flag is one of the four listed in ``informative_pair`` and whose
    two contigs sit in different scaffolds then adds one link between the
    scaffold ends returned by PosDirCalculatorPE.

    Arguments:
    Contigs          -- dict filled with contig name -> contig object.
    Scaffolds        -- dict filled with scaffold name -> scaffold object.
    bamfile          -- path handed to ``pysam.Samfile`` (opened in 'r' mode).
    mean             -- not used by this function.
    scaffold_indexer -- integer used to number the new scaffolds.
    F                -- returned unchanged.
    read_len         -- forwarded to PosDirCalculatorPE.

    Returns ``(G, Contigs, Scaffolds, F, scaffold_indexer)`` where
    scaffold_indexer has been advanced past the scaffolds created here.
    Link edges carry ``nr_links`` (number of supporting pairs) and
    ``gap_dist`` (list with one PosDirCalculatorPE gap value per pair).
    """
    G = nx.Graph()
    print('Parsing BAM file...')
    #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)}
    #I switched to look at mates instead since BWA can give false flag combinations for
    # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse
    #does not happen.
    # SAM flag -> (read is forward, mate is forward)
    informative_pair = {
        161: (True, False),
        145: (False, True),
        129: (True, True),
        177: (False, False)
    }
    with pysam.Samfile(
            bamfile, 'r'
    ) as bam_file:  #once real data, change to 'rb', simulated files are on SAM format
        cont_lengths = [int(nr)
                        for nr in bam_file.lengths]  #convert long to int object
        cont_names = bam_file.references
        ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
        ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
        for cont_name, cont_length in zip(cont_names, cont_lengths):
            C = Contig.contig(cont_name)  # Create object contig
            C.length = cont_length
            C.scaf_length = C.length  # Initially, scaffold consists of only this contig
            C.direction = True  # always in same direction first, False=reverse
            C.position = 0  #position always 0
            C.links = {}
            Contigs[C.name] = C  # name -> contig object
            S = Scaffold.scaffold('s' + str(scaffold_indexer), [C],
                                  C.length)  # Create object scaffold
            Scaffolds[S.name] = S
            C.scaffold = S.name
            G.add_node((S.name, 'L'), length=cont_length)
            G.add_node((S.name, 'R'), length=cont_length)
            scaffold_indexer += 1

        #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
        print('Nr of contigs/scaffolds included in scaffolding: ' + str(
            len(Scaffolds)))

        # nr_links=None marks the edge joining the two ends of one scaffold;
        # Scaffolds may also hold entries the caller passed in.
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None)

        # Create the link edges in the graph by fetching info from bam file
        for alignedread in bam_file:
            flag_type = alignedread.flag
            if flag_type not in informative_pair:
                continue
            contig1 = bam_file.getrname(alignedread.rname)
            contig2 = bam_file.getrname(alignedread.mrnm)
            if contig1 not in Contigs or contig2 not in Contigs:
                continue
            cont_obj1 = Contigs[contig1]
            cont_obj2 = Contigs[contig2]
            scaf1 = cont_obj1.scaffold
            scaf2 = cont_obj2.scaffold
            if not scaf2 != scaf1:
                # Both reads lie in the same scaffold: no link information.
                # (The original had a second branch with this very
                # condition's complement repeated; it could never run and
                # has been dropped.)
                continue

            (read_dir, mate_dir) = informative_pair[flag_type]
            #Calculate actual position on scaffold here
            (gap, scaf_side1, scaf_side2) = PosDirCalculatorPE(
                cont_obj1.direction, read_dir, cont_obj1.position,
                alignedread.pos, Scaffolds[scaf1].s_length, cont_obj1.length,
                cont_obj2.direction, mate_dir, cont_obj2.position,
                alignedread.mpos, Scaffolds[scaf2].s_length, cont_obj2.length,
                read_len)

            end1 = (scaf1, scaf_side1)
            end2 = (scaf2, scaf_side2)
            if end2 not in G[end1]:
                G.add_edge(end2, end1, nr_links=1, gap_dist=[gap])
            else:
                # G[u][v] is the edge attribute dict itself, so it can be
                # updated in place (and, unlike G.edge, is not tied to
                # networkx 1.x).
                link_data = G[end1][end2]
                link_data['nr_links'] += 1
                link_data['gap_dist'].append(gap)

    return (G, Contigs, Scaffolds, F, scaffold_indexer)
Example #13
0
def PE(Contigs, Scaffolds, F, Information, output_dest, C_dict, param):
    G = nx.Graph()
    print 'Parsing BAM file...'
    #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)}
    #I switched to look at mates instead since BWA can give false flag combinations for
    # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse
    #does not happen.
    #informative_pair={161:(True,False),145:(False,True),129:(True,True),177:(False,False)} #,131:(True,True),179:(False,False)} #147:(False,True),163:(True,False),
    with pysam.Samfile(param.bamfile, 'rb') as bam_file:    #once real data, change to 'rb', simulated files are on SAM format

        #Get parameters -r, -m, -s, -T, -t for library
        print 'Computing parameters not set by user...'
        GetParams(bam_file, param, Scaffolds, C_dict, F, Contigs)

        #Clean contig_library
        singeled_out = 0
        if param.first_lib:
            cont_lengths = bam_file.lengths
            cont_lengths = [int(nr) for nr in cont_lengths]  #convert long to int object
            cont_names = bam_file.references

            #Calculate NG50 and LG 50
            param.tot_assembly_length = sum(cont_lengths)
            sorted_lengths = sorted(cont_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50


####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, 
####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
            for i in range(0, len(cont_names)):
                if cont_lengths[i] >= param.contig_threshold:
                    C = Contig.contig(cont_names[i])   # Create object contig
                    C.length = cont_lengths[i]
                    scaf_length = C.length        # Initially, scaffold consists of only this contig    
                    C.direction = True              # always in same direction first, False=reverse
                    C.position = 0                  #position always 0
                    C.links = {}
                    Contigs[C.name] = C              # Create a dict with name as key and the object container as value
                    S = Scaffold.scaffold(param.scaffold_indexer, [C], scaf_length, {}, {})  # Create object scaffold
                    Scaffolds[S.name] = S
                    C.scaffold = S.name
                    param.scaffold_indexer += 1
                else:
                    singeled_out += 1
                    F.append([(cont_names[i], True, 0, cont_lengths[i], {})])   #list of (contig_name, pos_direction, position,length)
            print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(singeled_out)
        else:
                #Clean contig_library/scaffold_library
            scaf_lengths = [Scaffolds[scaffold_].s_length for scaffold_ in Scaffolds.keys()]
            sorted_lengths = sorted(scaf_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50
            for scaffold_ in Scaffolds.keys(): #iterate over keys in hash, so that we can remove keys while iterating over it
                if Scaffolds[scaffold_].s_length < param.contig_threshold:
                    ###  Go to function and print to F
                    ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs
                    S_obj = Scaffolds[scaffold_]
                    list_of_contigs = S_obj.contigs   #list of contig objects contained in scaffold object
                    Contigs, F = GO.WriteToF(F, Contigs, list_of_contigs)  #Don't worry, the contig objects are removed in WriteTOF function
                    del Scaffolds[scaffold_]
                    singeled_out += 1
            print >> Information, 'Nr of contigs/scaffolds that was singeled out due to length constraints ' + str(singeled_out)


        #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
        print 'Nr of contigs/scaffolds included in scaffolding: ' + str(len(Scaffolds))#,Scaffolds.keys()
        if len(Scaffolds) == 0:
            return(None, Contigs, Scaffolds, F, param)
        cnt = 0
        tot_start = time()
        start1 = time()
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None)    #this is a scaffold object but can be both a single contig or a scaffold.
            Scaffolds[ scaffold_ ].scaffold_left_nbrs = {}
            Scaffolds[ scaffold_ ].scaffold_right_nbrs = {}
            if cnt % 100000 == 0 and cnt > 0:
                elapsed = time() - start1
                print >> Information, 'Total nr of keys added: ', cnt, 'Time for adding last 100 000 keys: ', elapsed
                start1 = time()
            cnt += 1
        print 'Total time elapsed: ', time() - tot_start
        # Create the link edges in the graph by fetching info from bam file

        cont_aligned_len = {}
        for contig in Contigs:
            cont_aligned_len[contig] = [0, Contigs[contig].length]

        count = 0
        non_unique = 0
        non_unique_for_scaf = 0
        nr_of_duplicates = 0
        prev_obs1 = -1
        prev_obs2 = -1
        reads_with_too_long_insert = 0
        #fishy_reads = {}
        for alignedread in bam_file:
            try: #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA
                contig1 = bam_file.getrname(alignedread.rname)
                contig2 = bam_file.getrname(alignedread.mrnm)
            except ValueError:
                continue
            #contig1=bam_file.getrname(alignedread.rname)
            ## add to coverage computation if contig is still in the list of considered contigs
            try:
                cont_aligned_len[contig1][0] += alignedread.rlen
            except KeyError:
                pass
########## CREATE EDGES IN SCAFFOLD GRAPH ##########

            if contig1 != contig2 and alignedread.is_read2:
                #check how many non unique reads out of the useful ones (mapping to two different contigs)
                #This only works for BWA!! implement for other aligners as well
                if alignedread.mapq == 0:
                    non_unique += 1
                    #print contig1,contig2
                if contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold and alignedread.mapq > param.map_quality: # and alignedread.tags[0][1] == 'U':
                    #if alignedread.tags[0][1] != 'U':
                    #    non_unique_for_scaf += 1
                    if alignedread.mapq == 0:
                        non_unique_for_scaf += 1
                    count += 1
                    #(read_dir,mate_dir)=informative_pair[flag_type]
                    (read_dir, mate_dir) = (not alignedread.is_reverse, not alignedread.mate_is_reverse)
                    scaf1 = Contigs[contig1].scaffold
                    scaf2 = Contigs[contig2].scaffold
                    #Calculate actual position on scaffold here
                    #position1 cont/scaf1
                    cont_dir1 = Contigs[contig1].direction  #if pos : L if neg: R
                    cont1_pos = Contigs[contig1].position
                    readpos = alignedread.pos
                    cont1_len = Contigs[contig1].length
                    s1len = Scaffolds[scaf1].s_length
                    #position1 cont1/scaf1                        
                    cont_dir2 = Contigs[contig2].direction
                    cont2_pos = Contigs[contig2].position
                    matepos = alignedread.mpos
                    cont2_len = Contigs[contig2].length
                    s2len = Scaffolds[scaf2].s_length
                    (obs1, obs2, scaf_side1, scaf_side2) = PosDirCalculatorPE(cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, param.read_len)
                    if obs1 == prev_obs1 and obs2 == prev_obs2:
                        nr_of_duplicates += 1
                        if param.detect_duplicate:
                            continue

                    if obs1 + obs2 < param.ins_size_threshold:
#                        if obs1 == 3 or obs2 ==3:
#                            print alignedread.pos,alignedread.mpos, contig1, contig2, scaf1, scaf2, s1len,s2len
                        if scaf_side1 == 'R':
                            if (scaf2, scaf_side2) in Scaffolds[scaf1].right_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1
                        if scaf_side1 == 'L':
                            if (scaf2, scaf_side2) in Scaffolds[scaf1].left_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1
                        if scaf_side2 == 'R':
                            if (scaf1, scaf_side1) in Scaffolds[scaf2].right_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2
                        if scaf_side2 == 'L':
                            if (scaf1, scaf_side1) in Scaffolds[scaf2].left_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2

                        if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]:
                            G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=obs1 + obs2)
                        else:
                            G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1
                            G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['gap_dist'] += obs1 + obs2
                    else:
                        reads_with_too_long_insert += 1
                        #fishy_reads[alignedread.qname[:-1]]=[contig2,alignedread.is_read2]
                        ## add to haplotype graph here!!

                    prev_obs1 = obs1
                    prev_obs2 = obs2

                elif contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold:
########################Use to validate scaffold in previous step here ############
                    pass
#        print 'NR OF FISHY EDGES: ', len(fishy_reads)
        print 'USEFUL READS (reads mapping to different contigs): ', count
    #print 'Non unique portion out of "USEFUL READS"  (filtered out from scaffolding): ', non_unique
        #print 'Non unique used for scaf: ', non_unique_for_scaf
        print 'Reads with too large insert size from "USEFUL READS" (filtered out): ', reads_with_too_long_insert
        if param.detect_duplicate:
            print 'Number of duplicated reads indicated and removed: ', nr_of_duplicates

    ##### Calc coverage for all contigs with current lib here #####
        sum_x = 0
        sum_x_sq = 0
        n = 0
        for contig in cont_aligned_len:
            cont_coverage = cont_aligned_len[contig][0] / float(cont_aligned_len[contig][1])
                #print key, cont_aligned_len[key]/float(cont_lengths[i])
            try:
                Contigs[contig].coverage = cont_coverage
            except KeyError:
                pass
            sum_x += cont_coverage
            sum_x_sq += cont_coverage ** 2
            n += 1

        mean_cov, std_dev_cov = CalculateMeanCoverage(Contigs, param.first_lib, output_dest, param.bamfile)
        param.mean_coverage = mean_cov
        param.std_dev_coverage = std_dev_cov


    return(G, Contigs, Scaffolds, F, param)
Example #14
0
    def AddEdges(Contigs,Scaffolds,bamfile,mean,std_dev,scaffold_indexer,F,read_len):
        #Clean contig_library
        bam_object = BamParser(bamfile)
        singeled_out=0
        cont_lengths= bam_object.bam_file.lengths
        cont_lengths=[int(nr) for nr in cont_lengths]  #convert long to int object
        #print cont_lengths
        cont_names = bam_object.bam_file.references
        ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, 
        ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
        for i in range(0,len(cont_names)):
            if cont_lengths[i] >= 300:
                C=Contig.contig(cont_names[i])   # Create object contig
                C.length = cont_lengths[i]
                C.scaf_length = C.length        # Initially, scaffold consists of only this contig
                C.direction = True              # always in same direction first, False=reverse
                C.position = 0                  #position always 0
                C.links = {}
                Contigs[C.name] = C              # Create a dict with name as key and the object container as value
                S=Scaffold.scaffold('s'+str(scaffold_indexer),[C],C.length)  # Create object scaffold
                Scaffolds[S.name]=S
                C.scaffold=S.name
                G.add_node((S.name,'L'),length=cont_lengths[i])
                G.add_node((S.name,'R'),length=cont_lengths[i])
                scaffold_indexer+=1
        
        
        #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
        #print 'Nr of contigs/scaffolds included in scaffolding: '+ str(len(Scaffolds))#,Scaffolds.keys()
        
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_,'L'),(scaffold_,'R'),nr_links=None)    #this is a scaffold object but can be both a single contig or a scaffold.
        
        
        # Create the link edges in the graph by fetching info from bam file


        def nr_softclipps(read):
            max_soft = 0
            for type_,length in read.cigar:
                if type_ == 4 and length >= max_soft:
                    max_soft = length
            return max_soft

        global_max_softclipps = 0
        global_min_obs = 100000 
        links_used = 0
        #r_len = float(read_len)
        for read1,read2 in bam_object.unique_reads_on_different_references():
            contig1=bam_object.bam_file.getrname(read1.rname)
            contig2=bam_object.bam_file.getrname(read2.rname)
            max_soft_readpair = max(nr_softclipps(read1),nr_softclipps(read2))
            if max_soft_readpair > global_max_softclipps:
                global_max_softclipps = max_soft_readpair
            # print read1.cigar
            #if read1.qlen/r_len < 0.7 or read2.qlen/r_len < 0.7:
            #    continue
            #     print 'midddle1',o1, o1+o2, read1.pos, read1.mapq,read1.qlen,read1.rlen, read1.cigar, read1.tags
            # if read2.qlen < 50:
            #     print 'midddle2',o2, o1+o2, read2.pos, read2.mapq, read2.qlen,read2.rlen, read2.cigar, read2.tags
            if contig1 in Contigs and contig2 in Contigs:                
                (read_dir,mate_dir) = (not read1.is_reverse,not read2.is_reverse )
                scaf1=Contigs[contig1].scaffold
                scaf2=Contigs[contig2].scaffold                    
                #Calculate actual position on scaffold here
                #position1 cont/scaf1
                cont_dir1 = Contigs[contig1].direction  #if pos : L if neg: R
                cont1_pos = Contigs[contig1].position
                readpos = read1.pos
                cont1_len = Contigs[contig1].length
                s1len = Scaffolds[scaf1].s_length
                #position1 cont1/scaf1                        
                cont_dir2 = Contigs[contig2].direction
                cont2_pos = Contigs[contig2].position
                matepos = read2.pos
                cont2_len = Contigs[contig2].length
                s2len = Scaffolds[scaf2].s_length 
                (obs,scaf_side1,scaf_side2, (o1,o2))=PosDirCalculatorPE(cont_dir1,read_dir,cont1_pos,readpos,s1len,cont1_len,cont_dir2,mate_dir,cont2_pos,matepos,s2len,cont2_len,read_len) 
                if obs < mean+ 4*std_dev: 
                    links_used += 1
                    if (scaf2,scaf_side2) not in G[(scaf1,scaf_side1)]:
                        G.add_edge((scaf2,scaf_side2),(scaf1,scaf_side1),nr_links=1,gap_dist=[obs],obs_pos=set() )
                        G[(scaf2,scaf_side2)][(scaf1,scaf_side1)]['obs_pos'].add((o1,o2))
                        if o1 < global_min_obs:
                            global_min_obs = o1
                        if o2 < global_min_obs:
                            global_min_obs = o2 
                    #print 'Added edge'
                    else:
                        try:
                            if (o1,o2) in G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos']:
                                continue
                        except KeyError:
                            #print G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]
                            continue

                        # if (o1,o2) in G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos']:
                        #     #print 'detected duplicate'
                        #     continue
                        else:
                            G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['nr_links'] += 1
                            G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['gap_dist'].append(obs)  
                            G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos'].add((o1,o2))  
                            G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos'].add((o2,o1))  

                            if o1 < global_min_obs:
                                global_min_obs = o1
                            if o2 < global_min_obs:
                                global_min_obs = o2
                            # if o1 < 50:
                            #     print o1, o1+o2, read1.pos, read1.mapq,read1.qlen,read1.rlen, read1.cigar, read1.tags
                            #     #print fancy_str(read1)
                            # if o2 < 50:
                            #     print o2, o1+o2, read2.pos, read2.mapq, read2.qlen,read2.rlen, read2.cigar, read2.tags
                            #     #print fancy_str(read2)                                
                                

        print 'Max softclipps:', global_max_softclipps
        print 'Min obs:', global_min_obs
        # sys.exit()
        #print 'Nr links used:', links_used
        return global_max_softclipps
Example #15
0
def PE(Contigs, Scaffolds, bamfile, mean, scaffold_indexer, F, read_len):
    """Build the scaffold graph for one alignment file and return it.

    Every reference sequence in the file header becomes a contig object and a
    single-contig scaffold named "s<index>"; each scaffold contributes an "L"
    and an "R" node joined by an edge with ``nr_links=None``. Alignment
    records whose flag is one of the four values in ``informative_pair`` and
    whose read and mate lie on contigs of different scaffolds add (or update)
    a link edge carrying ``nr_links`` (count) and ``gap_dist`` (list of the
    gap values returned by PosDirCalculatorPE).

    Parameters:
      Contigs          -- dict, contig name -> contig object; filled in place.
      Scaffolds        -- dict, scaffold name -> scaffold object; filled in place.
      bamfile          -- path handed to pysam.Samfile (opened in mode "r").
      mean             -- not used by this function.
      scaffold_indexer -- first index used for scaffold names.
      F                -- not used here; returned unchanged.
      read_len         -- forwarded to PosDirCalculatorPE.

    Returns the tuple (G, Contigs, Scaffolds, F, scaffold_indexer), where
    scaffold_indexer has been advanced by the number of scaffolds created.
    """
    G = nx.Graph()
    print("Parsing BAM file...")
    # flag -> (read_dir, mate_dir). All four flags have the 128 bit set.
    # I switched to look at mates instead since BWA can give false flag combinations for
    # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse
    # does not happen.
    informative_pair = {161: (True, False), 145: (False, True), 129: (True, True), 177: (False, False)}
    with pysam.Samfile(bamfile, "r") as bam_file:  # once real data, change to 'rb', simulated files are on SAM format
        cont_names = bam_file.references
        cont_lengths = [int(nr) for nr in bam_file.lengths]  # convert long to int object

        ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
        ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
        for cont_name, cont_length in zip(cont_names, cont_lengths):
            C = Contig.contig(cont_name)  # Create object contig
            C.length = cont_length
            C.scaf_length = C.length  # Initially, scaffold consists of only this contig
            C.direction = True  # always in same direction first, False=reverse
            C.position = 0  # position always 0
            C.links = {}
            Contigs[C.name] = C  # Create a dict with name as key and the object container as value
            S = Scaffold.scaffold("s" + str(scaffold_indexer), [C], C.length)  # Create object scaffold
            Scaffolds[S.name] = S
            C.scaffold = S.name
            G.add_node((S.name, "L"), length=cont_length)
            G.add_node((S.name, "R"), length=cont_length)
            scaffold_indexer += 1

        # Create "node graph" of contigs. Each having a left and right node
        print("Nr of contigs/scaffolds included in scaffolding: " + str(len(Scaffolds)))

        for scaffold_ in Scaffolds:
            # this is a scaffold object but can be both a single contig or a scaffold.
            G.add_edge((scaffold_, "L"), (scaffold_, "R"), nr_links=None)

        # Create the link edges in the graph by fetching info from bam file
        for alignedread in bam_file:
            flag_type = alignedread.flag
            if flag_type not in informative_pair:
                continue

            contig1 = bam_file.getrname(alignedread.rname)
            contig2 = bam_file.getrname(alignedread.mrnm)
            if contig1 not in Contigs or contig2 not in Contigs:
                continue

            ctg1 = Contigs[contig1]
            ctg2 = Contigs[contig2]
            scaf1 = ctg1.scaffold
            scaf2 = ctg2.scaffold
            if scaf1 == scaf2:
                # Pairs inside one scaffold carry no linking information here.
                continue

            (read_dir, mate_dir) = informative_pair[flag_type]
            # Calculate actual position on scaffold here
            (gap, scaf_side1, scaf_side2) = PosDirCalculatorPE(
                ctg1.direction,
                read_dir,
                ctg1.position,
                alignedread.pos,
                Scaffolds[scaf1].s_length,
                ctg1.length,
                ctg2.direction,
                mate_dir,
                ctg2.position,
                alignedread.mpos,
                Scaffolds[scaf2].s_length,
                ctg2.length,
                read_len,
            )

            node1 = (scaf1, scaf_side1)
            node2 = (scaf2, scaf_side2)
            if node2 not in G[node1]:
                G.add_edge(node2, node1, nr_links=1, gap_dist=[gap])
            else:
                # G[u][v] is the edge attribute dict; unlike G.edge it is
                # available in both networkx 1.x and 2.x.
                link = G[node1][node2]
                link["nr_links"] += 1
                link["gap_dist"].append(gap)

    return (G, Contigs, Scaffolds, F, scaffold_indexer)