コード例 #1
0
ファイル: CreateGraph.py プロジェクト: sidorov-si/BESST
def InitializeObjects(
    bam_file, Contigs, Scaffolds, param, Information, G_prime, small_contigs, small_scaffolds, C_dict
):
    """Build one contig object and one single-contig scaffold per BAM reference.

    The reference names and lengths are taken from ``bam_file.references`` and
    ``bam_file.lengths``.  References that are missing from ``C_dict`` are
    skipped.  A reference whose length is at least ``param.contig_threshold``
    is stored in ``Contigs`` / ``Scaffolds``; a shorter one with positive
    length is stored in ``small_contigs`` / ``small_scaffolds`` and counted as
    singled out.  A shorter reference of length zero creates nothing.

    Side effects:
      * ``param.tot_assembly_length``, ``param.current_N50`` and
        ``param.current_L50`` are set; ``param.scaffold_indexer`` is advanced
        by one per created scaffold.
      * every sequence that is used is removed from ``C_dict``.
      * progress and the final count are printed to ``Information``.

    ``G_prime`` is accepted but not used here.  Returns an empty tuple.
    """
    threshold = param.contig_threshold
    lengths = [int(nr) for nr in bam_file.lengths]  # convert long to int object
    names = bam_file.references

    # Assembly statistics over every reference in the header.
    param.tot_assembly_length = sum(lengths)
    N50, L50 = CalculateStats(sorted(lengths, reverse=True), [], param, Information)
    param.current_L50 = L50
    param.current_N50 = N50

    nr_small = 0
    tick = time()
    for processed, name in enumerate(names, 1):
        if processed % 100000 == 0:
            print >> Information, "Time adding 100k keys", time() - tick
            tick = time()
        if name not in C_dict:
            continue

        length = lengths[processed - 1]
        is_large = length >= threshold
        if not is_large and length <= 0:
            # Below the threshold and empty (e.g. a broken fasta record): ignore.
            continue

        C = Contig.contig(name)
        C.length = length
        C.sequence = C_dict.pop(name)  # hand the sequence over and free the dict entry
        C.direction = True  # always forward at first, False = reverse
        C.position = 0  # a lone contig starts its scaffold

        if is_large:
            contig_store, scaffold_store = Contigs, Scaffolds
        else:
            contig_store, scaffold_store = small_contigs, small_scaffolds

        contig_store[C.name] = C
        # Initially the scaffold consists of this contig only, so it has the same length.
        S = Scaffold.scaffold(param.scaffold_indexer, [C], C.length)
        scaffold_store[S.name] = S
        C.scaffold = S.name
        param.scaffold_indexer += 1
        if not is_large:
            nr_small += 1

    print >> Information, "Nr of contigs that was singeled out due to length constraints " + str(nr_small)
    return ()
コード例 #2
0
def _apply_renames(contig_list, mapping_lines):
    """Rename contigs in place from whitespace-separated ``old_ID new_ID`` lines."""
    for line in mapping_lines:
        fields = line.split()
        if len(fields) < 2:
            # Blank or truncated line: there is nothing to rename.
            continue
        old_ID, new_ID = fields[0], fields[1]
        for contig in contig_list:
            # A truthy result from update_ID is taken to mean "this contig was
            # renamed", so the remaining contigs need not be tried.
            if contig.update_ID(old_ID, new_ID):
                break


def make_contigs(SAG_fasta_file, contig_name_tsv, names_map):
    """Load contigs from a SAG FASTA file and apply two rounds of renaming.

    The FASTA file is read as alternating header / sequence lines.  Each pair
    becomes ``Contig(ID, sequence, SAG)`` where ``ID`` is the header without
    its first and last character, ``sequence`` is the raw following line and
    ``SAG`` is the first three characters of ``contig_name_tsv``.  A final
    header line without a sequence line is ignored.

    Renames are then applied from ``contig_name_tsv`` (anvi-script names ->
    SAG.fasta names) and from ``names_map`` (SAG.fasta names -> JGI contig
    names).  Lines with fewer than two fields are skipped.

    The work is only done when ``SAG_fasta_file[7:10]`` equals
    ``contig_name_tsv[:3]``; otherwise a message is printed and an empty list
    is returned.

    Parameters:
        SAG_fasta_file:  path of the FASTA file.
        contig_name_tsv: path of the first rename table.
        names_map:       path of the second rename table.

    Returns:
        list of Contig objects (empty when the file names do not match).

    Raises:
        IOError: if any of the three files cannot be opened.
    """
    # Bound before the name check so that every path has something to return.
    contig_list = []

    # All three files are opened up front (as before) and closed on every exit path.
    with open(SAG_fasta_file) as SAG_fa_file, \
            open(contig_name_tsv) as contig_names, \
            open(names_map) as names_map_file:

        if SAG_fasta_file[7:10] != contig_name_tsv[:3]:  # check file names
            print("The SAG FASTA file and  contig name .tsv file do not match!")
            return contig_list

        print('filenames checked out')

        SAG = contig_name_tsv[:3]
        SAG_fa_lines = SAG_fa_file.readlines()
        # Pair every header line with the sequence line that follows it.
        for header, sequence in zip(SAG_fa_lines[0::2], SAG_fa_lines[1::2]):
            ID = header[1:-1]  # drop the leading marker and the trailing newline
            contig_list.append(Contig(ID, sequence, SAG))

        # anvi-script -> SAG.fasta names
        _apply_renames(contig_list, contig_names)
        # SAG.fasta names -> JGI contig names
        _apply_renames(contig_list, names_map_file)

    return contig_list
コード例 #3
0
 def AddEdges(Contigs,Scaffolds,bamfile,mean,std_dev,scaffold_indexer,F,read_len):
     """Create contig/scaffold objects and add paired-read link edges to the graph ``G``.

     NOTE(review): the body reads ``bam_file`` and ``G``, neither of which is a
     parameter or a local of this function (the parameter is spelled
     ``bamfile``).  Presumably both are module-level names -- confirm;
     otherwise the first use raises NameError.

     Parameters:
       Contigs, Scaffolds -- dicts filled in place, keyed by contig / scaffold name.
       bamfile            -- not referenced in the body (see the note above).
       mean, std_dev      -- an observation is only used when obs < mean + 6*std_dev.
       scaffold_indexer   -- running number used to build the scaffold names 's<N>'.
       F                  -- not referenced in the body.
       read_len           -- passed through to PosDirCalculatorPE.

     Nothing is returned; the results are the mutations of ``Contigs``,
     ``Scaffolds`` and ``G``.  The last step hands ``G`` and the collected
     ``fishy_edges`` counts to RemoveBugEdges.
     """
     #Clean contig_library
     singeled_out=0  # never updated or read below
     cont_lengths= bam_file.lengths
     cont_lengths=[int(nr) for nr in cont_lengths]  #convert long to int object
     #print cont_lengths
     cont_names = bam_file.references
     ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
     ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
     for i in range(0,len(cont_names)):
         # Only references of at least 300 bases get objects and graph nodes (hard-coded cutoff).
         if cont_lengths[i] >= 300:
             C=Contig.contig(cont_names[i])   # Create object contig
             C.length = cont_lengths[i]
             C.scaf_length = C.length        # Initially, scaffold consists of only this contig
             C.direction = True              # always in same direction first, False=reverse
             C.position = 0                  #position always 0
             C.links = {}
             Contigs[C.name] = C              # Create a dict with name as key and the object container as value
             S=Scaffold.scaffold('s'+str(scaffold_indexer),[C],C.length)  # Create object scaffold
             Scaffolds[S.name]=S
             C.scaffold=S.name
             # One node per scaffold end: 'L' (left) and 'R' (right).
             G.add_node((S.name,'L'),length=cont_lengths[i])
             G.add_node((S.name,'R'),length=cont_lengths[i])
             scaffold_indexer+=1


     #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
     #print 'Nr of contigs/scaffolds included in scaffolding: '+ str(len(Scaffolds))#,Scaffolds.keys()

     for scaffold_ in Scaffolds:
         # The L-R edge of a scaffold is marked with nr_links=None (link edges carry a count).
         G.add_edge((scaffold_,'L'),(scaffold_,'R'),nr_links=None)    #this is a scaffold object but can be both a single contig or a scaffold.


     # Create the link edges in the graph by fetching info from bam file

     # Counts, per directed pair of scaffold ends, the reads flagged as suspicious below.
     fishy_edges = defaultdict(int)
     for alignedread in bam_file:
         try: #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA
             contig1=bam_file.getrname(alignedread.rname)
             contig2=bam_file.getrname(alignedread.mrnm)
         except ValueError:
             # No reference name for the read or its mate: skip the read.
             continue
         if contig1 in Contigs and contig2 in Contigs:
             #TODO: this if-statement is an ad hoc implementation to deal with BWA's buggy SAM-flag reporting
             #if BWA fixes this -> remove this statement. If the links in fishy edges are equal to or more than
             #the links in the graph G or G', the edge will be removed.
             if alignedread.is_unmapped and alignedread.is_read1: # and contig1 != contig2:
                 #Some BWA errors in mappings can still slip through, these edges are characterized by very few links
                 cont_obj1 = Contigs[contig1]
                 scaf_obj1 = Scaffolds[cont_obj1.scaffold]
                 cont_obj2 = Contigs[contig2]
                 scaf_obj2 = Scaffolds[cont_obj2.scaffold]

                 if scaf_obj2.name != scaf_obj1.name:
                     (side1,side2) = CheckDir(cont_obj1,cont_obj2,alignedread)
                     #get scaffold name for contig
                     s1 = Contigs[contig1].scaffold #if contig1 in Contigs else small_contigs[contig1].scaffold
                     s2 = Contigs[contig2].scaffold #if contig2 in Contigs else small_contigs[contig2].scaffold
                     # Recorded in both directions so the lookup order does not matter.
                     fishy_edges[((s1,side1),(s2,side2))] +=1
                     fishy_edges[((s2,side2),(s1,side1))] +=1

             #if contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold:
             # Link evidence: second read of a pair, mapped, mapping quality above 20,
             # read and mate on different contigs.
             if contig1 != contig2 and alignedread.is_read2 and not alignedread.is_unmapped and alignedread.mapq  > 20:
                 (read_dir,mate_dir) = (not alignedread.is_reverse,not alignedread.mate_is_reverse )
                 scaf1=Contigs[contig1].scaffold
                 scaf2=Contigs[contig2].scaffold
                 #Calculate actual position on scaffold here
                 #position1 cont/scaf1
                 cont_dir1 = Contigs[contig1].direction  #if pos : L if neg: R
                 cont1_pos = Contigs[contig1].position
                 readpos = alignedread.pos
                 cont1_len = Contigs[contig1].length
                 s1len = Scaffolds[scaf1].s_length
                 #position2 cont2/scaf2
                 cont_dir2 = Contigs[contig2].direction
                 cont2_pos = Contigs[contig2].position
                 matepos = alignedread.mpos
                 cont2_len = Contigs[contig2].length
                 s2len = Scaffolds[scaf2].s_length
                 (obs,scaf_side1,scaf_side2)=PosDirCalculatorPE(cont_dir1,read_dir,cont1_pos,readpos,s1len,cont1_len,cont_dir2,mate_dir,cont2_pos,matepos,s2len,cont2_len,read_len)
                 # Observations of mean + 6 standard deviations or more are discarded.
                 if obs < mean+ 6*std_dev:
                     if (scaf2,scaf_side2) not in G[(scaf1,scaf_side1)]:
                         # First link between these two scaffold ends: create the edge.
                         G.add_edge((scaf2,scaf_side2),(scaf1,scaf_side1),nr_links=1,gap_dist=[obs])
                     #print 'Added edge'
                     else:
                         # Edge exists: bump the link count and keep every observation.
                         G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['nr_links'] += 1
                         #print 'edge'
                         G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['gap_dist'].append(obs)

         # NOTE(review): this condition is a stricter form of the ``if`` above, so
         # this branch can never run; it is only a placeholder.
         elif contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold:
             ########################Use to validate scaffold from the previous step here
             pass
     RemoveBugEdges(G,fishy_edges)
コード例 #4
0
def PE(Contigs, Scaffolds, F, Information, output_dest, C_dict, param):
    """Build the scaffold graph for one paired-read library from ``param.bamfile``.

    Steps, in order:
      1. GetParams is called on the opened BAM file.
      2. First library (``param.first_lib`` true): a Contig and a single-contig
         Scaffold are created for every reference of at least
         ``param.contig_threshold`` bases; shorter references are appended to
         ``F``.  Later libraries: scaffolds shorter than the threshold are
         written out through GO.WriteToF and removed from ``Scaffolds``.
         In both cases ``param.current_N50`` / ``param.current_L50`` are set.
      3. Every remaining scaffold gets an ('L','R') edge with nr_links=None.
      4. Every read is scanned: aligned bases are accumulated per contig, and
         a second-in-pair read that links two different scaffolds adds to (or
         creates) a link edge carrying ``nr_links`` and a summed ``gap_dist``.
      5. Per-contig coverage is stored on the contig objects and
         ``param.mean_coverage`` / ``param.std_dev_coverage`` are set from
         CalculateMeanCoverage.

    Progress goes to stdout and to ``Information``.

    Returns:
        (G, Contigs, Scaffolds, F, param); ``G`` is None when no scaffold is
        left after the length filter.
    """
    G = nx.Graph()
    print 'Parsing BAM file...'
    #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)}
    #I switched to look at mates instead since BWA can give false flag combinations for
    # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse
    #does not happen.
    #informative_pair={161:(True,False),145:(False,True),129:(True,True),177:(False,False)} #,131:(True,True),179:(False,False)} #147:(False,True),163:(True,False),
    with pysam.Samfile(
            param.bamfile, 'rb'
    ) as bam_file:  # opened with mode 'rb' (binary BAM input)

        #Get parameters -r, -m, -s, -T, -t for library
        print 'Computing parameters not set by user...'
        GetParams(bam_file, param, Scaffolds, C_dict, F, Contigs)

        #Clean contig_library
        singeled_out = 0
        if param.first_lib:
            cont_lengths = bam_file.lengths
            cont_lengths = [int(nr) for nr in cont_lengths
                            ]  #convert long to int object
            cont_names = bam_file.references

            #Calculate NG50 and LG 50
            param.tot_assembly_length = sum(cont_lengths)
            sorted_lengths = sorted(cont_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50

            ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
            ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
            for i in range(0, len(cont_names)):
                if cont_lengths[i] >= param.contig_threshold:
                    C = Contig.contig(cont_names[i])  # Create object contig
                    C.length = cont_lengths[i]
                    scaf_length = C.length  # Initially, scaffold consists of only this contig
                    C.direction = True  # always in same direction first, False=reverse
                    C.position = 0  #position always 0
                    C.links = {}
                    Contigs[
                        C.
                        name] = C  # Create a dict with name as key and the object container as value
                    S = Scaffold.scaffold(param.scaffold_indexer, [C],
                                          scaf_length, {},
                                          {})  # Create object scaffold
                    Scaffolds[S.name] = S
                    C.scaffold = S.name
                    param.scaffold_indexer += 1
                else:
                    # Too short for scaffolding: recorded in F as a one-contig entry.
                    singeled_out += 1
                    F.append([
                        (cont_names[i], True, 0, cont_lengths[i], {})
                    ])  #list of (contig_name, pos_direction, position, length, {})
            print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(
                singeled_out)
        else:
            #Clean contig_library/scaffold_library
            scaf_lengths = [
                Scaffolds[scaffold_].s_length
                for scaffold_ in Scaffolds.keys()
            ]
            sorted_lengths = sorted(scaf_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50
            # Entries are deleted inside this loop; that is safe only because
            # Python 2's keys() returns a separate list of the keys.
            for scaffold_ in Scaffolds.keys(
            ):  #iterate over keys in hash, so that we can remove keys while iterating over it
                if Scaffolds[scaffold_].s_length < param.contig_threshold:
                    ###  Go to function and print to F
                    ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs
                    S_obj = Scaffolds[scaffold_]
                    list_of_contigs = S_obj.contigs  #list of contig objects contained in scaffold object
                    Contigs, F = GO.WriteToF(
                        F, Contigs, list_of_contigs
                    )  # per the original note, WriteToF also removes the contig objects -- not visible here
                    del Scaffolds[scaffold_]
                    singeled_out += 1
            print >> Information, 'Nr of contigs/scaffolds that was singeled out due to length constraints ' + str(
                singeled_out)

        #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
        print 'Nr of contigs/scaffolds included in scaffolding: ' + str(
            len(Scaffolds))  #,Scaffolds.keys()
        if len(Scaffolds) == 0:
            # Nothing to scaffold: return early with no graph.
            return (None, Contigs, Scaffolds, F, param)
        cnt = 0
        tot_start = time()
        start1 = time()
        for scaffold_ in Scaffolds:
            G.add_edge(
                (scaffold_, 'L'), (scaffold_, 'R'), nr_links=None
            )  #this is a scaffold object but can be both a single contig or a scaffold.
            # NOTE(review): these two attributes are reset here, while the read
            # loop below updates ``left_nbrs_obs`` / ``right_nbrs_obs``;
            # presumably those are created by the Scaffold constructor -- confirm.
            Scaffolds[scaffold_].scaffold_left_nbrs = {}
            Scaffolds[scaffold_].scaffold_right_nbrs = {}
            if cnt % 100000 == 0 and cnt > 0:
                elapsed = time() - start1
                print >> Information, 'Total nr of keys added: ', cnt, 'Time for adding last 100 000 keys: ', elapsed
                start1 = time()
            cnt += 1
        print 'Total time elapsed: ', time() - tot_start
        # Create the link edges in the graph by fetching info from bam file

        # contig name -> [aligned bases accumulated so far, contig length]
        cont_aligned_len = {}
        for contig in Contigs:
            cont_aligned_len[contig] = [0, Contigs[contig].length]

        count = 0
        non_unique = 0
        non_unique_for_scaf = 0
        nr_of_duplicates = 0
        # Observations of the previously accepted link, used to spot duplicates.
        prev_obs1 = -1
        prev_obs2 = -1
        reads_with_too_long_insert = 0
        #fishy_reads = {}
        for alignedread in bam_file:
            try:  #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA
                contig1 = bam_file.getrname(alignedread.rname)
                contig2 = bam_file.getrname(alignedread.mrnm)
            except ValueError:
                continue
            #contig1=bam_file.getrname(alignedread.rname)
            ## add to coverage computation if contig is still in the list of considered contigs
            try:
                cont_aligned_len[contig1][0] += alignedread.rlen
            except KeyError:
                # contig1 is not among the contigs being scaffolded.
                pass
            ########## CREATE EDGES IN SCAFFOLD GRAPH ##########

            if contig1 != contig2 and alignedread.is_read2:
                #check how many non unique reads out of the useful ones (mapping to two different contigs)
                #This only works for BWA!! implement for other aligners as well
                if alignedread.mapq == 0:
                    non_unique += 1
                    #print contig1,contig2
                if contig1 in Contigs and contig2 in Contigs and Contigs[
                        contig2].scaffold != Contigs[
                            contig1].scaffold and alignedread.mapq > param.map_quality:  # and alignedread.tags[0][1] == 'U':
                    #if alignedread.tags[0][1] != 'U':
                    #    non_unique_for_scaf += 1
                    # Can only fire when param.map_quality is negative, given the test above.
                    if alignedread.mapq == 0:
                        non_unique_for_scaf += 1
                    count += 1
                    #(read_dir,mate_dir)=informative_pair[flag_type]
                    (read_dir, mate_dir) = (not alignedread.is_reverse,
                                            not alignedread.mate_is_reverse)
                    scaf1 = Contigs[contig1].scaffold
                    scaf2 = Contigs[contig2].scaffold
                    #Calculate actual position on scaffold here
                    #position1 cont/scaf1
                    cont_dir1 = Contigs[
                        contig1].direction  #if pos : L if neg: R
                    cont1_pos = Contigs[contig1].position
                    readpos = alignedread.pos
                    cont1_len = Contigs[contig1].length
                    s1len = Scaffolds[scaf1].s_length
                    #position2 cont2/scaf2
                    cont_dir2 = Contigs[contig2].direction
                    cont2_pos = Contigs[contig2].position
                    matepos = alignedread.mpos
                    cont2_len = Contigs[contig2].length
                    s2len = Scaffolds[scaf2].s_length
                    (obs1, obs2, scaf_side1, scaf_side2) = PosDirCalculatorPE(
                        cont_dir1, read_dir, cont1_pos, readpos, s1len,
                        cont1_len, cont_dir2, mate_dir, cont2_pos, matepos,
                        s2len, cont2_len, param.read_len)
                    # Same pair of observations as the previous accepted read:
                    # counted as a duplicate, and skipped when duplicate
                    # detection is switched on.
                    if obs1 == prev_obs1 and obs2 == prev_obs2:
                        nr_of_duplicates += 1
                        if param.detect_duplicate:
                            continue

                    if obs1 + obs2 < param.ins_size_threshold:
                        #                        if obs1 == 3 or obs2 ==3:
                        #                            print alignedread.pos,alignedread.mpos, contig1, contig2, scaf1, scaf2, s1len,s2len
                        # The four blocks below keep, per scaffold end and per
                        # neighbouring end, the smallest observation seen so far.
                        if scaf_side1 == 'R':
                            if (scaf2, scaf_side2
                                ) in Scaffolds[scaf1].right_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].right_nbrs_obs[(
                                        scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].right_nbrs_obs[(
                                        scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].right_nbrs_obs[(
                                    scaf2, scaf_side2)] = obs1
                        if scaf_side1 == 'L':
                            if (scaf2, scaf_side2
                                ) in Scaffolds[scaf1].left_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].left_nbrs_obs[(
                                        scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].left_nbrs_obs[(
                                        scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].left_nbrs_obs[(
                                    scaf2, scaf_side2)] = obs1
                        if scaf_side2 == 'R':
                            if (scaf1, scaf_side1
                                ) in Scaffolds[scaf2].right_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].right_nbrs_obs[(
                                        scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].right_nbrs_obs[(
                                        scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].right_nbrs_obs[(
                                    scaf1, scaf_side1)] = obs2
                        if scaf_side2 == 'L':
                            if (scaf1, scaf_side1
                                ) in Scaffolds[scaf2].left_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].left_nbrs_obs[(
                                        scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].left_nbrs_obs[(
                                        scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].left_nbrs_obs[(
                                    scaf1, scaf_side1)] = obs2

                        # Link edge: nr_links counts the reads, gap_dist is the
                        # running sum of obs1 + obs2 over those reads.
                        if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]:
                            G.add_edge((scaf2, scaf_side2),
                                       (scaf1, scaf_side1),
                                       nr_links=1,
                                       gap_dist=obs1 + obs2)
                        else:
                            G.edge[(scaf1,
                                    scaf_side1)][(scaf2,
                                                  scaf_side2)]['nr_links'] += 1
                            G.edge[(scaf1, scaf_side1)][(
                                scaf2, scaf_side2)]['gap_dist'] += obs1 + obs2
                    else:
                        reads_with_too_long_insert += 1
                        #fishy_reads[alignedread.qname[:-1]]=[contig2,alignedread.is_read2]
                        ## add to haplotype graph here!!

                    prev_obs1 = obs1
                    prev_obs2 = obs2

                # Reached when the two scaffolds differ but the mapping quality
                # test above failed; currently a placeholder.
                elif contig1 in Contigs and contig2 in Contigs and Contigs[
                        contig2].scaffold != Contigs[contig1].scaffold:
                    ########################Use to validate scaffold in previous step here ############
                    pass


        #        print 'NR OF FISHY EDGES: ', len(fishy_reads)
        print 'USEFUL READS (reads mapping to different contigs): ', count
        #print 'Non unique portion out of "USEFUL READS"  (filtered out from scaffolding): ', non_unique
        #print 'Non unique used for scaf: ', non_unique_for_scaf
        print 'Reads with too large insert size from "USEFUL READS" (filtered out): ', reads_with_too_long_insert
        if param.detect_duplicate:
            print 'Number of duplicated reads indicated and removed: ', nr_of_duplicates

        ##### Calc coverage for all contigs with current lib here #####
        # sum_x, sum_x_sq and n are accumulated below but not read again in this function.
        sum_x = 0
        sum_x_sq = 0
        n = 0
        for contig in cont_aligned_len:
            # aligned bases / contig length
            # NOTE(review): assumes every contig length is non-zero -- confirm.
            cont_coverage = cont_aligned_len[contig][0] / float(
                cont_aligned_len[contig][1])
            #print key, cont_aligned_len[key]/float(cont_lengths[i])
            try:
                Contigs[contig].coverage = cont_coverage
            except KeyError:
                pass
            sum_x += cont_coverage
            sum_x_sq += cont_coverage**2
            n += 1

        mean_cov, std_dev_cov = CalculateMeanCoverage(Contigs, param.first_lib,
                                                      output_dest,
                                                      param.bamfile)
        param.mean_coverage = mean_cov
        param.std_dev_coverage = std_dev_cov

    return (G, Contigs, Scaffolds, F, param)
コード例 #5
0
def InitializeObjects(bam_file, Contigs, Scaffolds, param, Information,
                      G_prime, small_contigs, small_scaffolds, C_dict):
    """Turn the references listed in the BAM header into contig and scaffold objects.

    For every reference name that is also a key of ``C_dict``:
      * length >= ``param.contig_threshold``: the new contig and its
        one-contig scaffold are stored in ``Contigs`` and ``Scaffolds``;
      * otherwise, if the length is positive: they are stored in
        ``small_contigs`` and ``small_scaffolds`` and the reference is counted
        as singled out;
      * otherwise nothing is created.

    The sequence of each created contig is moved out of ``C_dict``.
    ``param`` receives ``tot_assembly_length``, ``current_N50`` and
    ``current_L50``, and ``param.scaffold_indexer`` grows by one per scaffold.
    Timing and the final count are printed to ``Information``.
    ``G_prime`` is not used.  The return value is an empty tuple.
    """

    def _register(name, length, contig_map, scaffold_map):
        # Create the contig together with a scaffold that holds only that contig.
        C = Contig.contig(name)
        C.length = length
        C.sequence = C_dict.pop(name)
        C.direction = True  # forward to begin with, False = reverse
        C.position = 0  # first (and only) contig of its scaffold
        contig_map[C.name] = C
        S = Scaffold.scaffold(param.scaffold_indexer, [C], C.length)
        scaffold_map[S.name] = S
        C.scaffold = S.name
        param.scaffold_indexer += 1

    min_length = param.contig_threshold
    cont_lengths = [int(nr) for nr in bam_file.lengths]  # convert long to int object
    cont_names = bam_file.references

    # N50 / L50 over every reference in the header
    param.tot_assembly_length = sum(cont_lengths)
    by_size = sorted(cont_lengths, reverse=True)
    N50, L50 = CalculateStats(by_size, [], param, Information)
    param.current_L50 = L50
    param.current_N50 = N50

    skipped_small = 0
    lap_start = time()
    for idx in range(len(cont_names)):
        if (idx + 1) % 100000 == 0:
            print >> Information, 'Time adding 100k keys', time() - lap_start
            lap_start = time()

        name = cont_names[idx]
        if name not in C_dict:
            continue

        length = cont_lengths[idx]
        if length >= min_length:
            _register(name, length, Contigs, Scaffolds)
        elif length > 0:  # size 0 can come from an error in the fasta file
            _register(name, length, small_contigs, small_scaffolds)
            skipped_small += 1

    print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(
        skipped_small)
    return ()
コード例 #6
0
ファイル: CreateGraph.py プロジェクト: Freire/gapest
def PE(Contigs, Scaffolds, bamfile, mean, scaffold_indexer, F, read_len):
    """Build the scaffold graph from the alignment file ``bamfile``.

    Every reference in the file header becomes a contig object and a
    single-contig scaffold named 's<N>' (``N`` starts at ``scaffold_indexer``);
    both are stored in ``Contigs`` / ``Scaffolds``.  Each scaffold contributes
    the nodes (name, 'L') and (name, 'R'), joined by an edge with
    nr_links=None.  Reads whose flag is 129, 145, 161 or 177 and whose two
    contigs lie on different scaffolds then add link edges: ``nr_links``
    counts the reads and ``gap_dist`` lists the gaps returned by
    PosDirCalculatorPE.

    ``mean`` and ``F`` are not used in the computation; ``F`` is returned as
    passed in.

    Returns:
        (G, Contigs, Scaffolds, F, scaffold_indexer) with the advanced indexer.
    """
    G = nx.Graph()
    print('Parsing BAM file...')
    # Only the mate's record is looked at, since BWA can give false flag
    # combinations for read-mate when the read is mapped but not the mate
    # (e.g. 97-149, 81-165); the reverse does not happen.
    # flag of a second-in-pair read -> (read is forward, mate is forward)
    informative_pair = {
        129: (True, True),
        145: (False, True),
        161: (True, False),
        177: (False, False),
    }
    # Text mode 'r': simulated files are in SAM format ('rb' would be for BAM).
    with pysam.Samfile(bamfile, 'r') as bam_file:
        lengths = [int(nr) for nr in bam_file.lengths]  # convert long to int object
        names = bam_file.references

        for idx, name in enumerate(names):
            length = lengths[idx]
            C = Contig.contig(name)
            C.length = length
            C.scaf_length = C.length  # the scaffold starts as this contig alone
            C.direction = True  # forward to begin with, False = reverse
            C.position = 0
            C.links = {}
            Contigs[C.name] = C
            S = Scaffold.scaffold('s' + str(scaffold_indexer), [C], C.length)
            Scaffolds[S.name] = S
            C.scaffold = S.name
            for side in ('L', 'R'):
                G.add_node((S.name, side), length=length)
            scaffold_indexer += 1

        print('Nr of contigs/scaffolds included in scaffolding: ' + str(len(Scaffolds)))

        # Join the two ends of every scaffold (single contig or real scaffold).
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None)

        # Link edges from the read pairs.
        for alignedread in bam_file:
            orientation = informative_pair.get(alignedread.flag)
            if orientation is None:
                continue

            contig1 = bam_file.getrname(alignedread.rname)
            contig2 = bam_file.getrname(alignedread.mrnm)
            if contig1 not in Contigs or contig2 not in Contigs:
                continue

            first = Contigs[contig1]
            second = Contigs[contig2]
            if second.scaffold != first.scaffold:
                read_dir, mate_dir = orientation
                scaf1 = first.scaffold
                scaf2 = second.scaffold
                # Position and direction on the scaffolds -> gap estimate and
                # the scaffold end each read points out of.
                gap, scaf_side1, scaf_side2 = PosDirCalculatorPE(
                    first.direction, read_dir, first.position, alignedread.pos,
                    Scaffolds[scaf1].s_length, first.length,
                    second.direction, mate_dir, second.position, alignedread.mpos,
                    Scaffolds[scaf2].s_length, second.length,
                    read_len)

                near = (scaf1, scaf_side1)
                far = (scaf2, scaf_side2)
                if far in G[near]:
                    link = G.edge[near][far]
                    link['nr_links'] += 1
                    link['gap_dist'].append(gap)
                else:
                    G.add_edge(far, near, nr_links=1, gap_dist=[gap])
            # Pairs inside one scaffold could be used to validate the scaffold
            # from the previous step; nothing is done with them yet.

    return (G, Contigs, Scaffolds, F, scaffold_indexer)
コード例 #7
0
ファイル: CreateGraph.py プロジェクト: ksahlin/BESST_RNA
def PE(Contigs, Scaffolds, F, Information, output_dest, C_dict, param):
    """Build the scaffold graph for one paired-end library.

    The alignment file ``param.bamfile`` is opened with pysam and processed in
    four stages:

    1. ``GetParams`` is called to fill in library parameters on ``param``.
    2. Contig/scaffold bookkeeping.  When ``param.first_lib`` is true, every
       reference of length >= ``param.contig_threshold`` gets a new contig
       object wrapped in its own scaffold object (stored in ``Contigs`` and
       ``Scaffolds``); shorter references are appended to ``F``.  Otherwise
       existing scaffolds shorter than the threshold are moved to ``F`` through
       ``GO.WriteToF`` and deleted from ``Scaffolds``.  In both cases
       ``param.current_N50`` / ``param.current_L50`` are refreshed.
    3. Graph construction.  Each scaffold contributes the nodes
       ``(name, 'L')`` and ``(name, 'R')`` joined by an edge with
       ``nr_links=None``.  Read-2 alignments whose read and mate map to contigs
       in different scaffolds, with ``mapq > param.map_quality`` and
       ``obs1 + obs2 < param.ins_size_threshold``, add or update a link edge;
       ``nr_links`` counts the pairs and ``gap_dist`` is the running sum of
       ``obs1 + obs2``.  The smallest observation per neighbour is kept in the
       scaffolds' ``left_nbrs_obs`` / ``right_nbrs_obs`` dictionaries.
    4. Coverage.  For every contig the summed ``rlen`` of reads aligned to it
       is divided by the contig length and stored as ``coverage``;
       ``param.mean_coverage`` and ``param.std_dev_coverage`` are set from
       ``CalculateMeanCoverage``.

    Parameters:
        Contigs: dict, contig name -> contig object (modified in place).
        Scaffolds: dict, scaffold name -> scaffold object (modified in place).
        F: list receiving the sequences that are too short to be scaffolded.
        Information: file-like object that receives progress/log lines.
        output_dest: forwarded to ``CalculateMeanCoverage``.
        C_dict: forwarded to ``GetParams``.
        param: parameter object; read and updated as described above.

    Returns:
        ``(G, Contigs, Scaffolds, F, param)``.  ``G`` is ``None`` when no
        scaffold passes the length filter.
    """
    G = nx.Graph()
    print 'Parsing BAM file...'
    #informative_pair={81:(False,True),97:(True,False),113:(False,False),65:(True,True)}
    #I switched to look at mates instead since BWA can give false flag combinations for
    # read-mate when read is mapped but not mate eg 97-149 81-165. But the reverse
    #does not happen.
    #informative_pair={161:(True,False),145:(False,True),129:(True,True),177:(False,False)} #,131:(True,True),179:(False,False)} #147:(False,True),163:(True,False),
    with pysam.Samfile(param.bamfile, 'rb') as bam_file:    # file is opened in 'rb' mode (the earlier note about switching from 'r' is already applied)

        #Get parameters -r, -m, -s, -T, -t for library
        print 'Computing parameters not set by user...'
        GetParams(bam_file, param, Scaffolds, C_dict, F, Contigs)

        #Clean contig_library
        singeled_out = 0
        if param.first_lib:
            cont_lengths = bam_file.lengths
            cont_lengths = [int(nr) for nr in cont_lengths]  #convert long to int object
            cont_names = bam_file.references

            #Calculate NG50 and LG 50
            param.tot_assembly_length = sum(cont_lengths)
            sorted_lengths = sorted(cont_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50


####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING,
####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
            for i in range(0, len(cont_names)):
                if cont_lengths[i] >= param.contig_threshold:
                    C = Contig.contig(cont_names[i])   # Create object contig
                    C.length = cont_lengths[i]
                    scaf_length = C.length        # Initially, scaffold consists of only this contig
                    C.direction = True              # always in same direction first, False=reverse
                    C.position = 0                  #position always 0
                    C.links = {}
                    Contigs[C.name] = C              # Create a dict with name as key and the object container as value
                    S = Scaffold.scaffold(param.scaffold_indexer, [C], scaf_length, {}, {})  # Create object scaffold
                    Scaffolds[S.name] = S
                    C.scaffold = S.name
                    param.scaffold_indexer += 1
                else:
                    # Too short to scaffold: goes straight to the output list F.
                    singeled_out += 1
                    F.append([(cont_names[i], True, 0, cont_lengths[i], {})])   #list of (contig_name, pos_direction, position,length)
            print >> Information, 'Nr of contigs that was singeled out due to length constraints ' + str(singeled_out)
        else:
                #Clean contig_library/scaffold_library
            scaf_lengths = [Scaffolds[scaffold_].s_length for scaffold_ in Scaffolds.keys()]
            sorted_lengths = sorted(scaf_lengths, reverse=True)
            N50, L50 = CalculateStats(sorted_lengths, param)
            param.current_L50 = L50
            param.current_N50 = N50
            # NOTE(review): deleting inside this loop is only safe because Python 2's
            # dict.keys() returns a list copy; on Python 3 this would need list(...).
            for scaffold_ in Scaffolds.keys(): #iterate over keys in hash, so that we can remove keys while iterating over it
                if Scaffolds[scaffold_].s_length < param.contig_threshold:
                    ###  Go to function and print to F
                    ### Remove Scaf_obj from Scaffolds and Contig_obj from contigs
                    S_obj = Scaffolds[scaffold_]
                    list_of_contigs = S_obj.contigs   #list of contig objects contained in scaffold object
                    Contigs, F = GO.WriteToF(F, Contigs, list_of_contigs)  #Don't worry, the contig objects are removed in WriteTOF function
                    del Scaffolds[scaffold_]
                    singeled_out += 1
            print >> Information, 'Nr of contigs/scaffolds that was singeled out due to length constraints ' + str(singeled_out)


        #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
        print 'Nr of contigs/scaffolds included in scaffolding: ' + str(len(Scaffolds))#,Scaffolds.keys()
        if len(Scaffolds) == 0:
            # Nothing left to scaffold: signal this to the caller with G = None.
            return(None, Contigs, Scaffolds, F, param)
        cnt = 0
        tot_start = time()
        start1 = time()
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_, 'L'), (scaffold_, 'R'), nr_links=None)    #this is a scaffold object but can be both a single contig or a scaffold.
            Scaffolds[ scaffold_ ].scaffold_left_nbrs = {}
            Scaffolds[ scaffold_ ].scaffold_right_nbrs = {}
            if cnt % 100000 == 0 and cnt > 0:
                elapsed = time() - start1
                print >> Information, 'Total nr of keys added: ', cnt, 'Time for adding last 100 000 keys: ', elapsed
                start1 = time()
            cnt += 1
        print 'Total time elapsed: ', time() - tot_start
        # Create the link edges in the graph by fetching info from bam file

        # contig name -> [summed aligned read length, contig length]; used for coverage below
        cont_aligned_len = {}
        for contig in Contigs:
            cont_aligned_len[contig] = [0, Contigs[contig].length]

        count = 0
        non_unique = 0
        non_unique_for_scaf = 0
        nr_of_duplicates = 0
        # Observations of the last pair that was processed; used for duplicate detection.
        prev_obs1 = -1
        prev_obs2 = -1
        reads_with_too_long_insert = 0
        #fishy_reads = {}
        for alignedread in bam_file:
            try: #check that read is aligned OBS: not with is_unmapped since this flag is fishy for e.g. BWA
                contig1 = bam_file.getrname(alignedread.rname)
                contig2 = bam_file.getrname(alignedread.mrnm)
            except ValueError:
                continue
            #contig1=bam_file.getrname(alignedread.rname)
            ## add to coverage computation if contig is still in the list of considered contigs
            try:
                cont_aligned_len[contig1][0] += alignedread.rlen
            except KeyError:
                pass
########## CREATE EDGES IN SCAFFOLD GRAPH ##########

            # Only the second read of a pair is used, so each pair is considered once.
            if contig1 != contig2 and alignedread.is_read2:
                #check how many non unique reads out of the useful ones (mapping to two different contigs)
                #This only works for BWA!! implement for other aligners as well
                if alignedread.mapq == 0:
                    non_unique += 1
                    #print contig1,contig2
                if contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold and alignedread.mapq > param.map_quality: # and alignedread.tags[0][1] == 'U':
                    #if alignedread.tags[0][1] != 'U':
                    #    non_unique_for_scaf += 1
                    # NOTE(review): this counter can only grow if param.map_quality is negative,
                    # because the enclosing condition already requires mapq > param.map_quality.
                    if alignedread.mapq == 0:
                        non_unique_for_scaf += 1
                    count += 1
                    #(read_dir,mate_dir)=informative_pair[flag_type]
                    (read_dir, mate_dir) = (not alignedread.is_reverse, not alignedread.mate_is_reverse)
                    scaf1 = Contigs[contig1].scaffold
                    scaf2 = Contigs[contig2].scaffold
                    #Calculate actual position on scaffold here
                    #position1 cont/scaf1
                    cont_dir1 = Contigs[contig1].direction  #if pos : L if neg: R
                    cont1_pos = Contigs[contig1].position
                    readpos = alignedread.pos
                    cont1_len = Contigs[contig1].length
                    s1len = Scaffolds[scaf1].s_length
                    #position2 cont2/scaf2
                    cont_dir2 = Contigs[contig2].direction
                    cont2_pos = Contigs[contig2].position
                    matepos = alignedread.mpos
                    cont2_len = Contigs[contig2].length
                    s2len = Scaffolds[scaf2].s_length
                    (obs1, obs2, scaf_side1, scaf_side2) = PosDirCalculatorPE(cont_dir1, read_dir, cont1_pos, readpos, s1len, cont1_len, cont_dir2, mate_dir, cont2_pos, matepos, s2len, cont2_len, param.read_len)
                    # Same observation pair as the previously processed pair: counted as a
                    # duplicate, and skipped only when duplicate detection is enabled.
                    if obs1 == prev_obs1 and obs2 == prev_obs2:
                        nr_of_duplicates += 1
                        if param.detect_duplicate:
                            continue

                    if obs1 + obs2 < param.ins_size_threshold:
#                        if obs1 == 3 or obs2 ==3:
#                            print alignedread.pos,alignedread.mpos, contig1, contig2, scaf1, scaf2, s1len,s2len
                        # Keep, per scaffold side and neighbour, the smallest observation seen so far.
                        # NOTE(review): left_nbrs_obs / right_nbrs_obs are not created in this
                        # function; presumably the scaffold constructor sets them up - confirm.
                        if scaf_side1 == 'R':
                            if (scaf2, scaf_side2) in Scaffolds[scaf1].right_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].right_nbrs_obs[(scaf2, scaf_side2)] = obs1
                        if scaf_side1 == 'L':
                            if (scaf2, scaf_side2) in Scaffolds[scaf1].left_nbrs_obs:
                                if obs1 < Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)]:
                                    Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1
                            else:
                                Scaffolds[scaf1].left_nbrs_obs[(scaf2, scaf_side2)] = obs1
                        if scaf_side2 == 'R':
                            if (scaf1, scaf_side1) in Scaffolds[scaf2].right_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].right_nbrs_obs[(scaf1, scaf_side1)] = obs2
                        if scaf_side2 == 'L':
                            if (scaf1, scaf_side1) in Scaffolds[scaf2].left_nbrs_obs:
                                if obs2 < Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)]:
                                    Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2
                            else:
                                Scaffolds[scaf2].left_nbrs_obs[(scaf1, scaf_side1)] = obs2

                        # First link creates the edge; later links bump the count and
                        # accumulate obs1 + obs2 into gap_dist (a running sum, not a list).
                        if (scaf2, scaf_side2) not in G[(scaf1, scaf_side1)]:
                            G.add_edge((scaf2, scaf_side2), (scaf1, scaf_side1), nr_links=1, gap_dist=obs1 + obs2)
                        else:
                            G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['nr_links'] += 1
                            G.edge[(scaf1, scaf_side1)][(scaf2, scaf_side2)]['gap_dist'] += obs1 + obs2
                    else:
                        reads_with_too_long_insert += 1
                        #fishy_reads[alignedread.qname[:-1]]=[contig2,alignedread.is_read2]
                        ## add to haplotype graph here!!

                    prev_obs1 = obs1
                    prev_obs2 = obs2

                elif contig1 in Contigs and contig2 in Contigs and Contigs[contig2].scaffold != Contigs[contig1].scaffold:
########################Use to validate scaffold in previous step here ############
                    # Reached for pairs between different scaffolds whose mapq is not above
                    # param.map_quality; currently a placeholder.
                    pass
#        print 'NR OF FISHY EDGES: ', len(fishy_reads)
        print 'USEFUL READS (reads mapping to different contigs): ', count
    #print 'Non unique portion out of "USEFUL READS"  (filtered out from scaffolding): ', non_unique
        #print 'Non unique used for scaf: ', non_unique_for_scaf
        print 'Reads with too large insert size from "USEFUL READS" (filtered out): ', reads_with_too_long_insert
        if param.detect_duplicate:
            print 'Number of duplicated reads indicated and removed: ', nr_of_duplicates

    ##### Calc coverage for all contigs with current lib here #####
        # NOTE(review): sum_x, sum_x_sq and n are accumulated but never read afterwards
        # in this function; mean/std come from CalculateMeanCoverage below.
        sum_x = 0
        sum_x_sq = 0
        n = 0
        for contig in cont_aligned_len:
            # coverage = summed aligned read length / contig length
            cont_coverage = cont_aligned_len[contig][0] / float(cont_aligned_len[contig][1])
                #print key, cont_aligned_len[key]/float(cont_lengths[i])
            try:
                Contigs[contig].coverage = cont_coverage
            except KeyError:
                pass
            sum_x += cont_coverage
            sum_x_sq += cont_coverage ** 2
            n += 1

        mean_cov, std_dev_cov = CalculateMeanCoverage(Contigs, param.first_lib, output_dest, param.bamfile)
        param.mean_coverage = mean_cov
        param.std_dev_coverage = std_dev_cov


    return(G, Contigs, Scaffolds, F, param)
コード例 #8
0
ファイル: Phenotype.py プロジェクト: JJacobi13/VLPB
 def addContig(self, contigId):
     """Create a contig for ``contigId`` and register it with this phenotype.

     A new ``Contig.Contig`` is built from ``contigId`` and this phenotype
     instance, then appended to ``self.contigs``.
     """
     register = self.contigs.append
     register(Contig.Contig(contigId, self))
コード例 #9
0
    def AddEdges(Contigs,Scaffolds,bamfile,mean,std_dev,scaffold_indexer,F,read_len):
        #Clean contig_library
        bam_object = BamParser(bamfile)
        singeled_out=0
        cont_lengths= bam_object.bam_file.lengths
        cont_lengths=[int(nr) for nr in cont_lengths]  #convert long to int object
        #print cont_lengths
        cont_names = bam_object.bam_file.references
        ####### WHEN ADDING SHORTER CONTIGS NOT INCLUDED IN THE SCAFFOLDING, 
        ####### WE NEED TO ALSO INITIALIZE OBJECTS FOR THESE, THIS SHOULD BE DONE SOMEWHERE HERE
        for i in range(0,len(cont_names)):
            if cont_lengths[i] >= 300:
                C=Contig.contig(cont_names[i])   # Create object contig
                C.length = cont_lengths[i]
                C.scaf_length = C.length        # Initially, scaffold consists of only this contig
                C.direction = True              # always in same direction first, False=reverse
                C.position = 0                  #position always 0
                C.links = {}
                Contigs[C.name] = C              # Create a dict with name as key and the object container as value
                S=Scaffold.scaffold('s'+str(scaffold_indexer),[C],C.length)  # Create object scaffold
                Scaffolds[S.name]=S
                C.scaffold=S.name
                G.add_node((S.name,'L'),length=cont_lengths[i])
                G.add_node((S.name,'R'),length=cont_lengths[i])
                scaffold_indexer+=1
        
        
        #Create "node graph" of contigs (that passed the length criteria). Each having a left and right node
        #print 'Nr of contigs/scaffolds included in scaffolding: '+ str(len(Scaffolds))#,Scaffolds.keys()
        
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_,'L'),(scaffold_,'R'),nr_links=None)    #this is a scaffold object but can be both a single contig or a scaffold.
        
        
        # Create the link edges in the graph by fetching info from bam file


        def nr_softclipps(read):
            max_soft = 0
            for type_,length in read.cigar:
                if type_ == 4 and length >= max_soft:
                    max_soft = length
            return max_soft

        global_max_softclipps = 0
        global_min_obs = 100000 
        links_used = 0
        #r_len = float(read_len)
        for read1,read2 in bam_object.unique_reads_on_different_references():
            contig1=bam_object.bam_file.getrname(read1.rname)
            contig2=bam_object.bam_file.getrname(read2.rname)
            max_soft_readpair = max(nr_softclipps(read1),nr_softclipps(read2))
            if max_soft_readpair > global_max_softclipps:
                global_max_softclipps = max_soft_readpair
            # print read1.cigar
            #if read1.qlen/r_len < 0.7 or read2.qlen/r_len < 0.7:
            #    continue
            #     print 'midddle1',o1, o1+o2, read1.pos, read1.mapq,read1.qlen,read1.rlen, read1.cigar, read1.tags
            # if read2.qlen < 50:
            #     print 'midddle2',o2, o1+o2, read2.pos, read2.mapq, read2.qlen,read2.rlen, read2.cigar, read2.tags
            if contig1 in Contigs and contig2 in Contigs:                
                (read_dir,mate_dir) = (not read1.is_reverse,not read2.is_reverse )
                scaf1=Contigs[contig1].scaffold
                scaf2=Contigs[contig2].scaffold                    
                #Calculate actual position on scaffold here
                #position1 cont/scaf1
                cont_dir1 = Contigs[contig1].direction  #if pos : L if neg: R
                cont1_pos = Contigs[contig1].position
                readpos = read1.pos
                cont1_len = Contigs[contig1].length
                s1len = Scaffolds[scaf1].s_length
                #position1 cont1/scaf1                        
                cont_dir2 = Contigs[contig2].direction
                cont2_pos = Contigs[contig2].position
                matepos = read2.pos
                cont2_len = Contigs[contig2].length
                s2len = Scaffolds[scaf2].s_length 
                (obs,scaf_side1,scaf_side2, (o1,o2))=PosDirCalculatorPE(cont_dir1,read_dir,cont1_pos,readpos,s1len,cont1_len,cont_dir2,mate_dir,cont2_pos,matepos,s2len,cont2_len,read_len) 
                if obs < mean+ 4*std_dev: 
                    links_used += 1
                    if (scaf2,scaf_side2) not in G[(scaf1,scaf_side1)]:
                        G.add_edge((scaf2,scaf_side2),(scaf1,scaf_side1),nr_links=1,gap_dist=[obs],obs_pos=set() )
                        G[(scaf2,scaf_side2)][(scaf1,scaf_side1)]['obs_pos'].add((o1,o2))
                        if o1 < global_min_obs:
                            global_min_obs = o1
                        if o2 < global_min_obs:
                            global_min_obs = o2 
                    #print 'Added edge'
                    else:
                        try:
                            if (o1,o2) in G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos']:
                                continue
                        except KeyError:
                            #print G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]
                            continue

                        # if (o1,o2) in G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos']:
                        #     #print 'detected duplicate'
                        #     continue
                        else:
                            G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['nr_links'] += 1
                            G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['gap_dist'].append(obs)  
                            G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos'].add((o1,o2))  
                            G.edge[(scaf1,scaf_side1)][(scaf2,scaf_side2)]['obs_pos'].add((o2,o1))  

                            if o1 < global_min_obs:
                                global_min_obs = o1
                            if o2 < global_min_obs:
                                global_min_obs = o2
                            # if o1 < 50:
                            #     print o1, o1+o2, read1.pos, read1.mapq,read1.qlen,read1.rlen, read1.cigar, read1.tags
                            #     #print fancy_str(read1)
                            # if o2 < 50:
                            #     print o2, o1+o2, read2.pos, read2.mapq, read2.qlen,read2.rlen, read2.cigar, read2.tags
                            #     #print fancy_str(read2)                                
                                

        print 'Max softclipps:', global_max_softclipps
        print 'Min obs:', global_min_obs
        # sys.exit()
        #print 'Nr links used:', links_used
        return global_max_softclipps
コード例 #10
0
ファイル: CreateGraph.py プロジェクト: Freire/gapest
def PE(Contigs, Scaffolds, bamfile, mean, scaffold_indexer, F, read_len):
    """Build the scaffold link graph from a paired-end alignment file.

    Every reference sequence in ``bamfile`` becomes a contig object wrapped in
    its own single-contig scaffold named ``"s" + index``.  Each scaffold
    contributes two graph nodes, ``(name, "L")`` and ``(name, "R")``, joined by
    an edge with ``nr_links=None``.  Reads whose flag is one of the informative
    mate flags, and whose read and mate lie on contigs of different scaffolds,
    then add or update a link edge that carries ``nr_links`` (number of
    supporting reads) and ``gap_dist`` (list of the gaps returned by
    ``PosDirCalculatorPE``).

    Args:
        Contigs: dict, contig name -> contig object; filled in place.
        Scaffolds: dict, scaffold name -> scaffold object; filled in place.
        bamfile: path of the alignment file, opened in text ("r", SAM) mode.
        mean: unused here; kept so existing callers keep working.
        scaffold_indexer: first free scaffold number; incremented per contig.
        F: passed through unchanged.
        read_len: read length forwarded to ``PosDirCalculatorPE``.

    Returns:
        ``(G, Contigs, Scaffolds, F, scaffold_indexer)`` with the updated
        scaffold counter.
    """
    G = nx.Graph()
    print("Parsing BAM file...")
    # Flags are taken from the second read of each pair (bits 0x1 and 0x80 set);
    # BWA can give inconsistent flag combinations on the first read when its
    # mate is unmapped, but not the other way round.
    # value = (read on forward strand, mate on forward strand)
    informative_pair = {161: (True, False), 145: (False, True), 129: (True, True), 177: (False, False)}
    # "r" reads SAM text (simulated data); switch to "rb" for real BAM input.
    with pysam.Samfile(bamfile, "r") as bam_file:
        cont_lengths = [int(nr) for nr in bam_file.lengths]  # convert long to int object
        cont_names = bam_file.references

        # No length filter here: every reference gets a contig, a scaffold and two nodes.
        for cont_name, cont_length in zip(cont_names, cont_lengths):
            C = Contig.contig(cont_name)
            C.length = cont_length
            C.scaf_length = C.length  # Initially, scaffold consists of only this contig
            C.direction = True  # always in same direction first, False=reverse
            C.position = 0  # position always 0
            C.links = {}
            Contigs[C.name] = C
            S = Scaffold.scaffold("s" + str(scaffold_indexer), [C], C.length)
            Scaffolds[S.name] = S
            C.scaffold = S.name
            G.add_node((S.name, "L"), length=cont_length)
            G.add_node((S.name, "R"), length=cont_length)
            scaffold_indexer += 1

        print("Nr of contigs/scaffolds included in scaffolding: " + str(len(Scaffolds)))

        # Join both ends of every scaffold (a single contig or a real scaffold).
        for scaffold_ in Scaffolds:
            G.add_edge((scaffold_, "L"), (scaffold_, "R"), nr_links=None)

        # Create the link edges in the graph by fetching info from the alignment file.
        for alignedread in bam_file:
            flag_type = alignedread.flag
            if flag_type not in informative_pair:
                continue
            contig1 = bam_file.getrname(alignedread.rname)
            contig2 = bam_file.getrname(alignedread.mrnm)
            if contig1 not in Contigs or contig2 not in Contigs:
                continue
            cont1 = Contigs[contig1]
            cont2 = Contigs[contig2]
            if cont1.scaffold == cont2.scaffold:
                # Pairs inside one scaffold carry no linking information.
                continue

            (read_dir, mate_dir) = informative_pair[flag_type]
            scaf1 = cont1.scaffold
            scaf2 = cont2.scaffold
            # Translate the contig-level alignment positions to scaffold ends.
            (gap, scaf_side1, scaf_side2) = PosDirCalculatorPE(
                cont1.direction,
                read_dir,
                cont1.position,
                alignedread.pos,
                Scaffolds[scaf1].s_length,
                cont1.length,
                cont2.direction,
                mate_dir,
                cont2.position,
                alignedread.mpos,
                Scaffolds[scaf2].s_length,
                cont2.length,
                read_len,
            )
            node1 = (scaf1, scaf_side1)
            node2 = (scaf2, scaf_side2)
            if node2 not in G[node1]:
                G.add_edge(node2, node1, nr_links=1, gap_dist=[gap])
            else:
                # G[u][v] is the edge attribute dict on NetworkX 1.x and 2.x alike;
                # the former G.edge accessor no longer exists in 2.x.
                link = G[node1][node2]
                link["nr_links"] += 1
                link["gap_dist"].append(gap)

    return (G, Contigs, Scaffolds, F, scaffold_indexer)