Beispiel #1
def get_contamination_metrics(largest_contigs_indexes, bam_file, cont_names,
                              param, Information):
    """Estimate paired-end contamination statistics for a mate-pair library.

    Scans ``bam_file`` and, for reads whose ``rname`` is in
    ``largest_contigs_indexes``, records the fragment size of every read
    that ``bam_parser`` reports as aligned in the orientation opposite to
    ``param.orientation`` (outies for 'fr', innies for 'rf').  The sizes are
    trimmed repeatedly with ``AdjustInsertsizeDist`` and the resulting mean,
    standard deviation and contamination ratio are stored on ``param``.

    Args:
        largest_contigs_indexes: container of reference ids to sample from.
        bam_file: iterable of reads exposing ``rname``, ``is_unmapped`` and
            ``tlen``.
        cont_names: unused; kept for interface compatibility.
        param: settings object.  Reads ``orientation``, ``min_mapq``,
            ``read_len``, ``mean_ins_size`` and ``std_dev_ins_size``; writes
            ``contamination_ratio``, ``contamination_mean`` and
            ``contamination_stddev``.
        Information: writable text stream that receives the log lines.

    Returns:
        float: number of contamination observations left after filtering.
    """
    iter_threshold = 1000000  # stop sampling after this many reads
    counter_total = 0
    sample_counter = 0
    contamination_reads = []

    def mean_and_stddev(values):
        # Sample mean and (n - 1)-normalised standard deviation.  Squared
        # deviations are summed directly so the total can never be negative
        # (a negative total would give a complex root on Python 3).
        n_obs = float(len(values))
        mean = sum(values) / n_obs
        return mean, (sum((x - mean)**2 for x in values) / (n_obs - 1))**0.5

    # NOTE(review): presumably iterating bam_file advances a shared file
    # position; confirm that callers rewind it before calling this function.
    for read in bam_file:
        # only sample reads on the selected (largest) contigs
        if read.rname in largest_contigs_indexes:
            sample_counter += 1
            if not read.is_unmapped:
                counter_total += 1

            # contamination reads (mapped in reverse complemented orientation)
            if param.orientation == 'fr' and bam_parser.is_proper_aligned_unique_outie(
                    read, param.min_mapq):
                frag_size = abs(read.tlen) + 2 * param.read_len
                if param.read_len < frag_size:
                    contamination_reads.append(frag_size)

            if param.orientation == 'rf' and bam_parser.is_proper_aligned_unique_innie(
                    read, param.min_mapq):
                frag_size = abs(read.tlen)
                if param.read_len < frag_size:
                    contamination_reads.append(frag_size)

            if sample_counter >= iter_threshold:
                break

    ## SMOOTH OUT contamine distribution here by removing extreme observations##
    n_contamine = float(len(contamination_reads))
    mean_isize = 0
    std_dev_isize = 0
    if n_contamine > 2:
        mean_isize, std_dev_isize = mean_and_stddev(contamination_reads)
        print('Contamine mean before filtering :', mean_isize,
              file=Information)
        print('Contamine stddev before filtering: ', std_dev_isize,
              file=Information)
        extreme_obs_occur = True
        while extreme_obs_occur:
            extreme_obs_occur, filtered_list = AdjustInsertsizeDist(
                param, mean_isize, std_dev_isize, contamination_reads)
            n_contamine = float(len(filtered_list))
            if n_contamine <= 2:
                # Too few observations to re-estimate; the inputs to the
                # next round would be unchanged, so stop instead of looping.
                break
            mean_isize, std_dev_isize = mean_and_stddev(filtered_list)
            contamination_reads = filtered_list
        print('Contamine mean converged:', mean_isize, file=Information)
        print('Contamine std_est converged: ', std_dev_isize, file=Information)

    if counter_total > 0:
        contamination_ratio = 2 * n_contamine / float(counter_total)
    else:
        contamination_ratio = 0

    if (mean_isize >= param.mean_ins_size
            or std_dev_isize >= param.std_dev_ins_size
            or contamination_ratio <= 0.05):
        # Either the contamine mean or stddev is at least that of the MP
        # library, which means spurious alignments or some other artifact
        # rather than true PE contamination as described in the Illumina MP
        # construction protocol, or at most 5% of the reads are contamine.
        # Such a small fraction is ignored because handling it adds
        # complexity when orienting scaffolds in the pathfinder module.
        param.contamination_ratio = False
        param.contamination_mean = 0
        param.contamination_stddev = 0
    else:
        param.contamination_mean = mean_isize
        param.contamination_stddev = std_dev_isize
        param.contamination_ratio = contamination_ratio

    return n_contamine
Beispiel #2
def get_contamination_metrics(largest_contigs_indexes, bam_file, cont_names, param, Information):
    """Estimate paired-end contamination statistics for a mate-pair library.

    Scans ``bam_file`` and, for reads whose ``rname`` is in
    ``largest_contigs_indexes``, records the fragment size of every read
    that ``bam_parser`` reports as aligned in the orientation opposite to
    ``param.orientation`` (outies for 'fr', innies for 'rf').  The sizes are
    trimmed repeatedly with ``AdjustInsertsizeDist`` and the resulting mean,
    standard deviation and contamination ratio are stored on ``param``.

    Args:
        largest_contigs_indexes: container of reference ids to sample from.
        bam_file: iterable of reads exposing ``rname``, ``is_unmapped`` and
            ``tlen``.
        cont_names: unused; kept for interface compatibility.
        param: settings object.  Reads ``orientation``, ``read_len``,
            ``mean_ins_size`` and ``std_dev_ins_size``; writes
            ``contamination_ratio``, ``contamination_mean`` and
            ``contamination_stddev``.
        Information: writable text stream that receives the log lines.

    Returns:
        float: number of contamination observations left after filtering.
    """
    iter_threshold = 1000000  # stop sampling after this many reads
    counter_total = 0
    sample_counter = 0
    contamination_reads = []

    def mean_and_stddev(values):
        # Sample mean and (n - 1)-normalised standard deviation.  Squared
        # deviations are summed directly so the total can never be negative.
        n_obs = float(len(values))
        mean = sum(values) / n_obs
        return mean, (sum((x - mean) ** 2 for x in values) / (n_obs - 1)) ** 0.5

    # NOTE(review): presumably iterating bam_file advances a shared file
    # position; confirm that callers rewind it before calling this function.
    for read in bam_file:
        # only sample reads on the selected (largest) contigs
        if read.rname in largest_contigs_indexes:
            sample_counter += 1
            if not read.is_unmapped:
                counter_total += 1

            # contamination reads (mapped in reverse complemented orientation)
            if param.orientation == 'fr' and bam_parser.is_proper_aligned_unique_outie(read):
                frag_size = abs(read.tlen) + 2 * param.read_len
                if param.read_len < frag_size:
                    contamination_reads.append(frag_size)

            if param.orientation == 'rf' and bam_parser.is_proper_aligned_unique_innie(read):
                frag_size = abs(read.tlen)
                if param.read_len < frag_size:
                    contamination_reads.append(frag_size)

            if sample_counter >= iter_threshold:
                break

    ## SMOOTH OUT contamine distribution here by removing extreme observations##
    n_contamine = float(len(contamination_reads))
    mean_isize = 0
    std_dev_isize = 0
    if n_contamine > 2:
        mean_isize, std_dev_isize = mean_and_stddev(contamination_reads)
        print('Contamine mean before filtering :', mean_isize, file=Information)
        print('Contamine stddev before filtering: ', std_dev_isize, file=Information)
        extreme_obs_occur = True
        while extreme_obs_occur:
            extreme_obs_occur, filtered_list = AdjustInsertsizeDist(param, mean_isize, std_dev_isize, contamination_reads)
            n_contamine = float(len(filtered_list))
            if n_contamine <= 2:
                # Too few observations to re-estimate; the inputs to the
                # next round would be unchanged, so stop instead of looping.
                break
            mean_isize, std_dev_isize = mean_and_stddev(filtered_list)
            contamination_reads = filtered_list
        print('Contamine mean converged:', mean_isize, file=Information)
        print('Contamine std_est converged: ', std_dev_isize, file=Information)

    # Guard against a sample without any mapped read on the chosen contigs,
    # which would otherwise divide by zero.
    if counter_total > 0:
        contamination_ratio = 2 * n_contamine / float(counter_total)
    else:
        contamination_ratio = 0

    if mean_isize >= param.mean_ins_size or std_dev_isize >= param.std_dev_ins_size or contamination_ratio <= 0.05:
        # Either the contamine mean or stddev is at least that of the MP
        # library, which means spurious alignments or some other artifact
        # rather than true PE contamination as described in the Illumina MP
        # construction protocol, or at most 5% of the reads are contamine.
        # Such a small fraction is ignored because handling it adds
        # complexity when orienting scaffolds in the pathfinder module.
        param.contamination_ratio = False
        param.contamination_mean = 0
        param.contamination_stddev = 0
    else:
        param.contamination_mean = mean_isize
        param.contamination_stddev = std_dev_isize
        param.contamination_ratio = contamination_ratio

    return n_contamine
Beispiel #3
def get_metrics(bam_file, param, Information):
    """Fill in the library statistics on ``param`` and log them.

    Depending on which values the user already supplied, this estimates the
    read length, the insert-size mean / standard deviation (including a
    ``getdistr``-adjusted distribution and an optional lognormal fit), the
    insert-size and contig thresholds, and finally the paired-end
    contamination via ``get_contamination_metrics``.

    Args:
        bam_file: alignment file exposing ``references``, ``lengths`` and
            ``fetch`` and iterable over reads.
        param: settings object that is read and updated in place.
        Information: writable text stream that receives the log lines.

    Returns:
        An empty tuple.
    """
    cont_names = bam_file.references
    cont_lengths_list = list(bam_file.lengths)
    # indexes of the 1000 longest contigs
    largest_contigs_indexes = set(
        nlargest(1000, range(len(cont_lengths_list)),
                 key=lambda i: cont_lengths_list[i]))

    param.lognormal = False  # default; may change below depending on skew

    # NOTE(review): the statement guarded by this try was missing from the
    # reviewed text; a fetch on the first contig is assumed -- confirm.
    try:
        bam_file.fetch(cont_names[0])
    except ValueError:
        sys.stderr.write(
            'Need indexed bamfiles, index file should be located in the same directory as the BAM file\nterminating..\n'
        )
        sys.exit(0)

    def mean_and_stddev(values):
        # Sample mean and (n - 1)-normalised standard deviation.  Squared
        # deviations are summed directly so the total can never be negative.
        n_obs = float(len(values))
        mean = sum(values) / n_obs
        return n_obs, mean, (sum(
            (x - mean)**2 for x in values) / (n_obs - 1))**0.5

    def set_default_thresholds():
        # Derive -T and the contig threshold from the library mean / stddev.
        param.ins_size_threshold = param.mean_ins_size + 6 * param.std_dev_ins_size
        if param.extend_paths:
            param.contig_threshold = param.mean_ins_size + 4 * param.std_dev_ins_size
        else:
            param.contig_threshold = param.mean_ins_size + (
                param.std_dev_ins_size /
                float(param.mean_ins_size)) * param.std_dev_ins_size

    if not param.read_len:  # user has not specified read len
        nr_reads = 0
        tot_read_len = 0
        for read in bam_file:
            if read.rlen != 0:
                tot_read_len += read.rlen
            else:
                tot_read_len += read.alen
            nr_reads += 1
            if nr_reads >= 1000:
                param.read_len = tot_read_len / float(nr_reads)
                break
        else:
            # the file ran out before enough reads were seen
            sys.stderr.write('Did not get sufficient readmappings to calculate\
             read_length from mappings. Got {0} mappings. Please provide this parameter or more importantly\
             check why almost no reads are mapping to the contigs.\nterminating..\n'.format(nr_reads))
            sys.exit(0)
        # NOTE(review): rewind so the next pass sees the whole file again;
        # assumes bam_file offers reset() (pysam AlignmentFile) -- confirm.
        bam_file.reset()

    if param.mean_ins_size and param.std_dev_ins_size and not param.ins_size_threshold:
        # user has specified mean and std dev but no thresholds
        set_default_thresholds()

    if not param.mean_ins_size:  # user has not specified mean and std dev. (and no thresholds)
        counter = 1
        ins_size_reads = []
        for read in bam_file:
            if param.orientation == 'fr' and bam_parser.is_proper_aligned_unique_innie(
                    read, param.min_mapq):
                if read.rname in largest_contigs_indexes:
                    ins_size_reads.append(abs(read.tlen))
                    counter += 1
            if param.orientation == 'rf' and bam_parser.is_proper_aligned_unique_outie(
                    read, param.min_mapq):
                if read.rname in largest_contigs_indexes:
                    ins_size_reads.append(abs(read.tlen) + 2 * param.read_len)
                    counter += 1
            if counter > 1000000:
                break
        # NOTE(review): see the rewind note above.
        bam_file.reset()

        # NOTE(review): the destination of the first message was not visible
        # in the reviewed text; stdout is assumed -- confirm.
        print(
            "Estimating insert size from {0} mappings with quality over --min_mapq {1}."
            .format(counter, param.min_mapq))
        print(
            "Estimating insert size from {0} mappings with quality over --min_mapq {1}."
            .format(counter, param.min_mapq),
            file=Information)

        # Assure that there were enough reads for computation of mean and variance
        if len(ins_size_reads) <= 1000:
            sys.stderr.write(
                'To few valid read alignments exists to compute mean and variance of library (need at least 1000 observations). Got only '
                + str(len(ins_size_reads)) +
                ' valid alignments. Please specify -m and -s to the program. \nPrinting out scaffolds produced in earlier steps...'
            )
            # NOTE(review): the statement ending this branch was missing
            # from the reviewed text; an early return is assumed -- confirm.
            return ()

        ## SMOOTH OUT THE MEAN HERE by removing extreme observations##
        n, mean_isize, std_dev_isize = mean_and_stddev(ins_size_reads)
        print('Mean before filtering :', mean_isize, file=Information)
        print('Std_est  before filtering: ', std_dev_isize, file=Information)
        extreme_obs_occur = True
        while extreme_obs_occur:
            extreme_obs_occur, filtered_list = AdjustInsertsizeDist(
                param, mean_isize, std_dev_isize, ins_size_reads)
            n, mean_isize, std_dev_isize = mean_and_stddev(filtered_list)
            ins_size_reads = filtered_list

        print('Mean converged:', mean_isize, file=Information)
        print('Std_est converged: ', std_dev_isize, file=Information)

        param.mean_ins_size = mean_isize
        param.std_dev_ins_size = std_dev_isize

        m_3 = sum((x - mean_isize)**3 for x in ins_size_reads) / n
        param.skewness = m_3 / std_dev_isize**3
        print('Skewness of distribution: ', param.skewness, file=Information)

        # weight each observation with how likely it is to see it
        adj_distr, mu_adj, sigma_adj, skew_adj, median_adj, mode_adj = getdistr(
            ins_size_reads, cont_lengths_list, param, Information)
        param.skew_adj = skew_adj
        param.empirical_distribution = dict(enumerate(adj_distr))
        print('Mean of getdistr adjusted distribution: ', mu_adj,
              file=Information)
        print('Sigma of getdistr adjusted distribution: ', sigma_adj,
              file=Information)
        print('Skewness of getdistr adjusted distribution: ', skew_adj,
              file=Information)
        print('Median of getdistr adjusted distribution: ', median_adj,
              file=Information)
        print('Mode of getdistr adjusted distribution: ', mode_adj,
              file=Information)
        print(
            'Using mean and stddev of getdistr adjusted distribution from here: ',
            mu_adj, sigma_adj, file=Information)
        param.mean_ins_size = mu_adj
        param.std_dev_ins_size = sigma_adj

        # If the adjusted skewness is positive and larger than 0.5 (big
        # enough to have an impact) and the median exceeds the mode, fit a
        # lognormal distribution from the getdistr-adjusted median and mode.
        if param.skew_adj > 0.5 and math.log(median_adj) > math.log(mode_adj):
            print('Mode on getdistr adjusted: ', mode_adj, file=Information)
            print("Median on getdistr adjusted:", median_adj, file=Information)
            print("mode adj:", mode_adj)
            print("median adj", median_adj)
            param.lognormal_mean = math.log(median_adj)
            param.lognormal_sigma = math.sqrt(param.lognormal_mean -
                                              math.log(mode_adj))
            print('Lognormal mean getdistr adjusted: ', param.lognormal_mean,
                  file=Information)
            print("Lognormal stddev getdistr adjusted", param.lognormal_sigma,
                  file=Information)

            param.lognormal = True

        # TODO: calculate skew of contamination distribution

    if not param.ins_size_threshold:
        set_default_thresholds()

    ## finally, get a reverse complemented read contamination distribution from the MP library if it exists
    n_contamine = get_contamination_metrics(largest_contigs_indexes, bam_file,
                                            cont_names, param, Information)

    print('', file=Information)
    print('LIBRARY STATISTICS', file=Information)
    print('Mean of library set to:', param.mean_ins_size, file=Information)
    print('Standard deviation of library set to: ', param.std_dev_ins_size,
          file=Information)
    print('MP library PE contamination:', file=Information)
    print('Contamine rate (rev comp oriented) estimated to: ',
          param.contamination_ratio, file=Information)
    print('lib contamine mean (avg fragmentation size): ',
          param.contamination_mean, file=Information)
    print('lib contamine stddev: ', param.contamination_stddev,
          file=Information)
    print('Number of contamined reads used for this calculation: ',
          n_contamine, file=Information)

    print('-T (library insert size threshold) set to: ',
          param.ins_size_threshold, file=Information)
    print('-k set to (Scaffolding with contigs larger than): ',
          param.contig_threshold, file=Information)
    print('Number of links required to create an edge: ', param.edgesupport,
          file=Information)
    print(
        'Maximum identical contig-end overlap-length to merge of contigs that are adjacent in a scaffold: ',
        param.max_contig_overlap, file=Information)
    print('Read length set to: ', param.read_len, file=Information)
    print('', file=Information)
    return ()
Beispiel #4
def get_metrics(bam_file, param, Information):
    """Fill in the library statistics on ``param`` and log them.

    Depending on which values the user already supplied, this estimates the
    read length, the insert-size mean / standard deviation (including a
    ``getdistr``-adjusted distribution and an optional lognormal fit), the
    insert-size and contig thresholds, and finally the paired-end
    contamination via ``get_contamination_metrics``.

    Args:
        bam_file: alignment file exposing ``references``, ``lengths`` and
            ``fetch`` and iterable over reads.
        param: settings object that is read and updated in place.
        Information: writable text stream that receives the log lines.
    """
    cont_names = bam_file.references
    cont_lengths_list = list(bam_file.lengths)
    # indexes of the 1000 longest contigs
    largest_contigs_indexes = set(nlargest(1000, range(len(cont_lengths_list)), key=lambda i: cont_lengths_list[i]))

    param.lognormal = False  # default; may change below depending on skew

    # NOTE(review): the statement guarded by this try was missing from the
    # reviewed text; a fetch on the first contig is assumed -- confirm.
    try:
        bam_file.fetch(cont_names[0])
    except ValueError:
        sys.stderr.write('Need indexed bamfiles, index file should be located in the same directory as the BAM file\nterminating..\n')
        sys.exit(0)

    def mean_and_stddev(values):
        # Sample mean and (n - 1)-normalised standard deviation.  Squared
        # deviations are summed directly so the total can never be negative.
        n_obs = float(len(values))
        mean = sum(values) / n_obs
        return n_obs, mean, (sum((x - mean) ** 2 for x in values) / (n_obs - 1)) ** 0.5

    def set_default_thresholds():
        # Derive -T and the contig threshold from the library mean / stddev.
        param.ins_size_threshold = param.mean_ins_size + 6 * param.std_dev_ins_size
        if param.extend_paths:
            param.contig_threshold = param.mean_ins_size + 4 * param.std_dev_ins_size
        else:
            param.contig_threshold = param.mean_ins_size + (param.std_dev_ins_size / float(param.mean_ins_size)) * param.std_dev_ins_size

    if not param.read_len:  # user has not specified read len
        nr_reads = 0
        tot_read_len = 0
        for read in bam_file:
            if read.rlen != 0:
                tot_read_len += read.rlen
            else:
                tot_read_len += read.alen
            nr_reads += 1
            if nr_reads >= 100:
                param.read_len = tot_read_len / float(nr_reads)
                break
        else:
            # the file ran out before enough reads were seen
            sys.stderr.write('Did not get sufficient readmappings to calculate\
             read_length from mappings. Got {0} mappings. Please provide this parameter or more importantly\
             check why almost no reads are mapping to the contigs.\nterminating..\n'.format(nr_reads))
            sys.exit(0)
        # NOTE(review): rewind so the next pass sees the whole file again;
        # assumes bam_file offers reset() (pysam AlignmentFile) -- confirm.
        bam_file.reset()

    if param.mean_ins_size and param.std_dev_ins_size and not param.ins_size_threshold:
        # user has specified mean and std dev but no thresholds
        set_default_thresholds()
        print('-T', param.ins_size_threshold, '-t', param.contig_threshold, file=Information)

    if not param.mean_ins_size:  # user has not specified mean and std dev. (and no thresholds)
        counter = 1
        ins_size_reads = []
        for read in bam_file:
            if param.orientation == 'fr' and bam_parser.is_proper_aligned_unique_innie(read):
                if read.rname in largest_contigs_indexes:
                    ins_size_reads.append(abs(read.tlen))
                    counter += 1
            if param.orientation == 'rf' and bam_parser.is_proper_aligned_unique_outie(read):
                if read.rname in largest_contigs_indexes:
                    ins_size_reads.append(abs(read.tlen) + 2 * param.read_len)
                    counter += 1
            if counter > 1000000:
                break
        # NOTE(review): see the rewind note above.
        bam_file.reset()

        # Assure that there were enough reads for computation of mean and variance
        if len(ins_size_reads) <= 1000:
            sys.stderr.write('To few valid read alignments exists to compute mean and variance of library (need at least 1000 observations). Got only ' + str(len(ins_size_reads)) + ' valid alignments. Please specify -m and -s to the program. \nPrinting out scaffolds produced in earlier steps...')
            # NOTE(review): the statement ending this branch was missing
            # from the reviewed text; an early return is assumed -- confirm.
            return

        ## SMOOTH OUT THE MEAN HERE by removing extreme observations##
        n, mean_isize, std_dev_isize = mean_and_stddev(ins_size_reads)
        print('Mean before filtering :', mean_isize, file=Information)
        print('Std_est  before filtering: ', std_dev_isize, file=Information)
        extreme_obs_occur = True
        while extreme_obs_occur:
            extreme_obs_occur, filtered_list = AdjustInsertsizeDist(param, mean_isize, std_dev_isize, ins_size_reads)
            n, mean_isize, std_dev_isize = mean_and_stddev(filtered_list)
            ins_size_reads = filtered_list

        print('Mean converged:', mean_isize, file=Information)
        print('Std_est converged: ', std_dev_isize, file=Information)

        param.mean_ins_size = mean_isize
        param.std_dev_ins_size = std_dev_isize

        m_3 = sum((x - mean_isize) ** 3 for x in ins_size_reads) / n
        param.skewness = m_3 / std_dev_isize ** 3
        print('Skewness of distribution: ', param.skewness, file=Information)

        # weight each observation with how likely it is to see it
        adj_distr, mu_adj, sigma_adj, skew_adj, median_adj, mode_adj = getdistr(ins_size_reads, cont_lengths_list, param, Information)
        param.skew_adj = skew_adj
        param.empirical_distribution = dict(enumerate(adj_distr))
        print('Mean of getdistr adjusted distribution: ', mu_adj, file=Information)
        print('Sigma of getdistr adjusted distribution: ', sigma_adj, file=Information)
        print('Skewness of getdistr adjusted distribution: ', skew_adj, file=Information)
        print('Median of getdistr adjusted distribution: ', median_adj, file=Information)
        print('Mode of getdistr adjusted distribution: ', mode_adj, file=Information)
        print('Using mean and stddev of getdistr adjusted distribution from here: ', mu_adj, sigma_adj, file=Information)
        param.mean_ins_size = mu_adj
        param.std_dev_ins_size = sigma_adj

        # If the adjusted skewness is positive and larger than 0.5 (big
        # enough to have an impact) and the median exceeds the mode, fit a
        # lognormal distribution from the getdistr-adjusted median and mode.
        if param.skew_adj > 0.5 and math.log(median_adj) > math.log(mode_adj):
            print('Mode on getdistr adjusted: ', mode_adj, file=Information)
            print("Median on getdistr adjusted:", median_adj, file=Information)
            print("mode adj:", mode_adj)
            print("median adj", median_adj)
            param.lognormal_mean = math.log(median_adj)
            param.lognormal_sigma = math.sqrt(param.lognormal_mean - math.log(mode_adj))
            print('Lognormal mean getdistr adjusted: ', param.lognormal_mean, file=Information)
            print("Lognormal stddev getdistr adjusted", param.lognormal_sigma, file=Information)

            param.lognormal = True

        # TODO: calculate skew of contamination distribution

    if not param.ins_size_threshold:
        set_default_thresholds()

    ## finally, get a reverse complemented read contamination distribution from the MP library if it exists
    n_contamine = get_contamination_metrics(largest_contigs_indexes, bam_file, cont_names, param, Information)

    print('', file=Information)
    print('LIBRARY STATISTICS', file=Information)
    print('Mean of library set to:', param.mean_ins_size, file=Information)
    print('Standard deviation of library set to: ', param.std_dev_ins_size, file=Information)
    print('MP library PE contamination:', file=Information)
    print('Contamine rate (rev comp oriented) estimated to: ', param.contamination_ratio, file=Information)
    print('lib contamine mean (avg fragmentation size): ', param.contamination_mean, file=Information)
    print('lib contamine stddev: ', param.contamination_stddev, file=Information)
    print('Number of contamined reads used for this calculation: ', n_contamine, file=Information)

    print('-T (library insert size threshold) set to: ', param.ins_size_threshold, file=Information)
    print('-k set to (Scaffolding with contigs larger than): ', param.contig_threshold, file=Information)
    print('Number of links required to create an edge: ', param.edgesupport, file=Information)
    print('Maximum identical contig-end overlap-length to merge of contigs that are adjacent in a scaffold: ', param.max_contig_overlap, file=Information)
    print('Read length set to: ', param.read_len, file=Information)
    print('', file=Information)
Beispiel #5
def get_contamination_metrics(largest_contigs_indexes, bam_file, cont_names,
                              param, Information):
    """Estimate paired-end contamination statistics for a mate-pair library.

    Scans ``bam_file`` and, for reads whose ``rname`` is in
    ``largest_contigs_indexes``, records the fragment size of every read
    that ``bam_parser`` reports as aligned in the orientation opposite to
    ``param.orientation`` (outies for 'fr', innies for 'rf').  The sizes are
    trimmed repeatedly with ``AdjustInsertsizeDist`` and the resulting mean,
    standard deviation and contamination ratio are stored on ``param``.

    Args:
        largest_contigs_indexes: container of reference ids to sample from.
        bam_file: iterable of reads exposing ``rname``, ``is_unmapped`` and
            ``tlen``.
        cont_names: unused; kept for interface compatibility.
        param: settings object.  Reads ``orientation``, ``read_len``,
            ``mean_ins_size`` and ``std_dev_ins_size``; writes
            ``contamination_ratio``, ``contamination_mean`` and
            ``contamination_stddev``.
        Information: writable text stream that receives the log lines.

    Returns:
        float: number of contamination observations left after filtering.
    """
    iter_threshold = 1000000  # stop sampling after this many reads
    counter_total = 0
    counter = 0
    contamination_reads = []

    def mean_and_stddev(values):
        # Sample mean and (n - 1)-normalised standard deviation.  Squared
        # deviations are summed directly so the total can never be negative.
        n_obs = float(len(values))
        mean = sum(values) / n_obs
        return mean, (sum((x - mean)**2 for x in values) / (n_obs - 1))**0.5

    # NOTE(review): presumably iterating bam_file advances a shared file
    # position; confirm that callers rewind it before calling this function.
    for read in bam_file:
        # only sample reads on the selected (largest) contigs
        if read.rname in largest_contigs_indexes:
            counter += 1
            if not read.is_unmapped:
                counter_total += 1

            # contamination reads (mapped in reverse complemented orientation)
            # NOTE(review): the argument lists of the two bam_parser calls
            # were missing from the reviewed text; ``read`` alone is assumed.
            if param.orientation == 'fr' and bam_parser.is_proper_aligned_unique_outie(
                    read):
                contamination_reads.append(
                    abs(read.tlen) + 2 * param.read_len)
            if param.orientation == 'rf' and bam_parser.is_proper_aligned_unique_innie(
                    read):
                contamination_reads.append(abs(read.tlen))
            if counter >= iter_threshold:
                break

    ## SMOOTH OUT contamine distribution here by removing extreme observations##
    n_contamine = float(len(contamination_reads))
    mean_isize = 0
    std_dev_isize = 0
    if n_contamine > 2:
        mean_isize, std_dev_isize = mean_and_stddev(contamination_reads)
        print('Contamine mean before filtering :', mean_isize,
              file=Information)
        print('Contamine stddev before filtering: ', std_dev_isize,
              file=Information)
        extreme_obs_occur = True
        while extreme_obs_occur:
            extreme_obs_occur, filtered_list = AdjustInsertsizeDist(
                mean_isize, std_dev_isize, contamination_reads)
            n_contamine = float(len(filtered_list))
            if n_contamine <= 2:
                # Too few observations to re-estimate; the inputs to the
                # next round would be unchanged, so stop instead of looping.
                break
            mean_isize, std_dev_isize = mean_and_stddev(filtered_list)
            contamination_reads = filtered_list
        print('Contamine mean converged:', mean_isize, file=Information)
        print('Contamine std_est converged: ', std_dev_isize, file=Information)

    if (mean_isize >= param.mean_ins_size
            or std_dev_isize >= param.std_dev_ins_size
            or n_contamine <= 1000):
        # Either the contamine mean or stddev is at least that of the MP
        # library, which means spurious alignments and no true contamine, or
        # at most 1000 contamine observations were found.  They are then
        # ignored because handling them adds complexity when orienting
        # scaffolds in the pathfinder module.
        param.contamination_ratio = False
        param.contamination_mean = 0
        param.contamination_stddev = 0
    else:
        param.contamination_mean = mean_isize
        param.contamination_stddev = std_dev_isize
        param.contamination_ratio = 2 * n_contamine / float(counter_total)

    return n_contamine
Beispiel #6
def get_metrics(bam_file, param, Information):
    """Fill in the library statistics on ``param`` that the user left unset.

    Depending on which attributes of ``param`` are falsy, this estimates the
    read length, the insert-size mean / standard deviation and the
    ``ins_size_threshold`` / ``contig_threshold`` cut-offs from the reads in
    ``bam_file``.  It then calls ``get_contamination_metrics`` and writes a
    summary of all values to ``Information``.

    Args:
        bam_file: alignment handle providing ``references``, ``lengths``,
            ``fetch()`` and iteration over reads (presumably a pysam
            alignment file -- confirm against the caller).
        param: parameter object.  ``orientation``, ``extend_paths``,
            ``edgesupport`` and ``rel_weight`` are read; ``read_len``,
            ``mean_ins_size``, ``std_dev_ins_size``, ``ins_size_threshold``
            and ``contig_threshold`` are read and set when missing.
        Information: writable stream that receives the report.

    Returns:
        An empty tuple.

    Terminates the process with ``sys.exit(0)`` when the BAM index check
    fails, when fewer than 100 reads are available to estimate the read
    length, or when at most 1000 usable pairs are found for the insert-size
    estimate.

    NOTE(review): several statements of this function were missing from the
    reviewed copy (``try``/``else``/``break``/``sys.exit`` lines and one
    ``append``).  They were restored from the sibling copy of this function
    and from the commented-out code; confirm against the upstream source.
    """

    def _mean_and_stddev(values):
        # Sample mean and (n - 1)-normalised standard deviation.  Summing
        # squared deviations avoids the cancellation of the expanded form
        # x**2 - 2*x*m + m**2, which can go slightly negative.
        count = float(len(values))
        mean = sum(values) / count
        variance = sum((x - mean) ** 2 for x in values) / (count - 1)
        return mean, variance ** 0.5

    def _derive_thresholds():
        # -T is mean + 4 sd; the contig threshold equals -T when paths are
        # extended, otherwise mean + (sd / mean) * sd.
        param.ins_size_threshold = param.mean_ins_size + 4 * param.std_dev_ins_size
        if param.extend_paths:
            param.contig_threshold = param.ins_size_threshold
        else:
            param.contig_threshold = param.mean_ins_size + (
                param.std_dev_ins_size /
                float(param.mean_ins_size)) * param.std_dev_ins_size

    cont_names = bam_file.references
    cont_lengths_list = list(bam_file.lengths)
    # indexes of the 1000 longest contigs
    largest_contigs_indexes = set(
        nlargest(1000, range(len(cont_lengths_list)),
                 key=lambda i: cont_lengths_list[i]))

    # Probe fetch() once: it raises ValueError when the BAM file has no index.
    # NOTE(review): the probe target (first reference) is reconstructed from
    # the commented-out fetch() calls -- confirm.
    try:
        bam_file.fetch(cont_names[0])
    except ValueError:
        sys.stderr.write(
            'Need indexed bamfiles, index file should be located in the same directory as the BAM file\nterminating..\n'
        )
        sys.exit(0)

    # NOTE(review): the passes below iterate bam_file one after another; if
    # the handle does not rewind by itself, later passes start where the
    # previous one stopped -- confirm whether a rewind is needed in between.

    if not param.read_len:  # user has not specified read len
        nr_reads = 0
        tot_read_len = 0
        for read in bam_file:
            # fall back to the aligned length when the read length is 0
            tot_read_len += read.rlen if read.rlen != 0 else read.alen
            nr_reads += 1
            if nr_reads >= 100:
                param.read_len = tot_read_len / float(nr_reads)
                break
        else:
            # loop ended without reaching 100 reads
            sys.stderr.write('Did not get sufficient readmappings to calculate\
             read_length from mappings. Got {0} mappings. Please provide this parameter or more importantly\
             check why almost no reads are mapping to the contigs.\nterminating..\n'.format(nr_reads))
            sys.exit(0)

    if param.mean_ins_size and param.std_dev_ins_size and not param.ins_size_threshold:
        # user has specified mean and std dev but no thresholds
        _derive_thresholds()
        print >> Information, '-T', param.ins_size_threshold, '-t', param.contig_threshold

    if not param.mean_ins_size:  # user has not specified mean and std dev. (and no thresholds)
        counter = 1
        ins_size_reads = []
        for read in bam_file:
            on_large_contig = read.rname in largest_contigs_indexes
            # NOTE(review): single-argument predicate calls follow the sibling
            # copy of this function -- confirm the bam_parser signature.
            if param.orientation == 'fr' and bam_parser.is_proper_aligned_unique_innie(read):
                if on_large_contig:
                    ins_size_reads.append(abs(read.tlen))
                    counter += 1
            if param.orientation == 'rf' and bam_parser.is_proper_aligned_unique_outie(read):
                if on_large_contig:
                    ins_size_reads.append(abs(read.tlen) + 2 * param.read_len)
                    counter += 1
            if counter > 1000000:
                break

        # Assure that there were enough reads for computation of mean and variance
        if len(ins_size_reads) <= 1000:
            sys.stderr.write(
                'To few valid read alignments exists to compute mean and variance of library (need at least 1000 observations). Got only '
                + str(len(ins_size_reads)) +
                ' valid alignments. Please specify -m and -s to the program. \nPrinting out scaffolds produced in earlier steps...'
            )
            # The estimates below divide by n and n - 1, so stop here.
            sys.exit(0)

        ## SMOOTH OUT THE MEAN HERE by removing extreme observations##
        mean_isize, std_dev_isize = _mean_and_stddev(ins_size_reads)
        print >> Information, 'Mean before filtering :', mean_isize
        print >> Information, 'Std_est  before filtering: ', std_dev_isize
        extreme_obs_occur = True
        while extreme_obs_occur:
            extreme_obs_occur, ins_size_reads = AdjustInsertsizeDist(
                mean_isize, std_dev_isize, ins_size_reads)
            mean_isize, std_dev_isize = _mean_and_stddev(ins_size_reads)

        print >> Information, 'Mean converged:', mean_isize
        print >> Information, 'Std_est converged: ', std_dev_isize

        param.mean_ins_size = mean_isize
        param.std_dev_ins_size = std_dev_isize

    if not param.ins_size_threshold:
        _derive_thresholds()

    ## finally, get a reverse complemented read contamination distribution from the MP library if it exists
    n_contamine = get_contamination_metrics(largest_contigs_indexes, bam_file,
                                            cont_names, param, Information)

    print >> Information, ''
    print >> Information, 'LIBRARY STATISTICS'
    print >> Information, 'Mean of library set to:', param.mean_ins_size
    print >> Information, 'Standard deviation of library set to: ', param.std_dev_ins_size
    print >> Information, 'MP library PE contamination:'
    print >> Information, 'Contamine rate (rev comp oriented) estimated to: ', param.contamination_ratio
    print >> Information, 'lib contamine mean (avg fragmentation size): ', param.contamination_mean
    print >> Information, 'lib contamine stddev: ', param.contamination_stddev
    print >> Information, 'Number of contamined reads used for this calculation: ', n_contamine

    print >> Information, '-T (library insert size threshold) set to: ', param.ins_size_threshold
    print >> Information, '-k set to (Scaffolding with contigs larger than): ', param.contig_threshold
    print >> Information, 'Number of links required to create an edge: ', param.edgesupport
    print >> Information, 'Read length set to: ', param.read_len
    print >> Information, 'Relative weight of dominating link set to (default=3): ', param.rel_weight
    print >> Information, ''
    return ()
Beispiel #7
def get_contamination_metrics(largest_contigs_indexes, bam_file, cont_names, param, Information):
    """Estimate reverse-complement oriented contamination in the library.

    Scans reads on the selected contigs, collects a fragment size for every
    pair mapped in the orientation opposite to ``param.orientation``, trims
    extreme observations, and stores the resulting mean, standard deviation
    and contamination ratio on ``param``.

    Args:
        largest_contigs_indexes: container of reference indexes; only reads
            whose ``rname`` is in it are considered.
        bam_file: iterable of reads exposing ``rname``, ``is_unmapped`` and
            ``tlen``.
        cont_names: unused; kept so existing callers keep working.
        param: parameter object.  ``orientation``, ``read_len``,
            ``mean_ins_size`` and ``std_dev_ins_size`` are read;
            ``contamination_ratio``, ``contamination_mean`` and
            ``contamination_stddev`` are set.
        Information: writable stream for progress output.

    Returns:
        Number of contamination observations left after trimming, as a float.

    NOTE(review): the ``append`` calls, the ``break`` and the ``else`` were
    missing from the reviewed copy and have been restored; the fragment-size
    formulas follow the other variant of this function in the file.  Confirm
    against the upstream source.
    """
    iter_threshold = 1000000
    counter_total = 0
    contamination_reads = []
    counter = 0

    for read in bam_file:
        if read.rname not in largest_contigs_indexes:
            continue
        counter += 1
        # all reads mapping
        if not read.is_unmapped:
            counter_total += 1

        # contamination reads (mapped in reverse complemented orientation)
        if param.orientation == 'fr' and bam_parser.is_proper_aligned_unique_outie(read):
            contamination_reads.append(abs(read.tlen) + 2 * param.read_len)
        if param.orientation == 'rf' and bam_parser.is_proper_aligned_unique_innie(read):
            contamination_reads.append(abs(read.tlen))

        if counter >= iter_threshold:
            break

    def _mean_and_stddev(values):
        # Sample mean and (n - 1)-normalised standard deviation; summing
        # squared deviations avoids cancellation in the expanded form.
        count = float(len(values))
        mean = sum(values) / count
        variance = sum((x - mean) ** 2 for x in values) / (count - 1)
        return mean, variance ** 0.5

    ## SMOOTH OUT contamine distribution here by removing extreme observations##
    n_contamine = float(len(contamination_reads))
    mean_isize = 0
    std_dev_isize = 0
    if n_contamine > 2:
        mean_isize, std_dev_isize = _mean_and_stddev(contamination_reads)
        print >> Information, 'Contamine mean before filtering :', mean_isize
        print >> Information, 'Contamine stddev before filtering: ', std_dev_isize
        extreme_obs_occur = True
        while extreme_obs_occur:
            extreme_obs_occur, filtered_list = AdjustInsertsizeDist(mean_isize, std_dev_isize, contamination_reads)
            n_contamine = float(len(filtered_list))
            if n_contamine <= 2:
                # Too few observations to re-estimate.  The inputs to the next
                # call would be unchanged, so stop instead of looping forever.
                break
            mean_isize, std_dev_isize = _mean_and_stddev(filtered_list)
            contamination_reads = filtered_list
        print >> Information, 'Contamine mean converged:', mean_isize
        print >> Information, 'Contamine std_est converged: ', std_dev_isize

    if mean_isize >= param.mean_ins_size or std_dev_isize >= param.std_dev_ins_size or n_contamine <= 1000:
        # either contamine mean or stddev is higher than MP lb mean which means it's spurious alignments -> no true contamine
        # or we have less than 0.1% of contamine reads, then we skip dealing with them since they introduce more complexity
        # when orienting scaffolds in pathfinder module
        param.contamination_ratio = False
        param.contamination_mean = 0
        param.contamination_stddev = 0
    else:
        param.contamination_mean = mean_isize
        param.contamination_stddev = std_dev_isize
        param.contamination_ratio = 2 * n_contamine / float(counter_total)

    return n_contamine
Beispiel #8
def get_metrics(bam_file, param, Information):
    #informative_pair = set([147, 163]) #161,145,129,177,
    cont_names = bam_file.references
    cont_lengths = bam_file.lengths
    #cont_lengths=[int(nr) for nr in cont_lengths]  #convert long to int object
    cont_lengths_list = list(cont_lengths)
    indexes = [i for i in range(0, len(cont_lengths_list))]
    largest_contigs_indexes = set(nlargest(1000, indexes, key=lambda i: cont_lengths_list[i])) #get indexes of the 1000 longest contigs

    except ValueError:
        sys.stderr.write('Need indexed bamfiles, index file should be located in the same directory as the BAM file\nterminating..\n')
    #largest_contigs_indexes = nlargest(1000, indexes, key=lambda i: cont_lengths_list[i]) #get indexes of the 1000 longest contigs
    #print largest_contigs_indexes

    if not param.read_len: # user has not specified read len  
        #get read length
        nr_reads = 0
        tot_read_len = 0
        # for index in largest_contigs_indexes:
        #     try:
        #         iter_ = bam_file.fetch(cont_names[index])
        #     except ValueError:
        #         sys.stderr.write('Need indexed bamfiles, index file should be located in the same directory as the BAM file\nterminating..\n')
        #         sys.exit(0)

        for read in bam_file:
            if read.rlen != 0:
                tot_read_len += read.rlen
                nr_reads += 1
                tot_read_len += read.alen
                nr_reads += 1
            if nr_reads >= 100:
                param.read_len = tot_read_len / float(nr_reads)
            sys.stderr.write('Did not get sufficient readmappings to calculate\
             read_length from mappings. Got {0} mappings. Please provide this parameter or more importantly\
             check why almost no reads are mapping to the contigs.\nterminating..\n'.format(nr_reads))


    if param.mean_ins_size and param.std_dev_ins_size and not param.ins_size_threshold: # user has specified mean and std dev but no thresholds
        param.ins_size_threshold = param.mean_ins_size + 4 * param.std_dev_ins_size
        if param.extend_paths:
            param.contig_threshold = param.ins_size_threshold
            param.contig_threshold = param.mean_ins_size + (param.std_dev_ins_size / float(param.mean_ins_size)) * param.std_dev_ins_size
        print >> Information, '-T', param.ins_size_threshold, '-t', param.contig_threshold

    if not param.mean_ins_size: # user has not specified mean and std dev. (and no thresholds)
        #total_reads_iterated_through = 0
        counter = 1
        ins_size_reads = []
        # for index in largest_contigs_indexes:
        #     try:
        #         iter_ = bam_file.fetch(cont_names[index])
        #     except ValueError:
        #         sys.stderr.write('Need indexed bamfiles, index file should be located in the same directory as the BAM file\nterminating..\n')
        #         sys.exit(0)
        for read in bam_file:
            if param.orientation == 'fr' and bam_parser.is_proper_aligned_unique_innie(read):
                if read.rname in largest_contigs_indexes:
                    counter += 1
            if param.orientation == 'rf' and bam_parser.is_proper_aligned_unique_outie(read):
                if read.rname in largest_contigs_indexes:
                    ins_size_reads.append(abs(read.tlen) + 2*param.read_len)
                    counter += 1
            if counter > 1000000:
        # if counter > 1000000:
        #     break

        #get mean and std dev here. 
        #Assure that there were enough reads  for computation of mean and variance
        if len(ins_size_reads) <= 1000:
            sys.stderr.write('To few valid read alignments exists to compute mean and variance of library (need at least 1000 observations). Got only ' + str(len(ins_size_reads)) + ' valid alignments. Please specify -m and -s to the program. \nPrinting out scaffolds produced in earlier steps...')

        ## SMOOTH OUT THE MEAN HERE by removing extreme observations## 
        n = float(len(ins_size_reads))
        mean_isize = sum(ins_size_reads) / n
        std_dev_isize = (sum(list(map((lambda x: x ** 2 - 2 * x * mean_isize + mean_isize ** 2), ins_size_reads))) / (n - 1)) ** 0.5
        print >> Information, 'Mean before filtering :', mean_isize
        print >> Information, 'Std_est  before filtering: ', std_dev_isize
        extreme_obs_occur = True
        while extreme_obs_occur:
            extreme_obs_occur, filtered_list = AdjustInsertsizeDist(mean_isize, std_dev_isize, ins_size_reads)
            n = float(len(filtered_list))
            mean_isize = sum(filtered_list) / n
            std_dev_isize = (sum(list(map((lambda x: x ** 2 - 2 * x * mean_isize + mean_isize ** 2), filtered_list))) / (n - 1)) ** 0.5
            ins_size_reads = filtered_list

        n = float(len(ins_size_reads))
        mean_isize = sum(ins_size_reads) / n
        std_dev_isize = (sum(list(map((lambda x: x ** 2 - 2 * x * mean_isize + mean_isize ** 2), ins_size_reads))) / (n - 1)) ** 0.5

        print >> Information, 'Mean converged:', mean_isize
        print >> Information, 'Std_est converged: ', std_dev_isize

        param.mean_ins_size = mean_isize
        param.std_dev_ins_size = std_dev_isize
        total_reads_iterated_through = None

    if not param.ins_size_threshold:
        param.ins_size_threshold = param.mean_ins_size + 4 * param.std_dev_ins_size
        if param.extend_paths:
            param.contig_threshold = param.ins_size_threshold
            param.contig_threshold = param.mean_ins_size + (param.std_dev_ins_size / float(param.mean_ins_size)) * param.std_dev_ins_size

    ## finally, get a reverse complemented read contamination distribution from the MP library if it exists
    n_contamine = get_contamination_metrics(largest_contigs_indexes,bam_file, cont_names, param, Information)

    print >> Information, ''
    print >> Information, 'LIBRARY STATISTICS'
    print >> Information, 'Mean of library set to:', param.mean_ins_size
    print >> Information, 'Standard deviation of library set to: ', param.std_dev_ins_size
    print >> Information, 'MP library PE contamination:'
    print >> Information, 'Contamine rate (rev comp oriented) estimated to: ',  param.contamination_ratio
    print >> Information, 'lib contamine mean (avg fragmentation size): ', param.contamination_mean
    print >> Information, 'lib contamine stddev: ', param.contamination_stddev 
    print >> Information, 'Number of contamined reads used for this calculation: ', n_contamine

    print >> Information, '-T (library insert size threshold) set to: ', param.ins_size_threshold
    print >> Information, '-k set to (Scaffolding with contigs larger than): ', param.contig_threshold
    print >> Information, 'Number of links required to create an edge: ', param.edgesupport
    print >> Information, 'Read length set to: ', param.read_len
    print >> Information, 'Relative weight of dominating link set to (default=3): ', param.rel_weight
    print >> Information, ''