Example #1
0
	def sample_distribution(self):
		## pilot sample
		read_lengths = []
		# max_tlen = 0
		#bam_filtered = ifilter(lambda r: is_proper_aligned_unique_innie(r), self.bamfile)
		isize_list = []
		mcmc_dict = {}
		#nr_reads = 0
		#nr_mapped = 0
		# nr_proper_mapped = 0
		for sample_nr,read in enumerate(self.bamfile):
			## add to insert size distribution calculation if proper pair
			if is_proper_aligned_unique_innie(read) and not  read.is_reverse:
				self.param.nr_proper_mapped += 2 # add the read plus its mate since the mate does not enter here
				assert read.tlen > 0
				read_lengths.append(read.rlen)	
				isize_list.append(read.tlen)
				if read.tid in mcmc_dict:
					mcmc_dict[read.tid].append(read.tlen)
				else:
					mcmc_dict[read.tid] = [read.tlen]
				# if abs(read.tlen) > max_tlen:
				#	max_tlen = abs(read.tlen)
			if sample_nr >= SAMPLE_SIZE:
				break

		# for sample_nr,read in enumerate(bam_filtered):
	 #			## add do insert size distribution calculation if proper pair
		#	if is_proper_aligned_unique_innie(read) and not  read.is_reverse:
		#		assert read.tlen > 0
		#		read_lengths.append(read.rlen)	
		#		isize_list.append(read.tlen)
		#		# if abs(read.tlen) > max_tlen:
		#		#	max_tlen = abs(read.tlen)
		#	if sample_nr >= SAMPLE_SIZE:
		#		break

		# for read, mate_pos in fb.proper_read_isize(self.bamfile, self.param.lib_min, self.param.lib_max):
	 #			sample_nr += 1
	 #			## add do insert size distribution calculation if proper pair
	 #			if read.tlen >= 0:
		#	#if is_proper_aligned_unique_innie(read) and read.is_read1:
		#		read_lengths.append(read.rlen)	
		#		isize_list.append(read.tlen)
		#		# if abs(read.tlen) > max_tlen:
		#		#	max_tlen = abs(read.tlen)
		#	if sample_nr >= SAMPLE_SIZE:
		#		break

		self.bamfile.reset()
		#max_tlen = max_tlen+1000
		self.read_length = sum(read_lengths)/float(len(read_lengths))

		## sample proper reads
		
		# isize_list = []
		# for sample_nr,read in enumerate(proper_read_isize_iter(self.bampath, self.read_length, max_tlen)):
	 #			isize_list.append(read)
		#	if sample_nr > SAMPLE_SIZE:
		#		break
		params = dict()
		params["sample-nr"] = sample_nr

		isize_list = [x for x in isize_list if x > 2*self.read_length]

		n_isize = float(len(isize_list))
		mean_isize = sum(isize_list)/n_isize
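		# sample standard deviation: the summand x^2 - 2*x*mu + mu^2 is (x - mu)^2 expanded,
		# so the next line is sqrt( sum((x - mu)^2) / (n - 1) )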
		std_dev_isize =  (sum(list(map((lambda x: x ** 2 - 2 * x * mean_isize + mean_isize ** 2), isize_list))) / (n_isize - 1)) ** 0.5
		params["mu-raw"] = mean_isize
		params["sd-raw"] = std_dev_isize
		extreme_obs_occur = True
		while extreme_obs_occur:
			#print 'HERE!!'
			extreme_obs_occur, filtered_list = AdjustInsertsizeDist(mean_isize, std_dev_isize, isize_list)
			n_isize = float(len(filtered_list))
			mean_isize = sum(filtered_list) / n_isize
			std_dev_isize = (sum(list(map((lambda x: x ** 2 - 2 * x * mean_isize + mean_isize ** 2), filtered_list))) / (n_isize - 1)) ** 0.5
			isize_list = filtered_list

		self.min_isize, self.max_isize = min(isize_list), max(isize_list) 

		# filter outliers
		for ref in mcmc_dict.keys():
			ref_isizes = mcmc_dict[ref]
			mcmc_dict[ref] = list(filter(lambda x: self.min_isize <= x <= self.max_isize, ref_isizes))

		params["mu-filtered"] = mean_isize
		params["sd-filtered"] = std_dev_isize
		params["min-isize"] = self.min_isize
		params["max-isize"] = self.max_isize
		params["read-length"] = self.read_length

		self.nobs = n_isize
		self.mean = mean_isize
		self.stddev = std_dev_isize 
		self.full_ECDF = ECDF(isize_list)
		self.adjustedECDF_no_gap = None
		self.adjustedECDF_no_gap = self.get_correct_ECDF()
		params["mu-adjusted"] = self.adjusted_mean
		params["sd-adjusted"] = self.adjusted_stddev

		samples = min(SAMPLE_SIZE,len(isize_list))
		# ess = self.effectiveSampleSize(mcmc_dict) #isize_list[:samples]) # mcmc_dict ) #
		# self.ess_ratio = ess / float(sum(map(lambda x: len(mcmc_dict[x]), mcmc_dict)))
		params["ess"] = 1 #self.ess_ratio
		reference_lengths = [int(length) for length in self.bamfile.lengths]
		ref_list = zip(self.bamfile.references, reference_lengths)
		total_basepairs = sum(reference_lengths)
		self.param.total_basepairs = total_basepairs
		params["genome-length"] = total_basepairs
		params["contigs"] = []
		for ref, length in ref_list:
			params["contigs"].append( { "name" : ref, "length" : length } )
		
		json.dump(params, self.lib_file, sort_keys=True, indent=4, separators=(',', ': '))

		params = dict()
		if self.param.nr_reads:
			reads = dict()
			reads["total"] = self.param.nr_reads
			reads["mapped"] = self.param.nr_mapped
			reads["properly-mapped"] = self.param.nr_proper_mapped
			reads["mapped-percentage"] = self.param.nr_mapped/float(self.param.nr_reads)
			reads["properly-mapped-percentage"] = self.param.nr_proper_mapped/float(self.param.nr_reads)
			reads["coverage"] = self.param.nr_reads/float(total_basepairs)
			reads["coverage-mapped"] = self.param.nr_mapped/float(total_basepairs)
			reads["coverage-properly-mapped"] = self.param.nr_proper_mapped/float(total_basepairs)
			params["reads"] = reads

		info = dict()
		info["proper-samples"] = samples
		info["ess-proper-samples"] = 1 #ess
		info["ess-ratio"] = 1 #self.ess_ratio
		coverage = self.read_length*samples*2/float(total_basepairs)
		info["mean-coverage-proper"] = coverage
		inner_span_coverage = coverage * (self.mean -2*self.read_length)/(2*self.read_length)
		info["average-theoretical-inner-span-coverage"] = inner_span_coverage
		info["mu-full-lib"] = self.mean
		info["sd-full-lib"] = self.stddev
		info["mu-empirical"] = self.adjusted_mean
		info["sd-empirical"] = self.adjusted_stddev
		mu_naive = self.mean + self.stddev**2/float(self.mean - 2*self.read_length+1)
		sigma_naive = math.sqrt(self.stddev**2 - self.stddev**4/(self.mean -2*self.read_length +1)**2 )
		info["mu-naive"] = mu_naive
		info["sd-naive"] = sigma_naive
		mu_sophisticated = param_est.mean_given_d(self.mean, self.stddev, self.read_length, total_basepairs, total_basepairs, 0)
		sigma_sophisticated = param_est.stddev_given_d(self.mean, self.stddev, self.read_length, total_basepairs, total_basepairs, 0)
		info["mu-sophisticated"] = mu_sophisticated
		info["sd-sophisticated"] = sigma_sophisticated
		theoretical_margin_of_error = NORMAL_QUANTILE_TWO_SIDED_95*self.stddev / math.sqrt(inner_span_coverage)
		info["theoretical-error-margin-two-sided-95"] = theoretical_margin_of_error
		params["extra-info"] = info
		json.dump(params, self.stats_file, sort_keys=True, indent=4, separators=(',', ': '))
		self.stats_file.close()

		if self.param.plots:
			outfile = os.path.join(self.param.plotfolder, 'isize.eps')
			plot_isize(isize_list, outfile) 
			outfile = os.path.join(self.param.plotfolder, 'fitted_params_isize.eps')
			fit.main(isize_list, outfile)
Example #2
0
    def sample_distribution(self):
        ## pilot sample
        read_lengths = []
        # max_tlen = 0
        #bam_filtered = ifilter(lambda r: is_proper_aligned_unique_innie(r), self.bamfile)
        isize_list = []
        mcmc_dict = {}
        #nr_reads = 0
        #nr_mapped = 0
        # nr_proper_mapped = 0
        for sample_nr, read in enumerate(self.bamfile):
            ## add to insert size distribution calculation if proper pair
            if is_proper_aligned_unique_innie(read) and not read.is_reverse:
                self.param.nr_proper_mapped += 2  # add the read plus its mate since the mate does not enter here
                assert read.tlen > 0
                read_lengths.append(read.rlen)
                isize_list.append(read.tlen)
                if read.tid in mcmc_dict:
                    mcmc_dict[read.tid].append(read.tlen)
                else:
                    mcmc_dict[read.tid] = [read.tlen]
                # if abs(read.tlen) > max_tlen:
                #	max_tlen = abs(read.tlen)
            if sample_nr >= SAMPLE_SIZE:
                break

        # for sample_nr,read in enumerate(bam_filtered):
        #			## add do insert size distribution calculation if proper pair
        #	if is_proper_aligned_unique_innie(read) and not  read.is_reverse:
        #		assert read.tlen > 0
        #		read_lengths.append(read.rlen)
        #		isize_list.append(read.tlen)
        #		# if abs(read.tlen) > max_tlen:
        #		#	max_tlen = abs(read.tlen)
        #	if sample_nr >= SAMPLE_SIZE:
        #		break

        # for read, mate_pos in fb.proper_read_isize(self.bamfile, self.param.lib_min, self.param.lib_max):
        #			sample_nr += 1
        #			## add do insert size distribution calculation if proper pair
        #			if read.tlen >= 0:
        #	#if is_proper_aligned_unique_innie(read) and read.is_read1:
        #		read_lengths.append(read.rlen)
        #		isize_list.append(read.tlen)
        #		# if abs(read.tlen) > max_tlen:
        #		#	max_tlen = abs(read.tlen)
        #	if sample_nr >= SAMPLE_SIZE:
        #		break

        self.bamfile.reset()
        #max_tlen = max_tlen+1000
        self.read_length = sum(read_lengths) / float(len(read_lengths))

        ## sample proper reads

        # isize_list = []
        # for sample_nr,read in enumerate(proper_read_isize_iter(self.bampath, self.read_length, max_tlen)):
        #			isize_list.append(read)
        #	if sample_nr > SAMPLE_SIZE:
        #		break
        params = dict()
        params["sample-nr"] = sample_nr

        isize_list = [x for x in isize_list if x > 2 * self.read_length]

        n_isize = float(len(isize_list))
        mean_isize = sum(isize_list) / n_isize
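        # sample standard deviation: the summand x^2 - 2*x*mu + mu^2 is (x - mu)^2 expanded,
        # so the next statement is sqrt( sum((x - mu)^2) / (n - 1) )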
        std_dev_isize = (sum(
            list(
                map((lambda x: x**2 - 2 * x * mean_isize + mean_isize**2),
                    isize_list))) / (n_isize - 1))**0.5
        params["mu-raw"] = mean_isize
        params["sd-raw"] = std_dev_isize
        extreme_obs_occur = True
        while extreme_obs_occur:
            #print 'HERE!!'
            extreme_obs_occur, filtered_list = AdjustInsertsizeDist(
                mean_isize, std_dev_isize, isize_list)
            n_isize = float(len(filtered_list))
            mean_isize = sum(filtered_list) / n_isize
            std_dev_isize = (sum(
                list(
                    map((lambda x: x**2 - 2 * x * mean_isize + mean_isize**2),
                        filtered_list))) / (n_isize - 1))**0.5
            isize_list = filtered_list

        self.min_isize, self.max_isize = min(isize_list), max(isize_list)

        # filter outliers
        for ref in mcmc_dict.keys():
            ref_isizes = mcmc_dict[ref]
            mcmc_dict[ref] = list(
                filter(lambda x: self.min_isize <= x <= self.max_isize,
                       ref_isizes))

        params["mu-filtered"] = mean_isize
        params["sd-filtered"] = std_dev_isize
        params["min-isize"] = self.min_isize
        params["max-isize"] = self.max_isize
        params["read-length"] = self.read_length

        self.nobs = n_isize
        self.mean = mean_isize
        self.stddev = std_dev_isize
        self.full_ECDF = ECDF(isize_list)
        self.adjustedECDF_no_gap = None
        self.adjustedECDF_no_gap = self.get_correct_ECDF()
        params["mu-adjusted"] = self.adjusted_mean
        params["sd-adjusted"] = self.adjusted_stddev

        samples = min(SAMPLE_SIZE, len(isize_list))
        # ess = self.effectiveSampleSize(mcmc_dict) #isize_list[:samples]) # mcmc_dict ) #
        # self.ess_ratio = ess / float(sum(map(lambda x: len(mcmc_dict[x]), mcmc_dict)))
        params["ess"] = 1  #self.ess_ratio
        reference_lengths = [int(length) for length in self.bamfile.lengths]
        ref_list = zip(self.bamfile.references, reference_lengths)
        total_basepairs = sum(reference_lengths)
        self.param.total_basepairs = total_basepairs
        params["genome-length"] = total_basepairs
        params["contigs"] = []
        for ref, length in ref_list:
            params["contigs"].append({"name": ref, "length": length})

        json.dump(params,
                  self.lib_file,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))

        params = dict()
        if self.param.nr_reads:
            reads = dict()
            reads["total"] = self.param.nr_reads
            reads["mapped"] = self.param.nr_mapped
            reads["properly-mapped"] = self.param.nr_proper_mapped
            reads["mapped-percentage"] = self.param.nr_mapped / float(
                self.param.nr_reads)
            reads[
                "properly-mapped-percentage"] = self.param.nr_proper_mapped / float(
                    self.param.nr_reads)
            reads["coverage"] = self.param.nr_reads / float(total_basepairs)
            reads["coverage-mapped"] = self.param.nr_mapped / float(
                total_basepairs)
            reads[
                "coverage-properly-mapped"] = self.param.nr_proper_mapped / float(
                    total_basepairs)
            params["reads"] = reads

        info = dict()
        info["proper-samples"] = samples
        info["ess-proper-samples"] = 1  #ess
        info["ess-ratio"] = 1  #self.ess_ratio
        coverage = self.read_length * samples * 2 / float(total_basepairs)
        info["mean-coverage-proper"] = coverage
        inner_span_coverage = coverage * (self.mean - 2 * self.read_length) / (
            2 * self.read_length)
        info["average-theoretical-inner-span-coverage"] = inner_span_coverage
        info["mu-full-lib"] = self.mean
        info["sd-full-lib"] = self.stddev
        info["mu-empirical"] = self.adjusted_mean
        info["sd-empirical"] = self.adjusted_stddev
        mu_naive = self.mean + self.stddev**2 / float(self.mean -
                                                      2 * self.read_length + 1)
        sigma_naive = math.sqrt(self.stddev**2 - self.stddev**4 /
                                (self.mean - 2 * self.read_length + 1)**2)
        info["mu-naive"] = mu_naive
        info["sd-naive"] = sigma_naive
        mu_sophisticated = param_est.mean_given_d(self.mean, self.stddev,
                                                  self.read_length,
                                                  total_basepairs,
                                                  total_basepairs, 0)
        sigma_sophisticated = param_est.stddev_given_d(self.mean, self.stddev,
                                                       self.read_length,
                                                       total_basepairs,
                                                       total_basepairs, 0)
        info["mu-sophisticated"] = mu_sophisticated
        info["sd-sophisticated"] = sigma_sophisticated
        theoretical_margin_of_error = NORMAL_QUANTILE_TWO_SIDED_95 * self.stddev / math.sqrt(
            inner_span_coverage)
        info[
            "theoretical-error-margin-two-sided-95"] = theoretical_margin_of_error
        params["extra-info"] = info
        json.dump(params,
                  self.stats_file,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))
        self.stats_file.close()

        if self.param.plots:
            outfile = os.path.join(self.param.plotfolder, 'isize.eps')
            plot_isize(isize_list, outfile)
            outfile = os.path.join(self.param.plotfolder,
                                   'fitted_params_isize.eps')
            fit.main(isize_list, outfile)
Example #3
0
def plot_bp_specific_distr(infile, param):
	means = {}
	stddevs = {}
	for i in [2, 51, 201,501]:
		means[i]=[]
		stddevs[i] = []

	avg_mean = 0
	avg_stddev = 0
	avg_spancov = 0
	tot_pos = 0

	for line in infile:
		[ref,pos, n_obs,mean,sigma] = line.strip().split()
		n_obs = int(float(n_obs))
		mean = float(mean)
		sigma = float(sigma)
		
		if n_obs > 2:
			avg_mean += mean
			avg_stddev += sigma
			avg_spancov += n_obs
			tot_pos += 1

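		# bucket positions by how many pairs span them; the dict keys double as the lower
		# bounds for the ">{n} obs" legend labels, and the boundaries below are chosen so
		# every position with more than 2 observations lands in exactly one bucket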
		if 2 < n_obs <= 51:
			means[2].append(mean)
			stddevs[2].append(sigma)

		elif 51 < n_obs <= 201:
			means[51].append(mean)
			stddevs[51].append(sigma)

		elif 201 < n_obs <= 501:
			means[201].append(mean)
			stddevs[201].append(sigma)

		elif 501 < n_obs:
			means[501].append(mean)
			stddevs[501].append(sigma)

	# print len(m_1), len(m_2), len(m_3),len(m_4)
	avg_mean = avg_mean / float(tot_pos)
	avg_stddev = avg_stddev / float(tot_pos)
	avg_spancov = avg_spancov /float(tot_pos)
	print avg_mean,avg_stddev, avg_spancov

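	# drop empty buckets and pair each bucket key with its list of values; dict iteration
	# order is arbitrary in Python 2, so the legend order simply follows dict order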
	nr_obs, mu = zip(*filter(lambda x: means[x[0]] , means.iteritems()))
	nr_obs, sigma = zip(*filter(lambda x: stddevs[x[0]] , stddevs.iteritems()))
	#nr_obs = list(nr_obs)
	#nr_obs.sort()
	labels = []
	for low in nr_obs:
		labels.append(">{0} obs".format(low))
	#labels.append(">{0} obs".format(high))
	plt.hist(mu, stacked=True, bins=100, log=True, label=labels)
	plt.ylabel('Frequency (log scale)')
	plt.xlabel('mean insert size of pairs spanning the position')
	title = "Bp specific mean insert size (avg. over genome = %.2f)" % (avg_mean)
	plt.title(title)
	plt.legend( )
	out = os.path.join(param.plotfolder, 'bp_specific_mean.eps')
	plt.savefig(out)
	plt.close()

	plt.hist(sigma, stacked=True, bins=100, log=True, label=labels)
	plt.ylabel('Frequency (log scale)')
	plt.xlabel('standard deviation of insert size of pairs spanning the position')
	title  = "Bp specific stddev of insert size (avg. over genome = %.2f)" % (avg_stddev)
	plt.title(title)
	plt.legend( )
	out = os.path.join(param.plotfolder, 'bp_specific_stddev.eps')
	plt.savefig(out)
	plt.close()
	stddevs = {}

	out = os.path.join(param.plotfolder, 'fitted_params_avg_span.eps')
	bp_list= []
	for key in means:
		for mean in means[key]:
			bp_list.append(mean) 
	fit.main(bp_list, out)
Example #4
0
def plot_bp_specific_distr(infile, param):
    means = {}
    stddevs = {}
    for i in [2, 51, 201, 501]:
        means[i] = []
        stddevs[i] = []

    avg_mean = 0
    avg_stddev = 0
    avg_spancov = 0
    tot_pos = 0

    for line in infile:
        [ref, pos, n_obs, mean, sigma] = line.strip().split()
        n_obs = int(float(n_obs))
        mean = float(mean)
        sigma = float(sigma)

        if n_obs > 2:
            avg_mean += mean
            avg_stddev += sigma
            avg_spancov += n_obs
            tot_pos += 1

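        # bucket positions by how many pairs span them; the dict keys double as the lower
        # bounds for the ">{n} obs" legend labels, and the boundaries below are chosen so
        # every position with more than 2 observations lands in exactly one bucket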
        if 2 < n_obs <= 51:
            means[2].append(mean)
            stddevs[2].append(sigma)

        elif 51 < n_obs <= 201:
            means[51].append(mean)
            stddevs[51].append(sigma)

        elif 201 < n_obs <= 501:
            means[201].append(mean)
            stddevs[201].append(sigma)

        elif 501 < n_obs:
            means[501].append(mean)
            stddevs[501].append(sigma)

    # print len(m_1), len(m_2), len(m_3),len(m_4)
    avg_mean = avg_mean / float(tot_pos)
    avg_stddev = avg_stddev / float(tot_pos)
    avg_spancov = avg_spancov / float(tot_pos)
    print avg_mean, avg_stddev, avg_spancov

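    # drop empty buckets and pair each bucket key with its list of values; dict iteration
    # order is arbitrary in Python 2, so the legend order simply follows dict order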
    nr_obs, mu = zip(*filter(lambda x: means[x[0]], means.iteritems()))
    nr_obs, sigma = zip(*filter(lambda x: stddevs[x[0]], stddevs.iteritems()))
    #nr_obs = list(nr_obs)
    #nr_obs.sort()
    labels = []
    for low in nr_obs:
        labels.append(">{0} obs".format(low))
    #labels.append(">{0} obs".format(high))
    plt.hist(mu, stacked=True, bins=100, log=True, label=labels)
    plt.ylabel('Frequency (log scale)')
    plt.xlabel('mean insert size of pairs spanning the position')
    title = "Bp specific mean insert size (avg. over genome = %.2f)" % (
        avg_mean)
    plt.title(title)
    plt.legend()
    out = os.path.join(param.plotfolder, 'bp_specific_mean.eps')
    plt.savefig(out)
    plt.close()

    plt.hist(sigma, stacked=True, bins=100, log=True, label=labels)
    plt.ylabel('Frequency (log scale)')
    plt.xlabel('standard deviation of insert size of pairs spanning the position')
    title = "Bp specific stddev of insert size (avg. over genome = %.2f)" % (
        avg_stddev)
    plt.title(title)
    plt.legend()
    out = os.path.join(param.plotfolder, 'bp_specific_stddev.eps')
    plt.savefig(out)
    plt.close()
    stddevs = {}

    out = os.path.join(param.plotfolder, 'fitted_params_avg_span.eps')
    bp_list = []
    for key in means:
        for mean in means[key]:
            bp_list.append(mean)
    fit.main(bp_list, out)