Esempio n. 1
0
def main():
    """Command-line entry point: write per-transcript entropy/TIN tables.

    Parses ``-i`` (BAM input specification) and ``-r`` (reference gene model,
    BED).  For every BAM file resolved by ``getBamFiles.get_bam_files`` a
    ``<basename with 'bam' removed>tin.xls`` table is written to the current
    directory with one row per transcript yielded by ``genebody_percentile``,
    and one summary row (mean / median / stdev of the TIN values) is written
    to stdout.

    Transcripts with no sampled positions, or whose entropy is reported as
    ``"NA"`` by ``mystat.shannon_entropy``, get ``NA`` in the entropy/TIN
    columns and are excluded from the summary statistics.

    Exits with status 0 after printing help when a required option is
    missing, or after an error message when the gene model does not exist.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input", action="store", type="string", dest="input_files", help='Input file(s) in BAM format. "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files. 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam file (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools. [required]')
    parser.add_option("-r", "--refgene", action="store", type="string", dest="ref_gene_model", help="Reference gene model in bed format. [required]")
    (options, args) = parser.parse_args()

    if not (options.input_files and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        sys.stderr.write('\n\n' + options.ref_gene_model + " does NOT exists" + '\n' + '\n')
        sys.exit(0)

    printlog("Read BED file (reference gene model) ...")

    printlog("Get BAM file(s) ...")
    bamfiles = sorted(getBamFiles.get_bam_files(options.input_files))

    sys.stderr.write("Total %d BAM files:" % len(bamfiles) + "\n")
    for f in bamfiles:
        sys.stderr.write("\t" + f + "\n")

    sys.stdout.write("\t".join(['Bam_file', 'TIN(mean)', 'TIN(median)', 'TIN(stdev)']) + "\n")
    for f in bamfiles:
        tin_values = []
        # 'with' / 'finally' guarantee both handles are released even if a
        # transcript fails part-way through the file.
        with open(os.path.basename(f).replace('bam', '') + 'tin.xls', 'w') as OUT:
            OUT.write("\t".join(["geneID", "chrom", "tx_start", "tx_end", "entropy", "TIN"]) + "\n")
            samfile = pysam.Samfile(f, "rb")
            try:
                # The gene model is re-read for every BAM file.
                for gname, i_chr, i_tx_start, i_tx_end, positions in genebody_percentile(options.ref_gene_model):
                    entropy = "NA"
                    tin = "NA"
                    # Only query the BAM file when there is something to
                    # sample; an empty position list is reported as NA
                    # instead of being pushed through the entropy math.
                    if len(positions) != 0:
                        coverage = genebody_coverage(samfile, i_chr, positions)
                        entropy = mystat.shannon_entropy(coverage)
                        if entropy != "NA":
                            tin = 10 * (math.exp(entropy) / len(coverage))
                            tin_values.append(tin)
                    OUT.write('\t'.join([str(i) for i in (gname, i_chr, i_tx_start, i_tx_end, entropy, tin)]) + "\n")
            finally:
                samfile.close()
        sys.stdout.write("\t".join([str(i) for i in (os.path.basename(f), mean(tin_values), median(tin_values), std(tin_values))]) + "\n")
Esempio n. 2
0
def main():
    """Command-line entry point: gene-body coverage table, skewness and plot.

    Steps, in order:

    1. Parse and validate options (``-i``, ``-r``, ``-o`` are required and
       ``-l`` must be at least 100); on failure print help or a message and
       exit with status 0.
    2. Write ``<prefix>.geneBodyCoverage.txt``: a ``Percentile`` header
       (1..100) followed by one row per BAM file for which
       ``genebody_coverage`` returned a non-empty result.  Repeated sample
       names get a ``.<count>`` suffix.
    3. Read that table back, compute ``pearson_moment_coefficient`` per
       sample and min-max scale each row to [0, 1].
    4. Print the samples ordered by decreasing skewness to stderr, write the
       R script through ``Rcode_write`` and run it with ``Rscript``.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input",
        action="store",
        type="string",
        dest="input_files",
        help=
        'Input file(s) in BAM format. "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files. 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam file (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools.'
    )
    parser.add_option("-r",
                      "--refgene",
                      action="store",
                      type="string",
                      dest="ref_gene_model",
                      help="Reference gene model in bed format. [required]")
    parser.add_option(
        "-l",
        "--minimum_length",
        action="store",
        type="int",
        default=100,
        dest="min_mRNA_length",
        help=
        "Minimum mRNA length (bp). mRNA smaller than \"min_mRNA_length\" will be skipped. default=%default"
    )
    parser.add_option(
        "-f",
        "--format",
        action="store",
        type="string",
        dest="output_format",
        default='pdf',
        help="Output file format, 'pdf', 'png' or 'jpeg'. default=%default")
    parser.add_option("-o",
                      "--out-prefix",
                      action="store",
                      type="string",
                      dest="output_prefix",
                      help="Prefix of output files(s). [required]")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_files
            and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        sys.stderr.write('\n\n' + options.ref_gene_model + " does NOT exists" + '\n' + '\n')
        sys.exit(0)
    if options.min_mRNA_length < 100:
        sys.stderr.write('The number specified to "-l" cannot be smaller than 100.' + '\n' + '\n')
        sys.exit(0)

    coverage_table = options.output_prefix + ".geneBodyCoverage.txt"
    r_script = options.output_prefix + '.geneBodyCoverage.r'

    # The context manager closes the table even if a BAM file fails midway.
    with open(coverage_table, 'w') as OUT1:
        OUT1.write("Percentile\t" + '\t'.join([str(i) for i in range(1, 101)]) + "\n")

        printlog("Read BED file (reference gene model) ...")
        gene_percentiles = genebody_percentile(
            refbed=options.ref_gene_model,
            mRNA_len_cut=options.min_mRNA_length)

        printlog("Get BAM file(s) ...")
        bamfiles = getBamFiles.get_bam_files(options.input_files)
        for bamfile in bamfiles:
            sys.stderr.write("\t" + bamfile + "\n")

        file_container = []  # sample names already written, duplicates included
        for bamfile in bamfiles:
            printlog("Processing " + basename(bamfile) + ' ...')
            cvg = genebody_coverage(bamfile, gene_percentiles)
            if len(cvg) == 0:
                sys.stderr.write("\nCannot get coverage signal from " +
                                 basename(bamfile) + ' ! Skip' + "\n")
                continue
            tmp = valid_name(basename(bamfile).replace('.bam', ''))  # scrutinize R identifier
            # Disambiguate repeated sample names with a ".<n>" suffix.
            seen_before = file_container.count(tmp)
            row_name = tmp if seen_before == 0 else tmp + '.' + str(seen_before)
            OUT1.write(row_name + '\t' + '\t'.join(
                [str(cvg[k]) for k in sorted(cvg)]) + "\n")
            file_container.append(tmp)

    dataset = []
    with open(coverage_table, 'r') as table:
        for line in table:
            line = line.strip()
            if line.startswith("Percentile"):
                continue
            fields = line.split()
            name = fields[0]
            dat = [float(i) for i in fields[1:]]
            skewness = pearson_moment_coefficient(dat)
            # Min-max scale to [0, 1].  min/max are computed once per row,
            # and a flat profile (max == min) maps to all zeros instead of
            # raising ZeroDivisionError.
            lowest = min(dat)
            spread = max(dat) - lowest
            if spread == 0:
                scaled = [0.0] * len(dat)
            else:
                scaled = [(i - lowest) / spread for i in dat]
            dataset.append((name, scaled, skewness))
    dataset.sort(key=operator.itemgetter(2), reverse=True)

    sys.stderr.write("\n\n" + "\n")
    sys.stderr.write("\tSample\tSkewness" + "\n")
    for a, b, c in dataset:
        sys.stderr.write('\t' + a + '\t' + str(c) + "\n")
    Rcode_write(dataset,
                options.output_prefix + '.geneBodyCoverage',
                format=options.output_format)

    printlog("Running R script ...")
    # An argument list (no shell) keeps prefixes containing spaces or shell
    # metacharacters intact.  A missing Rscript binary raises OSError; a
    # failing script is reported through a non-zero exit status.
    try:
        status = subprocess.call(["Rscript", r_script])
    except OSError:
        status = -1
    if status != 0:
        sys.stderr.write("Cannot generate pdf file from " + r_script + "\n")
Esempio n. 3
0
def main():
    """Command-line entry point: compute per-transcript TIN for each BAM file.

    For every BAM file resolved from ``-i`` two files are written to the
    current directory, named from the BAM basename with ``'bam'`` removed:

    * ``...tin.xls``     - one row per transcript yielded by
      ``genomic_positions``; transcripts failing ``check_min_reads`` are
      written with a TIN of ``0.0`` and left out of the summary.
    * ``...summary.txt`` - mean / median / stdev of the TINs of the
      remaining transcripts.

    With ``-s`` the coverage passed to ``tin_score`` is computed with a
    per-transcript background level (intronic signal divided by intron
    size).

    Exits with status 0 after a message or the help text when validation
    fails or no BAM file is found.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-i",
        "--input",
        action="store",
        type="string",
        dest="input_files",
        help=
        'Input BAM file(s). "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files (no spaces allowed). 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam files (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools. [required]'
    )
    parser.add_option(
        "-r",
        "--refgene",
        action="store",
        type="string",
        dest="ref_gene_model",
        help=
        "Reference gene model in BED format. Must be strandard 12-column BED file. [required]"
    )
    parser.add_option(
        "-c",
        "--minCov",
        action="store",
        type="int",
        dest="minimum_coverage",
        default=10,
        help="Minimum number of read mapped to a transcript. default=%default")
    parser.add_option(
        "-n",
        "--sample-size",
        action="store",
        type="int",
        dest="sample_size",
        default=100,
        help=
        "Number of equal-spaced nucleotide positions picked from mRNA. Note: if this number is larger than the length of mRNA (L), it will be halved until it's smaller than L. default=%default"
    )
    parser.add_option(
        "-s",
        "--subtract-background",
        action="store_true",
        dest="subtract_bg",
        help=
        "Subtract background noise (estimated from intronic reads). Only use this option if there are substantial intronic reads."
    )
    (options, args) = parser.parse_args()

    if options.sample_size < 0:
        sys.stderr.write("Number of nucleotide can't be negative" + "\n")
        sys.exit(0)
    elif options.sample_size > 1000:
        sys.stderr.write("Warning: '-n' is too large! Please try smaller '-n' valeu if program is running slow." + "\n")

    if not (options.input_files and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        sys.stderr.write('\n\n' + options.ref_gene_model + " does NOT exists" + '\n' + '\n')
        parser.print_help()
        sys.exit(0)

    # Build the exon union only after the gene model has been validated, so
    # that "-s" with a missing or non-existent "-r" prints the help text
    # instead of failing inside union_exons.
    exon_ranges = None
    if options.subtract_bg:
        exon_ranges = union_exons(options.ref_gene_model)

    printlog("Get BAM file(s) ...")
    bamfiles = sorted(getBamFiles.get_bam_files(options.input_files))

    if len(bamfiles) <= 0:
        sys.stderr.write("No BAM file found, exit." + "\n")
        sys.exit(0)
    else:
        sys.stderr.write("Total %d BAM file(s):" % len(bamfiles) + "\n")
        for f in bamfiles:
            sys.stderr.write("\t" + f + "\n")

    for f in bamfiles:
        printlog("Processing " + f)
        out_stem = os.path.basename(f).replace('bam', '')

        # Nested 'with' plus 'finally' release all three handles even when a
        # transcript raises part-way through the file.
        with open(out_stem + 'summary.txt', 'w') as SUM:
            SUM.write("\t".join(
                ['Bam_file', 'TIN(mean)', 'TIN(median)', 'TIN(stdev)']) + "\n")

            with open(out_stem + 'tin.xls', 'w') as OUT:
                OUT.write("\t".join(
                    ["geneID", "chrom", "tx_start", "tx_end", "TIN"]) + "\n")

                samfile = pysam.Samfile(f, "rb")
                sample_TINs = []  # sample level TIN, values are from different genes
                finish = 0
                try:
                    for gname, i_chr, i_tx_start, i_tx_end, intron_size, pick_positions in genomic_positions(
                            refbed=options.ref_gene_model,
                            sample_size=options.sample_size):
                        finish += 1

                        # check minimum reads coverage
                        if check_min_reads(samfile, i_chr, i_tx_start,
                                           i_tx_end,
                                           options.minimum_coverage) is not True:
                            OUT.write('\t'.join([
                                str(i) for i in (gname, i_chr, i_tx_start,
                                                 i_tx_end, 0.0)
                            ]) + "\n")
                            continue

                        # Background is estimated per transcript; reset it so
                        # a transcript without introns does not inherit the
                        # level computed for the previous one.
                        noise_level = 0.0
                        if options.subtract_bg:
                            intron_signals = estimate_bg_noise(
                                i_chr, i_tx_start, i_tx_end, samfile,
                                exon_ranges)
                            if intron_size > 0:
                                noise_level = intron_signals / intron_size

                        coverage = genebody_coverage(samfile, i_chr,
                                                     sorted(pick_positions),
                                                     noise_level)

                        tin1 = tin_score(cvg=coverage, l=len(pick_positions))
                        sample_TINs.append(tin1)
                        OUT.write('\t'.join([
                            str(i) for i in (gname, i_chr, i_tx_start,
                                             i_tx_end, tin1)
                        ]) + "\n")
                        # Progress counter; '\r' keeps it on a single line.
                        sys.stderr.write(" %d transcripts finished\r" % (finish))
                finally:
                    samfile.close()

            SUM.write("\t".join([
                str(i) for i in (os.path.basename(f), mean(sample_TINs),
                                 median(sample_TINs), std(sample_TINs))
            ]) + "\n")
def main():
    """Command-line entry point: gene-body coverage table, skewness and plot.

    Validates the options (``-i``, ``-r`` and ``-o`` are required, ``-l``
    must be >= 100), writes ``<prefix>.geneBodyCoverage.txt`` with one
    coverage row per BAM file that produced a signal, reads the table back
    to compute ``pearson_moment_coefficient`` and a min-max scaled profile
    per sample, reports the samples by decreasing skewness on stderr, and
    finally writes and runs the R plotting script.

    On a validation failure the help text or a message is printed and the
    process exits with status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input", action="store", type="string", dest="input_files", help='Input file(s) in BAM format. "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files. 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam file (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools.')
    parser.add_option("-r", "--refgene", action="store", type="string", dest="ref_gene_model", help="Reference gene model in bed format. [required]")
    parser.add_option("-l", "--minimum_length", action="store", type="int", default=100, dest="min_mRNA_length", help="Minimum mRNA length (bp). mRNA smaller than \"min_mRNA_length\" will be skipped. default=%default")
    parser.add_option("-f", "--format", action="store", type="string", dest="output_format", default='pdf', help="Output file format, 'pdf', 'png' or 'jpeg'. default=%default")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix", help="Prefix of output files(s). [required]")
    (options, args) = parser.parse_args()

    if not (options.output_prefix and options.input_files and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        sys.stderr.write('\n\n' + options.ref_gene_model + " does NOT exists" + '\n' + '\n')
        sys.exit(0)
    if options.min_mRNA_length < 100:
        sys.stderr.write('The number specified to "-l" cannot be smaller than 100.' + '\n' + '\n')
        sys.exit(0)

    txt_path = options.output_prefix + ".geneBodyCoverage.txt"
    rscript_path = options.output_prefix + '.geneBodyCoverage.r'

    # 'with' closes the table even when processing a BAM file raises.
    with open(txt_path, 'w') as OUT1:
        OUT1.write("Percentile\t" + '\t'.join([str(i) for i in range(1, 101)]) + "\n")

        printlog("Read BED file (reference gene model) ...")
        gene_percentiles = genebody_percentile(refbed=options.ref_gene_model, mRNA_len_cut=options.min_mRNA_length)

        printlog("Get BAM file(s) ...")
        bamfiles = getBamFiles.get_bam_files(options.input_files)
        for bamfile in bamfiles:
            sys.stderr.write("\t" + bamfile + "\n")

        file_container = []  # every sample name written so far (with repeats)
        for bamfile in bamfiles:
            printlog("Processing " + basename(bamfile) + ' ...')
            cvg = genebody_coverage(bamfile, gene_percentiles)
            if len(cvg) == 0:
                sys.stderr.write("\nCannot get coverage signal from " + basename(bamfile) + ' ! Skip' + "\n")
                continue
            tmp = valid_name(basename(bamfile).replace('.bam', ''))  # scrutinize R identifier
            # A name seen n times before is written as "<name>.<n>".
            repeats = file_container.count(tmp)
            label = tmp if repeats == 0 else tmp + '.' + str(repeats)
            OUT1.write(label + '\t' + '\t'.join([str(cvg[k]) for k in sorted(cvg)]) + "\n")
            file_container.append(tmp)

    dataset = []
    with open(txt_path, 'r') as table:
        for line in table:
            line = line.strip()
            if line.startswith("Percentile"):
                continue
            fields = line.split()
            name = fields[0]
            dat = [float(i) for i in fields[1:]]
            skewness = pearson_moment_coefficient(dat)
            # Scale to [0, 1] with min/max evaluated once per row; a flat
            # profile (max == min) becomes all zeros rather than dividing
            # by zero.
            floor = min(dat)
            value_range = max(dat) - floor
            if value_range == 0:
                normalized = [0.0] * len(dat)
            else:
                normalized = [(i - floor) / value_range for i in dat]
            dataset.append((name, normalized, skewness))
    dataset.sort(key=operator.itemgetter(2), reverse=True)

    sys.stderr.write("\n\n" + "\n")
    sys.stderr.write("\tSample\tSkewness" + "\n")
    for a, b, c in dataset:
        sys.stderr.write('\t' + a + '\t' + str(c) + "\n")
    Rcode_write(dataset, options.output_prefix + '.geneBodyCoverage', format=options.output_format)

    printlog("Running R script ...")
    # Run Rscript without a shell so the script path is passed verbatim.
    # OSError covers a missing Rscript executable; a non-zero exit status
    # covers a script that failed to run.
    try:
        exit_status = subprocess.call(["Rscript", rscript_path])
    except OSError:
        exit_status = -1
    if exit_status != 0:
        sys.stderr.write("Cannot generate pdf file from " + rscript_path + "\n")
Esempio n. 5
0
def main():
    """Command-line entry point: compute per-transcript TIN for each BAM file.

    Each BAM file resolved from ``-i`` produces, in the current directory
    and named from the BAM basename with ``'bam'`` removed:

    * ``...tin.xls``     - geneID, chrom, tx_start, tx_end and TIN for every
      transcript yielded by ``genomic_positions``.  Transcripts rejected by
      ``check_min_reads`` are written with TIN ``0.0`` and do not contribute
      to the summary.
    * ``...summary.txt`` - mean, median and stdev of the collected TINs.

    ``-s`` enables background subtraction: the intronic signal from
    ``estimate_bg_noise`` divided by the intron size is passed to
    ``genebody_coverage`` as the noise level of that transcript.

    Validation failures print a message or the help text and exit with
    status 0.
    """
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input", action="store", type="string", dest="input_files", help='Input BAM file(s). "-i" takes these input: 1) a single BAM file. 2) "," separated BAM files (no spaces allowed). 3) directory containing one or more bam files. 4) plain text file containing the path of one or more bam files (Each row is a BAM file path). All BAM files should be sorted and indexed using samtools. [required]')
    parser.add_option("-r", "--refgene", action="store", type="string", dest="ref_gene_model", help="Reference gene model in BED format. Must be strandard 12-column BED file. [required]")
    parser.add_option("-c", "--minCov", action="store", type="int", dest="minimum_coverage", default=10, help="Minimum number of read mapped to a transcript. default=%default")
    parser.add_option("-n", "--sample-size", action="store", type="int", dest="sample_size", default=100, help="Number of equal-spaced nucleotide positions picked from mRNA. Note: if this number is larger than the length of mRNA (L), it will be halved until it's smaller than L. default=%default")
    parser.add_option("-s", "--subtract-background", action="store_true", dest="subtract_bg", help="Subtract background noise (estimated from intronic reads). Only use this option if there are substantial intronic reads.")
    (options, args) = parser.parse_args()

    if options.sample_size < 0:
        sys.stderr.write("Number of nucleotide can't be negative" + "\n")
        sys.exit(0)
    elif options.sample_size > 1000:
        sys.stderr.write("Warning: '-n' is too large! Please try smaller '-n' valeu if program is running slow." + "\n")

    if not (options.input_files and options.ref_gene_model):
        parser.print_help()
        sys.exit(0)

    if not os.path.exists(options.ref_gene_model):
        sys.stderr.write('\n\n' + options.ref_gene_model + " does NOT exists" + '\n' + '\n')
        parser.print_help()
        sys.exit(0)

    # The exon union is needed only with '-s', and is built only once the
    # gene model is known to exist; doing it earlier turned a missing "-r"
    # into a failure inside union_exons instead of a help message.
    exon_ranges = None
    if options.subtract_bg:
        exon_ranges = union_exons(options.ref_gene_model)

    printlog("Get BAM file(s) ...")
    bamfiles = sorted(getBamFiles.get_bam_files(options.input_files))

    if len(bamfiles) <= 0:
        sys.stderr.write("No BAM file found, exit." + "\n")
        sys.exit(0)
    else:
        sys.stderr.write("Total %d BAM file(s):" % len(bamfiles) + "\n")
        for f in bamfiles:
            sys.stderr.write("\t" + f + "\n")

    for f in bamfiles:
        printlog("Processing " + f)
        stem = os.path.basename(f).replace('bam', '')

        # Context managers and 'finally' make sure the two reports and the
        # BAM handle are closed on every exit path.
        with open(stem + 'summary.txt', 'w') as SUM:
            SUM.write("\t".join(['Bam_file', 'TIN(mean)', 'TIN(median)', 'TIN(stdev)']) + "\n")

            with open(stem + 'tin.xls', 'w') as OUT:
                OUT.write("\t".join(["geneID", "chrom", "tx_start", "tx_end", "TIN"]) + "\n")

                samfile = pysam.Samfile(f, "rb")
                sample_TINs = []  # sample level TIN, values are from different genes
                finish = 0
                try:
                    for gname, i_chr, i_tx_start, i_tx_end, intron_size, pick_positions in genomic_positions(refbed=options.ref_gene_model, sample_size=options.sample_size):
                        finish += 1

                        # check minimum reads coverage
                        if check_min_reads(samfile, i_chr, i_tx_start, i_tx_end, options.minimum_coverage) is not True:
                            OUT.write('\t'.join([str(i) for i in (gname, i_chr, i_tx_start, i_tx_end, 0.0)]) + "\n")
                            continue

                        # Start from zero background for every transcript so
                        # one without introns never reuses the level of the
                        # transcript processed before it.
                        noise_level = 0.0
                        if options.subtract_bg:
                            intron_signals = estimate_bg_noise(i_chr, i_tx_start, i_tx_end, samfile, exon_ranges)
                            if intron_size > 0:
                                noise_level = intron_signals / intron_size

                        coverage = genebody_coverage(samfile, i_chr, sorted(pick_positions), noise_level)

                        tin1 = tin_score(cvg=coverage, l=len(pick_positions))
                        sample_TINs.append(tin1)
                        OUT.write('\t'.join([str(i) for i in (gname, i_chr, i_tx_start, i_tx_end, tin1)]) + "\n")
                        # Progress counter; '\r' rewinds to the line start.
                        sys.stderr.write(" %d transcripts finished\r" % (finish))
                finally:
                    samfile.close()

            SUM.write("\t".join([str(i) for i in (os.path.basename(f), mean(sample_TINs), median(sample_TINs), std(sample_TINs))]) + "\n")
Esempio n. 6
0
    help='Output xls files reporting the rank of transcript TIN across samples.'
)
# Parse the command line; the option definitions precede this fragment.
(options, args) = parser.parse_args()

# Both the BAM input specification and the reference BED file are mandatory.
if not (options.input_files and options.ref_bed):
    parser.print_help()
    sys.exit(0)

# Stop early (after showing the help text) when the BED file is missing.
if not os.path.exists(options.ref_bed):
    print >> sys.stderr, '\n\n' + options.ref_bed + ' does NOT exists' + '\n'
    parser.print_help()
    sys.exit(0)

# Resolve the BAM files and size the per-sample / per-transcript arrays.
printlog('Get BAM file ...')
bamfiles = getBamFiles.get_bam_files(options.input_files)
# count_number returns a pair unpacked here as (transcripts, exons);
# exon_number is not used in the visible part of the script.
transcript_number, exon_number = count_number(options.ref_bed)
bamfile_number = len(bamfiles)

if bamfile_number <= 0:
    print >> sys.stderr, 'No BAM file found, exit.'
    sys.exit(0)
else:
    print >> sys.stderr, 'Total %d BAM file(s):' % len(bamfiles)

# Matrix of ks and tin of all transcripts in all samples.
# Both are zero-initialised with shape (bamfile_number, transcript_number, 3).
# NOTE(review): the meaning of the three slots on the last axis is not
# visible in this fragment - confirm where the arrays are filled.
ks_array = np.zeros([bamfile_number, transcript_number, 3])
tin_array = np.zeros([bamfile_number, transcript_number, 3])
# Matrix of expression level of all transcripts in all samples.
# level=0: no expression. level=1: low expression. level=2: high expression.
expression_level = np.zeros([bamfile_number, transcript_number])