Beispiel #1
0
def CF_plot(args):
	"""Plot SVD-ZRPKM values for one genomic region and save the figure.

	Attributes read from ``args``:
		region: location string, parsed by ``cf.parseLocString`` into
			chromosome, start and stop.
		input: path handed to ``cf.rpkm_reader``.
		sample: either the string ``'none'`` or an iterable of entries of
			the form ``"sampleID"`` or ``"color:sampleID"``.
		output: destination passed to ``plt.savefig``.

	Every sample in the region is drawn as a thin black line. Each requested
	sample is overlaid in its own colour, and the raw (unsmoothed) values of
	the first requested sample are added through ``cf.plotRawData``.

	This function terminates the process: exit status 1 when the plotting
	modules cannot be imported, exit status 0 once the figure is written.
	"""
	try:
		import matplotlib
		matplotlib.use('Agg')
		import matplotlib.pyplot as plt
		import pylab  # imported only to verify that it is installed
	except ImportError:
		print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
		# Non-zero status so that calling scripts can detect the failure.
		sys.exit(1)

	import itertools
	import locale
	try:
		locale.setlocale(locale.LC_ALL, '')
	except locale.Error:
		# The locale is only used for digit grouping in the title, so an
		# unsupported locale must not be reported as a missing module.
		pass

	chrom, start, stop = cf.parseLocString(args.region)

	r = cf.rpkm_reader(args.input)

	data = r.getExonValuesByRegion(chrom, start, stop)
	data.smooth()

	plt.gcf().clear()
	fig = plt.figure(figsize=(10, 5))
	ax = fig.add_subplot(111)

	# Background: all samples in thin black.
	ax.plot(data.rpkm, linewidth=0.3, c='k')

	if args.sample != 'none':
		# Cycle through the palette so that more than four samples without an
		# explicit colour do not exhaust it.
		default_colors = itertools.cycle(['r', 'b', 'g', 'y'])
		for idx, sample in enumerate(args.sample):
			try:
				color, sampleID = sample.split(":")
			except ValueError:
				# No (or more than one) ':' in the entry: treat the whole
				# string as the sample ID and pick the next default colour.
				color = next(default_colors)
				sampleID = sample

			ax.plot(data.getSample([sampleID]), linewidth=1, c=color, label=sampleID)

			# Raw data is only drawn for the first requested sample.
			if idx == 0:
				raw = r.getExonValuesByRegion(chrom, start, stop, sampleList=[sampleID])
				cf.plotRawData(ax, raw.getSample([sampleID]), color=color)
		plt.legend(prop={'size': 10}, frameon=False)

	cf.plotGenes(ax, data)
	cf.plotGenomicCoords(plt, data)
	plt.xlim(0, data.shape[1])
	plt.ylim(-3, 3)

	# locale.format_string exists on Python 2.7 and 3.x; locale.format was
	# removed in Python 3.12.
	plt.title("%s: %s - %s" % (
		cf.chrInt2Str(chrom),
		locale.format_string("%d", start, grouping=True),
		locale.format_string("%d", stop, grouping=True)))
	plt.xlabel("Position")
	plt.ylabel("SVD-ZRPKM Values")

	plt.savefig(args.output)

	sys.exit(0)
Beispiel #2
0
def CF_plot(args):
	"""Plot SVD-ZRPKM values for one genomic region and save the figure.

	Attributes read from ``args``:
		region: location string, parsed by ``cf.parseLocString`` into
			chromosome, start and stop.
		input: path handed to ``cf.rpkm_reader``.
		sample: either the string ``'none'`` or an iterable of entries of
			the form ``"sampleID"`` or ``"color:sampleID"``.
		output: destination passed to ``plt.savefig``.

	Every sample in the region is drawn as a thin black line. Each requested
	sample is overlaid in its own colour, and the raw (unsmoothed) values of
	the first requested sample are added through ``cf.plotRawData``.

	This function terminates the process: exit status 1 when the plotting
	modules cannot be imported, exit status 0 once the figure is written.
	"""
	try:
		import matplotlib
		matplotlib.use('Agg')
		import matplotlib.pyplot as plt
		import pylab  # imported only to verify that it is installed
	except ImportError:
		print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
		# Non-zero status so that calling scripts can detect the failure.
		sys.exit(1)

	import itertools
	import locale
	try:
		locale.setlocale(locale.LC_ALL, '')
	except locale.Error:
		# The locale is only used for digit grouping in the title, so an
		# unsupported locale must not be reported as a missing module.
		pass

	chrom, start, stop = cf.parseLocString(args.region)

	r = cf.rpkm_reader(args.input)

	data = r.getExonValuesByRegion(chrom, start, stop)
	data.smooth()

	plt.gcf().clear()
	fig = plt.figure(figsize=(10, 5))
	ax = fig.add_subplot(111)

	# Background: all samples in thin black.
	ax.plot(data.rpkm, linewidth=0.3, c='k')

	if args.sample != 'none':
		# Cycle through the palette so that more than four samples without an
		# explicit colour do not exhaust it.
		default_colors = itertools.cycle(['r', 'b', 'g', 'y'])
		for idx, sample in enumerate(args.sample):
			try:
				color, sampleID = sample.split(":")
			except ValueError:
				# No (or more than one) ':' in the entry: treat the whole
				# string as the sample ID and pick the next default colour.
				color = next(default_colors)
				sampleID = sample

			ax.plot(data.getSample([sampleID]), linewidth=1, c=color, label=sampleID)

			# Raw data is only drawn for the first requested sample.
			if idx == 0:
				raw = r.getExonValuesByRegion(chrom, start, stop, sampleList=[sampleID])
				cf.plotRawData(ax, raw.getSample([sampleID]), color=color)
		plt.legend(prop={'size': 10}, frameon=False)

	cf.plotGenes(ax, data)
	cf.plotGenomicCoords(plt, data)
	plt.xlim(0, data.shape[1])
	plt.ylim(-3, 3)

	# locale.format_string exists on Python 2.7 and 3.x; locale.format was
	# removed in Python 3.12.
	plt.title("%s: %s - %s" % (
		cf.chrInt2Str(chrom),
		locale.format_string("%d", start, grouping=True),
		locale.format_string("%d", stop, grouping=True)))
	plt.xlabel("Position")
	plt.ylabel("SVD-ZRPKM Values")

	plt.savefig(args.output)

	sys.exit(0)
Beispiel #3
0
def CF_plotcalls(args):
    """Write one PNG plot per call listed in a call file.

    Attributes read from ``args``:
        calls: path of a whitespace-separated call file whose first line is
            a header and whose remaining lines hold
            ``sampleID chromosome start stop state``.
        input: path handed to ``cf.rpkm_reader``.
        window: padding subtracted from the first and added to the last
            exon ID returned by ``r.getExonIDs`` for the call.
        outputdir: directory receiving ``<chr>_<start>_<stop>_<sampleID>.png``.

    Exits the process with status 1 when the plotting modules cannot be
    imported or the call file cannot be opened; otherwise returns ``None``
    after all plots are written.
    """
    try:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        import pylab  # imported only to verify that it is installed
        from matplotlib.lines import Line2D
    except ImportError:
        print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    import locale
    import os
    try:
        locale.setlocale(locale.LC_ALL, 'en_US')
    except locale.Error:
        # 'en_US' is not available everywhere; fall back to the user default.
        locale.setlocale(locale.LC_ALL, '')

    callfile_fn = str(args.calls)
    try:
        callfile_f = open(callfile_fn, mode='r')
    except IOError:
        # The extra ' ' reproduces the separator that the original
        # two-argument print statement emitted.
        print('[ERROR] Cannot open call file for reading: ' + ' ' + callfile_fn)
        sys.exit(1)

    all_calls = []
    with callfile_f:
        callfile_f.readline()  # skip the header row
        for line in callfile_f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            sampleID, chrom, start, stop, _state = fields
            all_calls.append({
                "chromosome": int(cf.chrStr2Int(chrom)),
                "start": int(start),
                "stop": int(stop),
                "sampleID": sampleID
            })

    r = cf.rpkm_reader(args.input)

    for call in all_calls:
        chrom = call["chromosome"]
        start = call["start"]
        stop = call["stop"]
        sampleID = call["sampleID"]

        exons = r.getExonIDs(chrom, start, stop)

        window_start = max(exons[0] - args.window, 0)
        window_stop = exons[-1] + args.window

        data = r.getExonValuesByExons(chrom, window_start, window_stop)
        data.smooth()

        # A fresh figure per call; it is closed again after saving so that
        # figures do not accumulate across the loop.
        fig = plt.figure(figsize=(10, 5))
        ax = fig.add_subplot(111)

        # Background: all samples in thin black, the called sample in red.
        ax.plot(data.rpkm, linewidth=0.3, c='k')
        ax.plot(data.getSample([sampleID]), linewidth=1, c='r', label=sampleID)

        raw = r.getExonValuesByExons(chrom, window_start, window_stop,
                                     sampleList=[sampleID])
        cf.plotRawData(ax, raw.getSample([sampleID]), color='r')

        plt.legend(prop={'size': 10}, frameon=False)

        cf.plotGenes(ax, data)
        cf.plotGenomicCoords(plt, data)

        # Mark the call itself with a thick bar at y = 2. The lookups raise
        # IndexError if the call coordinates do not match an exon start/stop
        # inside the window.
        exon_start = np.where(data.exons["start"] == start)[0][0]
        exon_stop = np.where(data.exons["stop"] == stop)[0][0]
        ax.add_line(
            Line2D([exon_start, exon_stop], [2, 2],
                   color='k',
                   lw=6,
                   linestyle='-',
                   alpha=1,
                   solid_capstyle='butt'))

        plt.xlim(0, data.shape[1])
        plt.ylim(-3, 3)

        chrom_name = cf.chrInt2Str(chrom)
        # locale.format_string exists on Python 2.7 and 3.x; locale.format
        # was removed in Python 3.12.
        plt.title("%s: %s - %s" %
                  (chrom_name,
                   locale.format_string("%d", start, grouping=True),
                   locale.format_string("%d", stop, grouping=True)))
        plt.xlabel("Position")
        plt.ylabel("SVD-ZRPKM Values")
        outfile = "%s_%d_%d_%s.png" % (chrom_name, start, stop, sampleID)
        plt.savefig(os.path.join(args.outputdir, outfile))
        plt.close(fig)
Beispiel #4
0
def CF_call(args):
    """Call duplications and deletions and write them to a tab-separated file.

    Attributes read from ``args``:
        input: path of the CoNIFER HDF5 file (opened once with ``openFile``
            to list the chromosome nodes, then with ``cf.rpkm_reader``).
        output: path of the call file to write.
        threshold: a sample value ``>= threshold`` seeds a duplication and a
            value ``<= -threshold`` seeds a deletion.

    For every chromosome node (all nodes except 'probes' and 'samples') and
    every sample, the seed segments returned by ``cf.getbkpoints`` are
    extended outwards: to the left up to the last exon, and to the right up
    to the first exon, whose smoothed value is back inside the per-exon
    bound (``mean + 3*sd`` for duplications, ``-mean - 3*sd`` for
    deletions). Calls are merged with ``cf.mergeCalls``, echoed to stdout
    and written to the output file.

    This function terminates the process: exit status 1 if the input or
    output file cannot be opened, exit status 0 on success.
    """
    h5file_in_fn = str(args.input)
    try:
        h5file_in = openFile(h5file_in_fn, mode='r')
    except IOError:
        # The extra ' ' reproduces the separator that the original
        # two-argument print statement emitted.
        print('[ERROR] Cannot open CoNIFER input file for reading: ' + ' ' + h5file_in_fn)
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    # Release the HDF5 handle even if listing the nodes fails.
    try:
        chrs_to_process = [
            node._v_title.replace("chr", "")
            for node in h5file_in.root
            if node._v_title not in ('probes', 'samples')
        ]
    finally:
        h5file_in.close()

    # Open the output before the long computation so that an unwritable
    # path is reported immediately.
    callfile_fn = str(args.output)
    try:
        callfile_f = open(callfile_fn, mode='w')
    except IOError:
        print('[ERROR] Cannot open output file for writing: ' + ' ' + callfile_fn)
        sys.exit(1)

    print('[INIT] Initializing caller at threshold = %f' % (args.threshold))

    r = cf.rpkm_reader(h5file_in_fn)

    all_calls = []

    for chrom in chrs_to_process:
        print('[RUNNING] Now processing chr%s' % chrom)
        data = r.getExonValuesByRegion(chrom)
        data.smooth()

        mean = np.mean(data.rpkm, axis=1)
        sd = np.std(data.rpkm, axis=1)
        # Per-exon bounds, computed once per chromosome instead of once per
        # breakpoint.
        upper = mean + 3 * sd
        lower = -1 * mean - 3 * sd
        last_exon = data.shape[1] - 1

        for sample in r.getSampleList():
            sample_data = data.getSample([sample]).flatten()

            # (state, seed mask, "back inside the bound" mask). Deletions
            # come first so that the output order matches the original
            # (dels, then dups, per sample).
            passes = (
                ("del", sample_data <= -1 * args.threshold, sample_data > lower),
                ("dup", sample_data >= args.threshold, sample_data < upper),
            )
            for state, seed_mask, inside in passes:
                calls = []
                # getbkpoints yields (start, stop) exon indices on this
                # chromosome.
                for start, stop in cf.getbkpoints(seed_mask):
                    left = np.where(inside[:start])[0]
                    right = np.where(inside[stop:])[0]
                    # No in-bound exon on a side: extend to the chromosome end.
                    new_start = left.max() if left.size else 0
                    new_stop = (stop + right.min()) if right.size else last_exon
                    calls.append({
                        "sampleID": sample,
                        "chromosome": cf.chrInt2Str(chrom),
                        "start": data.exons[new_start]["start"],
                        "stop": data.exons[new_stop]["stop"],
                        "state": state
                    })
                # mergeCalls merges overlapping calls.
                all_calls.extend(list(cf.mergeCalls(calls)))

    header = ['sampleID', 'chromosome', 'start', 'stop', 'state']

    # The with-block guarantees the output is flushed and closed before exit.
    with callfile_f:
        callfile_f.write('\t'.join(header) + "\n")
        for call in all_calls:
            row = "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"],
                                          call["start"], call["stop"],
                                          call["state"])
            print(row)
            callfile_f.write(row + "\n")

    sys.exit(0)
Beispiel #5
0
def CF_plotcalls(args):
	"""Write one PNG plot per call listed in a call file.

	Attributes read from ``args``:
		calls: path of a whitespace-separated call file whose first line is
			a header and whose remaining lines hold
			``sampleID chromosome start stop state``.
		input: path handed to ``cf.rpkm_reader``.
		window: padding subtracted from the first and added to the last
			exon ID returned by ``r.getExonIDs`` for the call.
		outputdir: directory receiving ``<chr>_<start>_<stop>_<sampleID>.png``.

	Exits the process with status 1 when the plotting modules cannot be
	imported or the call file cannot be opened; otherwise returns ``None``
	after all plots are written.
	"""
	try:
		import matplotlib
		matplotlib.use('Agg')
		import matplotlib.pyplot as plt
		import pylab  # imported only to verify that it is installed
		from matplotlib.lines import Line2D
	except ImportError:
		print("[ERROR] One or more of the required modules for plotting cannot be loaded! Are matplotlib and pylab installed?")
		# Non-zero status so that calling scripts can detect the failure.
		sys.exit(1)

	import locale
	import os
	try:
		locale.setlocale(locale.LC_ALL, 'en_US')
	except locale.Error:
		# 'en_US' is not available everywhere; fall back to the user default.
		locale.setlocale(locale.LC_ALL, '')

	callfile_fn = str(args.calls)
	try:
		callfile_f = open(callfile_fn, mode='r')
	except IOError:
		# The extra ' ' reproduces the separator that the original
		# two-argument print statement emitted.
		print('[ERROR] Cannot open call file for reading: ' + ' ' + callfile_fn)
		sys.exit(1)

	all_calls = []
	with callfile_f:
		callfile_f.readline()  # skip the header row
		for line in callfile_f:
			fields = line.split()
			if not fields:
				continue  # tolerate blank lines (e.g. a trailing newline)
			sampleID, chrom, start, stop, _state = fields
			all_calls.append({"chromosome": int(cf.chrStr2Int(chrom)), "start": int(start), "stop": int(stop), "sampleID": sampleID})

	r = cf.rpkm_reader(args.input)

	for call in all_calls:
		chrom = call["chromosome"]
		start = call["start"]
		stop = call["stop"]
		sampleID = call["sampleID"]

		exons = r.getExonIDs(chrom, start, stop)

		window_start = max(exons[0] - args.window, 0)
		window_stop = exons[-1] + args.window

		data = r.getExonValuesByExons(chrom, window_start, window_stop)
		data.smooth()

		# A fresh figure per call; it is closed again after saving so that
		# figures do not accumulate across the loop.
		fig = plt.figure(figsize=(10, 5))
		ax = fig.add_subplot(111)

		# Background: all samples in thin black, the called sample in red.
		ax.plot(data.rpkm, linewidth=0.3, c='k')
		ax.plot(data.getSample([sampleID]), linewidth=1, c='r', label=sampleID)

		raw = r.getExonValuesByExons(chrom, window_start, window_stop, sampleList=[sampleID])
		cf.plotRawData(ax, raw.getSample([sampleID]), color='r')

		plt.legend(prop={'size': 10}, frameon=False)

		cf.plotGenes(ax, data)
		cf.plotGenomicCoords(plt, data)

		# Mark the call itself with a thick bar at y = 2. The lookups raise
		# IndexError if the call coordinates do not match an exon start/stop
		# inside the window.
		exon_start = np.where(data.exons["start"] == start)[0][0]
		exon_stop = np.where(data.exons["stop"] == stop)[0][0]
		ax.add_line(Line2D([exon_start, exon_stop], [2, 2], color='k', lw=6, linestyle='-', alpha=1, solid_capstyle='butt'))

		plt.xlim(0, data.shape[1])
		plt.ylim(-3, 3)

		chrom_name = cf.chrInt2Str(chrom)
		# locale.format_string exists on Python 2.7 and 3.x; locale.format
		# was removed in Python 3.12.
		plt.title("%s: %s - %s" % (
			chrom_name,
			locale.format_string("%d", start, grouping=True),
			locale.format_string("%d", stop, grouping=True)))
		plt.xlabel("Position")
		plt.ylabel("SVD-ZRPKM Values")
		outfile = "%s_%d_%d_%s.png" % (chrom_name, start, stop, sampleID)
		plt.savefig(os.path.join(args.outputdir, outfile))
		plt.close(fig)
Beispiel #6
0
def CF_call(args):
	"""Call duplications and deletions and write them to a tab-separated file.

	Attributes read from ``args``:
		input: path of the CoNIFER HDF5 file (opened once with ``openFile``
			to list the chromosome nodes, then with ``cf.rpkm_reader``).
		output: path of the call file to write.
		threshold: a sample value ``>= threshold`` seeds a duplication and a
			value ``<= -threshold`` seeds a deletion.

	For every chromosome node (all nodes except 'probes' and 'samples') and
	every sample, the seed segments returned by ``cf.getbkpoints`` are
	extended outwards: to the left up to the last exon, and to the right up
	to the first exon, whose smoothed value is back inside the per-exon
	bound (``mean + 3*sd`` for duplications, ``-mean - 3*sd`` for
	deletions). Calls are merged with ``cf.mergeCalls``, echoed to stdout
	and written to the output file.

	This function terminates the process: exit status 1 if the input or
	output file cannot be opened, exit status 0 on success.
	"""
	h5file_in_fn = str(args.input)
	try:
		h5file_in = openFile(h5file_in_fn, mode='r')
	except IOError:
		# The extra ' ' reproduces the separator that the original
		# two-argument print statement emitted.
		print('[ERROR] Cannot open CoNIFER input file for reading: ' + ' ' + h5file_in_fn)
		# Non-zero status so that calling scripts can detect the failure.
		sys.exit(1)

	# Release the HDF5 handle even if listing the nodes fails.
	try:
		chrs_to_process = [
			node._v_title.replace("chr", "")
			for node in h5file_in.root
			if node._v_title not in ('probes', 'samples')
		]
	finally:
		h5file_in.close()

	# Open the output before the long computation so that an unwritable
	# path is reported immediately.
	callfile_fn = str(args.output)
	try:
		callfile_f = open(callfile_fn, mode='w')
	except IOError:
		print('[ERROR] Cannot open output file for writing: ' + ' ' + callfile_fn)
		sys.exit(1)

	print('[INIT] Initializing caller at threshold = %f' % (args.threshold))

	r = cf.rpkm_reader(h5file_in_fn)

	all_calls = []

	for chrom in chrs_to_process:
		print('[RUNNING] Now processing chr%s' % chrom)
		data = r.getExonValuesByRegion(chrom)
		data.smooth()

		mean = np.mean(data.rpkm, axis=1)
		sd = np.std(data.rpkm, axis=1)
		# Per-exon bounds, computed once per chromosome instead of once per
		# breakpoint.
		upper = mean + 3 * sd
		lower = -1 * mean - 3 * sd
		last_exon = data.shape[1] - 1

		for sample in r.getSampleList():
			sample_data = data.getSample([sample]).flatten()

			# (state, seed mask, "back inside the bound" mask). Deletions
			# come first so that the output order matches the original
			# (dels, then dups, per sample).
			passes = (
				("del", sample_data <= -1 * args.threshold, sample_data > lower),
				("dup", sample_data >= args.threshold, sample_data < upper),
			)
			for state, seed_mask, inside in passes:
				calls = []
				# getbkpoints yields (start, stop) exon indices on this
				# chromosome.
				for start, stop in cf.getbkpoints(seed_mask):
					left = np.where(inside[:start])[0]
					right = np.where(inside[stop:])[0]
					# No in-bound exon on a side: extend to the chromosome end.
					new_start = left.max() if left.size else 0
					new_stop = (stop + right.min()) if right.size else last_exon
					calls.append({"sampleID": sample, "chromosome": cf.chrInt2Str(chrom), "start": data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": state})
				# mergeCalls merges overlapping calls.
				all_calls.extend(list(cf.mergeCalls(calls)))

	header = ['sampleID', 'chromosome', 'start', 'stop', 'state']

	# The with-block guarantees the output is flushed and closed before exit.
	with callfile_f:
		callfile_f.write('\t'.join(header) + "\n")
		for call in all_calls:
			row = "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
			print(row)
			callfile_f.write(row + "\n")

	sys.exit(0)