Example #1
0
def CF_call(args):
    """Call duplications/deletions from a CoNIFER HDF5 file and write them out.

    Every root node of the input file whose title is not 'probes' or
    'samples' is processed as a chromosome.  Per chromosome the values are
    smoothed, and for each sample two boolean masks are built: values at or
    above ``args.threshold`` (dup) and values at or below ``-args.threshold``
    (del).  ``cf.getbkpoints`` turns each mask into (start, stop) index pairs.
    Each pair is then widened: the new start is the last index before
    ``start``, and the new stop the first index at or after ``stop``, whose
    value is under ``mean + 3 * sd`` (dup) or over ``-mean - 3 * sd`` (del).
    ``mean`` and ``sd`` are taken over axis 1 of the smoothed values.  Calls
    are merged with ``cf.mergeCalls`` and emitted as a tab-separated table to
    ``args.output`` and to stdout.

    Args:
        args: object exposing ``input`` (path of the HDF5 file to read),
            ``output`` (path of the call file to write) and ``threshold``
            (numeric cut-off applied to the smoothed values).

    Raises:
        SystemExit: always.  Status 1 when the input or output file cannot be
            opened, status 0 once all calls have been written.
    """
    h5file_in_fn = str(args.input)
    try:
        h5file_in = openFile(h5file_in_fn, mode='r')
    except IOError:
        # Two spaces after the colon reproduce the historical message layout.
        print('[ERROR] Cannot open CoNIFER input file for reading:  %s' %
              h5file_in_fn)
        # Non-zero status so shells and pipelines can detect the failure.
        sys.exit(1)

    callfile_fn = str(args.output)
    try:
        callfile_f = open(callfile_fn, mode='w')
    except IOError:
        print('[ERROR] Cannot open output file for writing:  %s' % callfile_fn)
        h5file_in.close()
        sys.exit(1)

    # The output handle is closed (and therefore flushed) on every path out of
    # this block, including exceptions raised while processing.
    with callfile_f:
        try:
            chrs_to_process = [
                node._v_title.replace("chr", "")
                for node in h5file_in.root
                if node._v_title not in ('probes', 'samples')
            ]
        finally:
            # Only the node titles are needed; the reader below reopens the
            # file by name.
            h5file_in.close()

        print('[INIT] Initializing caller at threshold = %f' % (args.threshold))

        r = cf.rpkm_reader(h5file_in_fn)

        all_calls = []

        for chrom in chrs_to_process:
            print('[RUNNING] Now processing chr%s' % chrom)
            data = r.getExonValuesByRegion(chrom)

            # The return value is not needed; data.rpkm is read afterwards.
            data.smooth()

            mean = np.mean(data.rpkm, axis=1)
            sd = np.std(data.rpkm, axis=1)
            # Bounds used when widening calls; invariant for the chromosome,
            # so computed once instead of once per breakpoint.
            upper = mean + 3 * sd
            lower = -1 * mean - 3 * sd

            for sample in r.getSampleList():
                sample_data = data.getSample([sample]).flatten()

                dup_mask = sample_data >= args.threshold
                del_mask = sample_data <= -1 * args.threshold

                # (start, stop) index pairs into this chromosome's arrays.
                dup_bkpoints = cf.getbkpoints(dup_mask)
                del_bkpoints = cf.getbkpoints(del_mask)

                dups = []
                for start, stop in dup_bkpoints:
                    # np.max/np.min raise ValueError on an empty selection,
                    # i.e. when no index on that side is under the bound.
                    try:
                        new_start = np.max(
                            np.where(sample_data[:start] < upper[:start]))
                    except ValueError:
                        new_start = 0
                    try:
                        new_stop = stop + np.min(
                            np.where(sample_data[stop:] < upper[stop:]))
                    except ValueError:
                        new_stop = data.shape[1] - 1
                    dups.append({
                        "sampleID": sample,
                        "chromosome": cf.chrInt2Str(chrom),
                        "start": data.exons[new_start]["start"],
                        "stop": data.exons[new_stop]["stop"],
                        "state": "dup"
                    })

                dels = []
                for start, stop in del_bkpoints:
                    try:
                        new_start = np.max(
                            np.where(sample_data[:start] > lower[:start]))
                    except ValueError:
                        new_start = 0
                    try:
                        new_stop = stop + np.min(
                            np.where(sample_data[stop:] > lower[stop:]))
                    except ValueError:
                        new_stop = data.shape[1] - 1
                    dels.append({
                        "sampleID": sample,
                        "chromosome": cf.chrInt2Str(chrom),
                        "start": data.exons[new_start]["start"],
                        "stop": data.exons[new_stop]["stop"],
                        "state": "del"
                    })

                # Widened calls may overlap each other; merge before storing.
                dels = cf.mergeCalls(dels)
                dups = cf.mergeCalls(dups)

                all_calls.extend(dels)
                all_calls.extend(dups)

        # Emit the calls: once to stdout, once to the output file.
        header = ['sampleID', 'chromosome', 'start', 'stop', 'state']

        callfile_f.write('\t'.join(header) + "\n")
        for call in all_calls:
            line = "%s\t%s\t%d\t%d\t%s" % (call["sampleID"],
                                           call["chromosome"], call["start"],
                                           call["stop"], call["state"])
            print(line)
            callfile_f.write(line + "\n")

    sys.exit(0)
Example #2
0
def CF_call(args):
	"""Call duplications/deletions from a CoNIFER HDF5 file and write them out.

	Every root node of the input file whose title is not 'probes' or
	'samples' is processed as a chromosome.  Per chromosome the values are
	smoothed, and for each sample two boolean masks are built: values at or
	above ``args.threshold`` (dup) and values at or below ``-args.threshold``
	(del).  ``cf.getbkpoints`` turns each mask into (start, stop) index pairs.
	Each pair is then widened: the new start is the last index before
	``start``, and the new stop the first index at or after ``stop``, whose
	value is under ``mean + 3 * sd`` (dup) or over ``-mean - 3 * sd`` (del).
	``mean`` and ``sd`` are taken over axis 1 of the smoothed values.  Calls
	are merged with ``cf.mergeCalls`` and emitted as a tab-separated table to
	``args.output`` and to stdout.

	Args:
		args: object exposing ``input`` (path of the HDF5 file to read),
			``output`` (path of the call file to write) and ``threshold``
			(numeric cut-off applied to the smoothed values).

	Raises:
		SystemExit: always.  Status 1 when the input or output file cannot be
			opened, status 0 once all calls have been written.
	"""
	h5file_in_fn = str(args.input)
	try:
		h5file_in = openFile(h5file_in_fn, mode='r')
	except IOError:
		# Two spaces after the colon reproduce the historical message layout.
		print('[ERROR] Cannot open CoNIFER input file for reading:  %s' % h5file_in_fn)
		# Non-zero status so shells and pipelines can detect the failure.
		sys.exit(1)

	callfile_fn = str(args.output)
	try:
		callfile_f = open(callfile_fn, mode='w')
	except IOError:
		print('[ERROR] Cannot open output file for writing:  %s' % callfile_fn)
		h5file_in.close()
		sys.exit(1)

	# The output handle is closed (and therefore flushed) on every path out of
	# this block, including exceptions raised while processing.
	with callfile_f:
		try:
			chrs_to_process = []
			for node in h5file_in.root:
				if node._v_title not in ('probes','samples'):
					chrs_to_process.append(node._v_title.replace("chr",""))
		finally:
			# Only the node titles are needed; the reader below reopens the
			# file by name.
			h5file_in.close()

		print('[INIT] Initializing caller at threshold = %f' % (args.threshold))

		r = cf.rpkm_reader(h5file_in_fn)

		all_calls = []

		for chrom in chrs_to_process:
			print('[RUNNING] Now processing chr%s' % chrom)
			data = r.getExonValuesByRegion(chrom)

			# The return value is not needed; data.rpkm is read afterwards.
			data.smooth()

			mean = np.mean(data.rpkm,axis=1)
			sd = np.std(data.rpkm,axis=1)
			# Bounds used when widening calls; invariant for the chromosome,
			# so computed once instead of once per breakpoint.
			upper = mean + 3*sd
			lower = -1*mean - 3*sd

			for sample in r.getSampleList():
				sample_data = data.getSample([sample]).flatten()

				dup_mask = sample_data >= args.threshold
				del_mask = sample_data <= -1*args.threshold

				# (start, stop) index pairs into this chromosome's arrays.
				dup_bkpoints = cf.getbkpoints(dup_mask)
				del_bkpoints = cf.getbkpoints(del_mask)

				dups = []
				for start,stop in dup_bkpoints:
					# np.max/np.min raise ValueError on an empty selection,
					# i.e. when no index on that side is under the bound.
					try:
						new_start = np.max(np.where(sample_data[:start] < upper[:start]))
					except ValueError:
						new_start = 0
					try:
						new_stop = stop + np.min(np.where(sample_data[stop:] < upper[stop:]))
					except ValueError:
						new_stop = data.shape[1]-1
					dups.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chrom), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "dup"})

				dels = []
				for start,stop in del_bkpoints:
					try:
						new_start = np.max(np.where(sample_data[:start] > lower[:start]))
					except ValueError:
						new_start = 0
					try:
						new_stop = stop + np.min(np.where(sample_data[stop:] > lower[stop:]))
					except ValueError:
						new_stop = data.shape[1]-1
					dels.append({"sampleID":sample,"chromosome": cf.chrInt2Str(chrom), "start":data.exons[new_start]["start"], "stop": data.exons[new_stop]["stop"], "state": "del"})

				# Widened calls may overlap each other; merge before storing.
				dels = cf.mergeCalls(dels)
				dups = cf.mergeCalls(dups)

				all_calls.extend(dels)
				all_calls.extend(dups)

		# Emit the calls: once to stdout, once to the output file.
		header = ['sampleID','chromosome','start','stop','state']

		callfile_f.write('\t'.join(header) + "\n")
		for call in all_calls:
			line = "%s\t%s\t%d\t%d\t%s" % (call["sampleID"], call["chromosome"], call["start"], call["stop"], call["state"])
			print(line)
			callfile_f.write(line + "\n")

	sys.exit(0)