def do_call(args, fsa_list, params, dbh):
    cerr('I: Calling non-ladder peaks...')
    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: calling FSA %s' % fsa.filename)
        fsa.call(params)

def do_merge(args, fsa_list, params):
    cerr('I: merging smeared peaks...')
    for (fsa, fsa_index) in fsa_list:
        cverr(4, 'D: fsa_index: %s' % str(fsa_index))
        cverr(3, 'D: calling merge for FSA %s' % fsa.filename)
        fsa.merge(params, args.plot_merged_peaks)

def scan(self, params, offset=0):
    if self.is_ladder():
        alleles = scan_peaks(self, params.ladder)
    else:
        # non-ladder channels are scanned with the nonladder parameter set
        alleles = scan_peaks(self, params.nonladder, offset)
    cverr(1, "# scanning %s: %d peak(s)" % (self.marker, len(alleles)))

    return alleles

def generate_cluster(T, k):
    grouping = fcluster(T.z, k, criterion='maxclust')
    groups = defaultdict(list)
    for i, e in enumerate(grouping):
        groups[e].append(T.p[i][0])
    clusters = sorted(list(groups.values()), key=lambda x: x[0])
    cverr(3, str(clusters))
    return clusters

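# The sketch below is a minimal, self-contained illustration of the clustering
# step above, assuming generate_cluster's T argument is a simple holder with a
# linkage matrix (T.z) and the original points (T.p). LadderTree and the demo
# data are hypothetical; only linkage/fcluster come from scipy.
from collections import defaultdict, namedtuple
from scipy.cluster.hierarchy import linkage, fcluster

LadderTree = namedtuple('LadderTree', ['z', 'p'])

def demo_generate_cluster():
    # cluster 1-D retention times into k=3 groups
    points = [(t, 0) for t in [100, 110, 120, 500, 510, 900, 905, 910]]
    T = LadderTree(z=linkage(points, 'average'), p=points)
    grouping = fcluster(T.z, 3, criterion='maxclust')
    groups = defaultdict(list)
    for i, e in enumerate(grouping):
        groups[e].append(T.p[i][0])
    # sort clusters by their smallest member, as generate_cluster does
    return sorted(groups.values(), key=lambda x: x[0])

# demo_generate_cluster() -> [[100, 110, 120], [500, 510], [900, 905, 910]]
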
def do_normalize(args, fsa_list, params):
    cerr('I: Normalizing all peaks...')

    # use panel method to set scale factors for all FSA
    from fatools.lib.fileio.models import Panel
    panel = Panel.get_panel(args.panel)
    ladder_means = panel.get_ladder_area_means(fsa_list)

    # normalize areas for each FSA
    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: calling normalize for %s' % fsa.filename)
        fsa.normalize(params, ladder_means)

def do_align(args, fsa_list, _params, f_bad_files, dbh):
    """ Takes an input list of FSA instances, calls FSA.align for each FSA
        in the list, and returns the list of good FSAs. Files whose ladder
        could not be matched are logged to f_bad_files. """
    cerr('I: Aligning size standards...')
    good_fsa = []
    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: aligning FSA %s' % fsa.filename)
        try:
            fsa.align(_params)
            good_fsa.append((fsa, fsa_index))
        except LadderMismatchException:
            f_bad_files.write("LadderMismatch: %s\n" % fsa.filename)
    return good_fsa

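# A usage sketch for do_align (the log filename is hypothetical): the caller
# opens the bad-ladder log, and only FSAs whose size standard aligned cleanly
# are kept for the downstream commands.
#
#   with open('bad_ladder_files.txt', 'w') as f_bad_files:
#       fsa_list = do_align(args, fsa_list, _params, f_bad_files, dbh)
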
def find_raw_peaks(data, params, offset, expected_peak_number=0):
    """
    params.min_dist
    params.norm_thres
    params.min_rfu
    params.max_peak_number
    """
    #print("expected:", expected_peak_number)
    # cut and pad data to overcome peaks at the end of array
    obs_data = np.append(data[offset:], [0, 0, 0])

    if False:  # disabled experiment: lower the threshold adaptively until
               # roughly expected_peak_number peaks are found
        min_dist = params.min_dist
        indices = []
        norm_threshold = params.norm_thres
        expected_peak_number = expected_peak_number * 1.8
        while len(indices) <= expected_peak_number and norm_threshold > 1e-7:
            indices = indexes(obs_data, norm_threshold, min_dist)
            print(len(indices), norm_threshold)
            norm_threshold *= 0.5
    elif False:  # disabled experiment: single pass with the configured threshold
        indices = indexes(obs_data, params.norm_thres, params.min_dist)

    indices = indexes(obs_data, 1e-7, params.min_dist)
    cverr(5, '## indices: %s' % str(indices))
    cverr(3, '## raw indices: %d' % len(indices))

    if len(indices) == 0:
        return []

    # normalize indices
    if offset > 0:
        indices = indices + offset

    # filter peaks by minimum rfu, and by maximum peak number after sorting by rfu
    peaks = [Peak(int(i), int(data[i])) for i in indices
             if (data[i] >= params.min_rfu
                 and params.min_rtime < i < params.max_rtime)]
    #peaks = sorted( peaks, key = lambda x: x.rfu )[:params.max_peak_number * 2]
    #import pprint; pprint.pprint(peaks)
    #print('======')

    if expected_peak_number:
        peaks.sort(key=lambda x: x.rfu, reverse=True)
        peaks = peaks[:round(expected_peak_number * 2)]
        peaks.sort(key=lambda x: x.rtime)

    cverr(3, '## peak above min rfu: %d' % len(peaks))

    return peaks

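# A minimal sketch of the peak-index detection used above, assuming `indexes`
# is peakutils.indexes (as imported elsewhere in this module for the 'pd'
# method). The synthetic trace is hypothetical; it only illustrates how the
# threshold and min_dist arguments interact.
import numpy as np
from peakutils import indexes

def demo_indexes():
    x = np.arange(1000)
    # three gaussian peaks of different heights on a flat baseline
    trace = (200 * np.exp(-0.5 * ((x - 150) / 5) ** 2)
             + 50 * np.exp(-0.5 * ((x - 500) / 5) ** 2)
             + 400 * np.exp(-0.5 * ((x - 800) / 5) ** 2))
    # thres is relative to the data range: 1e-7 accepts nearly any local
    # maximum, so the rfu/rtime filtering afterwards does the real work
    return indexes(trace, thres=1e-7, min_dist=25)

# demo_indexes() -> array([150, 500, 800])
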
def find_peaks(raw_data, params, raw_peaks=None):
    """
    find all peaks based on the criteria defined in params, and assign as
    peak-scanned

    raw_data is the baseline-normalized & smoothed trace

    parameters used are:
        method: 'cwt', 'relmax', 'mlpy' or 'pd'
        widths: window size for peak scanning
        cwt_min_snr:
        min_height:
        min_relative_ratio:
        max_relative_ratio:
        min_height_ratio:
        max_peak_number:
    """
    if raw_peaks is None:
        raw_peaks = find_raw_peaks(raw_data, params)

    # check for any peaks
    if not raw_peaks:
        return raw_peaks

    # only retain 2 * max_peak_number and discard the rest
    raw_peaks = sorted(raw_peaks, key=lambda x: x[1],
                       reverse=True)[:params.max_peak_number * 2]

    if params.min_relative_ratio > 0 or params.max_relative_ratio > 0:
        med = np.median(list(p[1] for p in raw_peaks))
        if params.min_relative_ratio > 0:
            median_min = med * params.min_relative_ratio
            raw_peaks = [p for p in raw_peaks if p[1] > median_min]
        if params.max_relative_ratio > 0:
            median_max = med * params.max_relative_ratio
            raw_peaks = [p for p in raw_peaks if p[1] < median_max]

    if not raw_peaks:
        return raw_peaks

    # filter for minimum height ratio
    if params.min_height_ratio > 0:
        min_height = max(list(p[1] for p in raw_peaks)) * params.min_height_ratio
        raw_peaks = [p for p in raw_peaks if p[1] > min_height]

    # calculate area
    (q50, q75) = np.percentile(raw_data, [50, 75])
    peaks = []
    for (peak, height) in raw_peaks:
        area, brtime, ertime, srtime, ls, rs = calculate_area(
            raw_data, peak, 5e-2, q50)
        wrtime = ertime - brtime
        if wrtime < 3:
            continue
        beta = area / height
        theta = height / wrtime
        if height >= 25 and beta * theta < 6:   # 10:
            continue
        if height < 25 and beta * theta < 3:    # 6:
            continue
        peaks.append((peak, height, area, brtime, ertime, srtime, beta, theta))
    peaks.sort()

    cverr(3, 'peaks stage 1 size: %d' % len(peaks))
    cverr(3, 'peaks stage 1: %s' % repr(peaks))

    non_artifact_peaks = []
    for idx in range(len(peaks)):
        peak = peaks[idx]
        if idx > 0:
            prev_p = peaks[idx - 1]
            if peak[3] - prev_p[4] < 5 and peak[1] < params.artifact_ratio * prev_p[1]:
                # we are artifact, just skip
                continue
        if idx < len(peaks) - 1:
            next_p = peaks[idx + 1]
            if next_p[3] - peak[4] < 5 and peak[1] < params.artifact_ratio * next_p[1]:
                # we are another artifact, just skip
                continue
        non_artifact_peaks.append(peak)

    cverr(3, 'max_peak_number: %d' % params.max_peak_number)

    sorted_peaks = sorted(non_artifact_peaks,
                          key=lambda x: (x[1], x[6] * x[7]),
                          reverse=True)[:params.max_peak_number]
    peaks = sorted(sorted_peaks)

    cverr(3, 'peaks stage 3 size: %d' % len(peaks))
    cverr(3, 'peaks stage 3: %s' % repr(peaks))

    return peaks

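# A short worked example of the peak-quality gate above. beta (area/height)
# approximates peak width in scan points and theta (height/width) approximates
# sharpness; their product beta*theta reduces to area/width, roughly the
# peak's mean height, so the gate mostly rejects low, smeared bumps. The
# numbers here are illustrative only.
def demo_quality_gate():
    height, area, brtime, ertime = 80, 480, 1000, 1010
    wrtime = ertime - brtime          # 10
    beta = area / height              # 6.0 (~width in scan points)
    theta = height / wrtime           # 8.0 (~sharpness)
    # same thresholds as find_peaks: peaks of >= 25 rfu need beta*theta >= 6
    return height >= 25 and beta * theta >= 6   # True: kept
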
def find_raw_peaks(raw_data, params):

    max_height = max(raw_data)
    width_ratio = max(1, round(math.log(max_height / params.width_ratio)))
    widths = params.widths

    if params.method == 'cwt':
        from scipy.signal import find_peaks_cwt
        indices = find_peaks_cwt(raw_data, widths, min_snr=params.min_snr)
        #cerr('find_peaks_cwt() found %d peaks' % len(indices))
        #pprint.pprint(indices)

    elif params.method == 'relmax':
        indice_set = []
        from scipy.signal import argrelmax
        for i in (params.widths * width_ratio):
            indice_set.append(argrelmax(raw_data, order=i + 5)[0])
        # get consensus
        indices = filter_by_snr(get_consensus_indices(indice_set),
                                raw_data, params.min_snr * 3.5)
        #print('indices => %d' % len(indices))
        #pprint.pprint( indices )

    elif params.method == 'mlpy':
        indice_set = []
        from mlpy import findpeaks_win
        for i in params.widths:
            indice_set.append(findpeaks_win(raw_data, span=i))
        # get consensus
        indices = filter_by_snr(get_consensus_indices(indice_set),
                                raw_data, params.min_snr)

    elif params.method == 'pd':
        from peakutils import indexes
        indices = indexes(raw_data, 1e-5, 10)
        #pprint.pprint(indices)
        cverr(3, 'indice size: %d' % len(indices))
        cverr(3, 'indices => %s' % repr(indices))

    else:
        raise RuntimeError('unknown peak finding method: %s' % params.method)

    if indices is None or len(indices) == 0:
        return []

    # filter for absolute heights within proximity

    # special case for pd (peak detect) method:
    if params.method == 'pd':
        return [(int(i), int(raw_data[i])) for i in indices
                if raw_data[i] > params.min_height
                and params.min_rtime < i < params.max_rtime]

    raw_peaks = []
    max_len = len(raw_data)
    for idx in indices:
        if not (params.min_rtime < idx < params.max_rtime):
            continue
        height, index = max([(raw_data[i], i)
                             for i in range(max(0, idx - 3), min(max_len, idx + 3))])
        if height < params.min_height:
            continue
        if (index, height) in raw_peaks:
            continue
        raw_peaks.append((index, height))

    return raw_peaks

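# The consensus step above relies on helpers not shown in this section. The
# sketch below is a hypothetical reading of get_consensus_indices, assuming it
# keeps a position when indices within `tolerance` points of it are reported
# by at least half of the per-width detector runs; the real helper may differ.
import numpy as np

def get_consensus_indices_sketch(indice_set, tolerance=3):
    flat = np.sort(np.concatenate(indice_set))
    if len(flat) == 0:
        return []
    needed = max(1, len(indice_set) // 2)
    consensus, cluster = [], [flat[0]]
    for idx in flat[1:]:
        if idx - cluster[-1] <= tolerance:
            cluster.append(idx)
        else:
            if len(cluster) >= needed:
                consensus.append(int(np.median(cluster)))
            cluster = [idx]
    if len(cluster) >= needed:
        consensus.append(int(np.median(cluster)))
    return consensus
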
def scan_peaks(channel, params, peakdb):
    """
    scan for peaks based on the criteria defined in params, set as
    peak-scanned, and prepare the channel data structure
    """
    if peakdb:
        raw_peaks = pickle.loads(peakdb.Get(channel.tag().encode()))
    else:
        raw_peaks = None

    initial_peaks = find_peaks(channel.data, params, raw_peaks)
    # peaks = ( rtime, height, area, brtime, ertime )
    #cerr('DEBUG - initial peaks: %d' % len(initial_peaks))
    cverr(3, 'initial peaks: %d' % len(initial_peaks))

    # perform further cleaning for ladder channels
    if params.expected_peak_number:

        epn = params.expected_peak_number
        peak_qualities = sorted([(p[6] * p[7], p) for p in initial_peaks],
                                reverse=True)
        low_scores = [q[0] for q in peak_qualities[round(epn / 3):round(epn * 1.5)]]
        avg_low_score = sum(low_scores) / len(low_scores)
        ratio_low_score = (avg_low_score - low_scores[-1]) / low_scores[-1]

        if avg_low_score < 75:
            # questionable quality, please use more peaks
            score_threshold = 4  # avg_low_score * 0.1
            height_threshold = 6
        else:
            if avg_low_score - low_scores[-1] > low_scores[-1]:
                # peaks' heights are likely not evenly distributed
                score_threshold = max(low_scores[-1] * 0.90, 4)
            else:
                score_threshold = avg_low_score * 0.25
            height_threshold = 10

        cverr(3, 'using score threshold: %f' % score_threshold)
        cverr(3, 'using height threshold: %d' % height_threshold)

        peaks = [q for q in peak_qualities
                 if q[0] > score_threshold and q[1][1] > height_threshold]
        cverr(3, 'after peak quality filtering: %d' % len(peaks))

        if len(peaks) > 1.5 * params.expected_peak_number:
            # try to remove peaks further
            saved_peaks = peaks
            while (len(peaks) - len(saved_peaks) < 0.30 * len(peaks)
                    and height_threshold < 20):
                height_threshold += 1
                saved_peaks = [q for q in saved_peaks if q[0] > height_threshold]
            peaks = saved_peaks
            cverr(3, 'after reducing peak number by height: %d' % len(peaks))

        peaks = sorted([q[1] for q in peaks])

    else:
        peaks = initial_peaks

    # create alleles based on these peaks
    alleles = []
    for peak in peaks:
        (rtime, height, area, brtime, ertime, srtime, beta, theta) = peak
        wrtime = ertime - brtime
        height = round(height)
        allele = channel.new_allele(rtime=rtime, height=height, area=area,
                                    brtime=brtime, ertime=ertime,
                                    wrtime=wrtime, srtime=srtime,
                                    beta=beta, theta=theta)
        allele.type = peaktype.scanned
        allele.method = binningmethod.notavailable
        allele.marker = channel.marker
        alleles.append(allele)

    return alleles

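# scan_peaks only needs Get() with pickled values from peakdb, so for testing
# a tiny in-memory stand-in is enough. This stand-in is hypothetical; the real
# peakdb is expected to be a LevelDB-style store exposing a Get() method.
import pickle

class MemoryPeakDB:
    """Dict-backed stand-in for the LevelDB-like peakdb used by scan_peaks."""

    def __init__(self):
        self._store = {}

    def Put(self, key, value):
        self._store[key] = value

    def Get(self, key):
        return self._store[key]

# usage sketch, assuming `channel` provides tag() as in scan_peaks:
#   db = MemoryPeakDB()
#   db.Put(channel.tag().encode(), pickle.dumps(raw_peaks))
#   alleles = scan_peaks(channel, params, db)
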
def do_listrawdata(args, fsa_list, dbh):

    outfile = '-'
    if args.outfile != '-':
        print("outfile: ", args.outfile)
        outfile = args.outfile.rsplit('.', 1)[0]
        outfile += "_rawdata."
        outfile += args.outfile.rsplit('.', 1)[1]
        out_stream = open(outfile, 'w')
    else:
        out_stream = sys.stdout

    out_stream.write('SAMPLE NAME,WELL ID,TRACE DYE,RAW DATA\n')
    if outfile != '-':
        # never close sys.stdout; only close real files
        out_stream.close()

    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: calling FSA %s' % fsa.filename)

        if outfile != '-':
            out_stream = open(outfile, 'a')
        else:
            out_stream = sys.stdout

        # sample name
        sample_name = fsa.filename.rsplit('.', 1)[0]

        # iterate through channels
        markers = fsa.panel.data['markers']
        trace = fsa.get_trace()

        # get well ID
        well_id = fsa.get_well_id()

        for channel in fsa.channels:

            # get trace dye
            if channel.is_ladder():
                trace_dye = markers['x/ladder']['filter']
            else:
                trace_dye = markers['x/' + channel.dye]['filter']

            # get raw data
            data = channel.data
            basepairs = channel.get_basepairs()  # channel.set_basepairs(fsa.allele_fit_func)

            datastring = "["
            for i in range(len(data)):
                rfu = data[i]
                bp = basepairs[i] if basepairs else -999
                if bp > -999:
                    datastring += "[%i,%.2f,%i]," % (i, bp, rfu)
                else:
                    datastring += "[%i,null,%i]," % (i, rfu)
            datastring = datastring[:-1] + "]"

            out_stream.write('"%10s","%s","%s","%s"\n' %
                             (sample_name, well_id, trace_dye, datastring))

        if outfile != '-':
            out_stream.close()

def filter_for_artifact(peaks, params, expected_peak_number=0):
    """
    params.max_peak_number
    params.artifact_ratio
    params.artifact_dist ~ 5
    """

    # the following code in this function performs the necessary acrobatic act
    # to select the most likely peaks that can be considered as true signals,
    # which is especially necessary for ladder - size assignment

    if len(peaks) == expected_peak_number:
        return peaks

    # we need to adapt to the noise level of the current channel
    if expected_peak_number > 0:
        epn = expected_peak_number
        theta_peaks = sorted(peaks, key=lambda x: x.theta,
                             reverse=True)[round(epn / 2) + 3:epn - 1]
        #theta_peaks = theta_peaks[2:4] + theta_peaks[round(epn/2):epn-1]
        omega_peaks = sorted(peaks, key=lambda x: x.omega, reverse=True)
        omega_peaks = omega_peaks[2:4] + omega_peaks[round(epn / 2):epn - 1]
        rfu_peaks = sorted(peaks, key=lambda x: x.rfu, reverse=True)[:epn - 1]

        if theta_peaks[-1].theta < 8:
            theta_peaks.sort()
            thetas = np.array([p.theta for p in theta_peaks])
            rtimes = [p.rtime for p in theta_peaks]
            #plt.scatter(rtimes, thetas)
            #plt.show()

            popt, pcov = curve_fit(math_func, rtimes, 0.5 * thetas, p0=[-1, 1])

            if is_verbosity(4):
                xx = np.linspace(rtimes[0], rtimes[-1] + 2000, 100)
                yy = math_func(xx, *popt)
                plt.plot(xx, yy)
                plt.scatter([p.rtime for p in peaks], [p.theta for p in peaks])
                plt.show()

            q_theta = lambda x: x.theta >= math_func(x.rtime, *popt) or x.theta > 100

        else:
            q_theta = lambda x: x.theta >= min(theta_peaks[-1].theta, params.min_theta)

        if omega_peaks[-1].omega < 200:
            omega_peaks.sort()
            omegas = np.array([p.omega for p in omega_peaks])
            rtimes = np.array([p.rtime for p in omega_peaks])

            # generate a quadratic threshold for omega

            # generate a quadratic ratio series first
            popt, pcov = curve_fit(quadratic_math_func,
                                   [rtimes[0], (rtimes[0] + rtimes[-1]) / 2, rtimes[-1]],
                                   [0.05, 0.25, 0.05])
            ratios = quadratic_math_func(rtimes, *popt)
            if is_verbosity(4):
                plt.plot(rtimes, ratios)
                plt.show()

            # use the ratios to enforce the quadratic threshold
            popt, pcov = curve_fit(quadratic_math_func, rtimes, ratios * omegas,
                                   p0=[-1, 1, 0])
            if popt[0] > 0:
                # enforce a small flat ratio
                popt, pcov = curve_fit(math_func, rtimes, 0.25 * omegas, p0=[1, 0])
                popt = np.insert(popt, 0, 0.0)  # convert to 3 params

            if is_verbosity(4):
                plt.scatter(rtimes, omegas)
                xx = np.linspace(rtimes[0], rtimes[-1] + 2000, 100)
                yy = quadratic_math_func(xx, *popt)
                plt.plot(xx, yy)
                plt.scatter([p.rtime for p in peaks], [p.omega for p in peaks])
                plt.show()

            q_omega = lambda x: (x.omega >= 100 or
                                 x.omega >= quadratic_math_func(x.rtime, *popt))

        else:
            q_omega = lambda x: x.omega >= min(omega_peaks[-1].omega, 50)

        min_rfu = rfu_peaks[-1].rfu * 0.125

    else:
        min_theta = 0
        min_omega = 0
        min_theta_omega = 0
        min_rfu = 2
        # permissive defaults so the loop below also works without an
        # expected peak number (q_omega was previously undefined here)
        q_theta = lambda x: x.theta >= min_theta
        q_omega = lambda x: x.omega >= min_omega

    # filter for too sharp/thin peaks
    filtered_peaks = []
    for p in peaks:
        #filtered_peaks.append(p); continue
        cverr(5, str(p))
        if len(filtered_peaks) < 2 and p.area > 50:
            # first two real peaks might be a bit lower
            filtered_peaks.append(p)
            continue

        if not q_omega(p):
            cverr(5, '! q_omega')
            continue
        #if not q_theta(p):
        #    print('! q_theta')
        #    continue
        #if min_theta and min_omega and p.omega < min_omega and p.theta < min_theta:
        #    print('! omega & theta')
        #    continue
        #if min_theta_omega and p.theta * p.omega < min_theta_omega:
        #    print('! theta_omega')
        #    continue

        if p.theta < 1.0 and p.area < 25 and p.omega < 5:
            cverr(5, '! extreme theta & area & omega')
            continue

        if p.rfu < min_rfu:
            cverr(5, '! extreme min_rfu')
            continue

        if p.beta > 25 and p.theta < 0.5:
            cverr(5, '! extreme beta')
            continue

        if p.wrtime < 3:
            continue
        if p.rfu >= 25 and p.beta * p.theta < 6:
            continue
        if p.rfu < 25 and p.beta * p.theta < 3:
            continue
        #if p.omega < 50:
        #    continue
        #if p.omega < 100 and p.theta < 5:
        #    continue
        #if (params.max_beta and min_theta and
        #        (p.beta > params.max_beta and p.theta < min_theta)):
        #    print('! max_beta')
        #    continue

        filtered_peaks.append(p)

    # filter for distance between peaks and their rfu ratio
    peaks = sorted(filtered_peaks, key=lambda x: x.rtime)
    non_artifact_peaks = []
    for idx in range(len(peaks)):
        p = peaks[idx]

        if idx > 0:
            prev_p = peaks[idx - 1]
            if (p.brtime - prev_p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * prev_p.rfu):
                # we are artifact, just skip
                cverr(5, 'artifact1: %s' % str(p))
                continue

        if idx < len(peaks) - 1:
            next_p = peaks[idx + 1]
            if (next_p.brtime - p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * next_p.rfu):
                # we are artifact, just skip
                cverr(5, 'artifact2: %s' % str(p))
                continue

        non_artifact_peaks.append(p)

    peaks = non_artifact_peaks
    cverr(3, '## non artifact peaks: %d' % len(peaks))

    return peaks

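# filter_for_artifact fits math_func with two parameters and
# quadratic_math_func with three, and np.insert(popt, 0, 0.0) converts the
# linear fit into the quadratic form by zeroing the leading coefficient. A
# plausible reading is therefore a line and a parabola; the sketch below is
# that assumption made concrete, not necessarily the module's actual helpers.
def math_func_sketch(x, a, b):
    # linear threshold model: curve_fit(math_func, rtimes, ..., p0=[-1, 1])
    return a * x + b

def quadratic_math_func_sketch(x, a, b, c):
    # quadratic threshold model: curve_fit(quadratic_math_func, ..., p0=[-1, 1, 0])
    return a * x * x + b * x + c
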
def align_pm(peaks, ladder, anchor_pairs=None):

    if not anchor_pairs:
        longest_rtime_peak = max([p.rtime for p in peaks])
        if longest_rtime_peak > PEAK_RTIME_UPPER_BOUND:
            bound_adjust_ratio = longest_rtime_peak / PEAK_RTIME_UPPER_BOUND
            anchor_start = ANCHOR_RTIME_LOWER_BOUND * bound_adjust_ratio
            anchor_end = ANCHOR_RTIME_UPPER_BOUND * bound_adjust_ratio
        else:
            anchor_start = ANCHOR_RTIME_LOWER_BOUND
            anchor_end = ANCHOR_RTIME_UPPER_BOUND
        anchor_peaks = [p for p in peaks if anchor_start < p.rtime < anchor_end]
        anchor_pairs, initial_z = estimate_pm(anchor_peaks, ladder['signature'])

    else:
        rtimes, bpsizes = zip(*anchor_pairs)
        initial_z = estimate_z(rtimes, bpsizes, 1)

    anchor_pairs.sort()
    pairs, z, rss, f = align_upper_pm(peaks, ladder, anchor_pairs, initial_z)
    pairs, z, rss, f = align_lower_pm(peaks, ladder, pairs, initial_z)
    #plot(f.rtimes, f.sizes, z, pairs)

    # last dp
    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, z, rss)
    if is_verbosity(1):
        import pprint
        pprint.pprint(dp_result.sized_peaks)
    if is_verbosity(4):
        plot(f.rtimes, f.sizes, dp_result.z,
             [(x[1], x[0]) for x in dp_result.sized_peaks])

    dp_result.sized_peaks = f.get_sized_peaks(dp_result.sized_peaks)

    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result, const.alignmethod.pm_strict)

    score, msg = ladder['qcfunc'](dp_result, method='relax')
    return AlignResult(score, msg, dp_result, const.alignmethod.pm_relax)

    # NOTE: everything below this point is unreachable; it is left over from
    # an earlier iterative-minimization implementation.

    f = ZFunc(peaks, ladder['sizes'], anchor_pairs)
    z = initial_z
    score = last_score = 0
    last_z = None

    for order in [1, 2, 3]:
        last_rss = -1
        rss = 0
        niter = 0
        while abs(rss - last_rss) > 1e-3:
            niter += 1
            cverr(5, 'Iter: %d' % niter)
            cverr(5, z)
            score = f(z)
            if last_score and last_score < score:
                # score does not converge; just exit
                cverr(5, 'does not converge!')
                break
            pairs, cur_rss = f.get_pairs(z)
            rtimes, bpsizes = zip(*pairs)
            zres = estimate_z(rtimes, bpsizes, order)
            last_z = z
            z = zres.z
            last_rss = rss
            rss = zres.rss
            cverr(5, rss)

    dp_result = align_dp(f.rtimes, f.sizes, last_z, last_rss)
    return align_gm2(peaks, ladder, anchor_pairs, dp_result.z)

    new_anchor_pairs = []
    zf = np.poly1d(dp_result.z)
    for p in dp_result.sized_peaks:
        if (p[0] - zf(p[1])) ** 2 < 2:
            new_anchor_pairs.append((p[1], p[0]))
    #import pprint; pprint.pprint(dp_result.sized_peaks)
    plot(f.rtimes, f.sizes, dp_result.z,
         [(x[1], x[0]) for x in dp_result.sized_peaks])
    return align_gm(peaks, ladder, anchor_pairs, dp_result.z)

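# estimate_z is not shown in this section. From its call sites
# (estimate_z(rtimes, bpsizes, order) returning an object with .z and .rss,
# and np.poly1d(dp_result.z) above), a plausible reading is a polynomial fit
# from retention time to basepair size. The sketch below is that assumption
# made concrete; the real helper may differ.
from collections import namedtuple
import numpy as np

ZRes = namedtuple('ZRes', ['z', 'rss'])

def estimate_z_sketch(rtimes, bpsizes, degree=1):
    # z: polynomial coefficients, highest order first, as np.poly1d expects
    z = np.polyfit(rtimes, bpsizes, degree)
    residuals = np.asarray(bpsizes) - np.poly1d(z)(rtimes)
    return ZRes(z=z, rss=float(np.sum(residuals ** 2)))
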
def align_hc(peaks, ladder):
    """
    peaks: list of peak objects (with .rtime), in ascending order
    ladder: ladder dict with 'sizes' (ascending), 'k' and 'qcfunc'

    returns: AlignResult(score, msg, result, method)
    """
    #import pprint; pprint.pprint(peaks)

    # generate P for ladder
    if 'C' not in ladder:
        if 'T' not in ladder:
            ladder['T'] = generate_tree([(n, 0) for n in ladder['sizes']])
        ladder['C'] = generate_cluster(ladder['T'], ladder['k'])

    ladder_clusters = ladder['C']
    ladder_sizes = ladder['sizes']

    P = generate_tree([(n.rtime, 0) for n in peaks])
    peak_clusters = generate_cluster(P, ladder['k'])
    # generate_cluster should use a so-called balanced tree
    #print(peak_clusters)

    if len(peak_clusters[-1]) == 1:
        if len(reduce(operator.add, peak_clusters)) > len(ladder_sizes):
            del peak_clusters[-1]
            #del peaks[-1]

    if len(peak_clusters[0]) == 1:
        if len(reduce(operator.add, peak_clusters)) > len(ladder_sizes):
            del peak_clusters[0]
            #del peaks[0]

    if len(peak_clusters) < ladder['k']:
        P = generate_tree([(n, 0) for n in reduce(operator.add, peak_clusters)])
        peak_clusters = generate_cluster(P, ladder['k'])

    # shortcut, in case we have good high-quality peaks
    if sum(len(c) for c in peak_clusters) == len(ladder_sizes):
        hq_peaks = sum(peak_clusters, [])
        #hq_pairs = zip(hq_peaks, ladder_sizes)
        zres = estimate_z(hq_peaks, ladder_sizes)
        dp_result = align_dp(hq_peaks, ladder_sizes,
                             [1.0] * len(hq_peaks), zres.z, zres.rss)
        dp_result.sized_peaks = pair_sized_peaks(peaks, dp_result.sized_peaks)
        score, msg = ladder['qcfunc'](dp_result, method='relax')
        if score > 0.9:
            return AlignResult(score, msg, dp_result, const.alignmethod.hcm_strict)

    #print(">>> clusters:\n", peak_clusters)
    cluster_pairings, expected_missing = align_clusters(peak_clusters,
                                                        ladder_clusters)
    #print(">>> cluster pairs:\n", cluster_pairings)

    # check each cluster pairing
    initial_pairs = []
    for pairs in cluster_pairings:
        if is_good_pairing(pairs):
            initial_pairs.extend(pairs)
        else:
            cverr(3, '>> this pairing is not included:\n%s' % pairs)
    cverr(3, '>> initial pairs:\n%s' % initial_pairs)

    if not initial_pairs:
        return AlignResult(-1, 'E: no initial pairs defined!', None, None)

    # try to dp-align the initial pairs as a shortcut for good samples or peaks
    rtimes, sizes = zip(*initial_pairs)
    zres = estimate_z(rtimes, sizes)
    dp_result = align_dp([p.rtime for p in peaks], ladder_sizes,
                         generate_similarity(peaks), zres.z, zres.rss)
    dp_result.sized_peaks = pair_sized_peaks(peaks, dp_result.sized_peaks)
    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result, const.alignmethod.hcm_strict)

    return AlignResult(-1, 'ERR: alignment needs minimization', None, None,
                       initial_pairs=initial_pairs)

def main(args):

    if args.verbose != 0:
        set_verbosity(args.verbose)

    dbh = None

    # set parameters for baseline correction and allele method
    from fatools.lib.const import allelemethod, baselinemethod

    _params = params.Params()
    _params.baselinewindow = args.baselinewindow

    if args.baselinemethod != "":
        if args.baselinemethod == 'none':
            _params.baselinemethod = baselinemethod.none
        elif args.baselinemethod == 'median':
            _params.baselinemethod = baselinemethod.median
        elif args.baselinemethod == 'minimum':
            _params.baselinemethod = baselinemethod.minimum
        else:
            raise NotImplementedError()

    if args.allelemethod != "":
        if args.allelemethod == 'leastsquare':
            _params.allelemethod = allelemethod.leastsquare
        elif args.allelemethod == 'cubicspline':
            _params.allelemethod = allelemethod.cubicspline
        elif args.allelemethod == 'localsouthern':
            _params.allelemethod = allelemethod.localsouthern
        else:
            raise NotImplementedError()

    if args.nonladder_smoothing_window > 0:
        _params.nonladder.smoothing_window = args.nonladder_smoothing_window
        _params.nonladder.smoothing_order = args.nonladder_smoothing_order

    cerr('I: Aligning size standards...')

    if args.file or args.infile or args.indir:
        cverr(4, 'D: opening FSA file(s)')
        fsa_list = open_fsa(args, _params)
    elif dbh is None:
        cverr(4, 'D: connecting to database')
        dbh = get_dbhandler(args)
        fsa_list = get_fsa_list(args, dbh)

    cerr('I: obtained %d FSA' % len(fsa_list))

    if args.commit:
        with transaction.manager:
            do_facmds(args, fsa_list, _params, dbh)
            cerr('** COMMIT to database **')
    elif dbh:
        cerr('WARNING ** running without database COMMIT! All changes will be discarded!')
        if not (args.test or args.y):
            keys = input('Do you want to continue [y/n]? ')
            if not keys.lower().strip().startswith('y'):
                sys.exit(1)
        do_facmds(args, fsa_list, _params, dbh)
    else:
        do_facmds(args, fsa_list, _params)

def do_listpeaks(args, fsa_list, dbh):

    if args.outfile != '-':
        out_stream = open(args.outfile, 'w')
    else:
        out_stream = sys.stdout

    if args.peaks_format == 'standard':
        out_stream.write('SAMPLE\tFILENAME \tDYE\tRTIME\tSIZE\tHEIGHT\tAREA\tSCORE\n')
    elif args.peaks_format == 'peakscanner':
        out_stream.write("Dye/Sample Peak,Sample File Name,Type,Size,Height,"
                         "Area in Point,Area in BP,Corrected Area in BP,"
                         "Data Point,Begin Point,")
        if args.merge:
            out_stream.write("Begin BP,End Point,End BP,Width in Point,"
                             "Width in BP,Score,Peak Group,User Comments,User Edit\n")
        else:
            out_stream.write("Begin BP,End Point,End BP,Width in Point,"
                             "Width in BP,Score,User Comments,User Edit\n")
    else:
        raise RuntimeError("Unknown value for args.peaks_format")

    if args.outfile != '-':
        # never close sys.stdout; only close real files
        out_stream.close()

    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: calling FSA %s' % fsa.filename)
        markers = fsa.panel.data['markers']

        if args.outfile != '-':
            out_stream = open(args.outfile, 'a')
        else:
            out_stream = sys.stdout

        for channel in fsa.channels:
            if channel.is_ladder():
                color = markers['x/ladder']['filter']
            else:
                color = markers['x/' + channel.dye]['filter']

            alleles = channel.get_alleles(broad_peaks_only=False)
            if is_verbosity(4):
                cout('Marker => %s | %s [%d]' %
                     (channel.marker.code, channel.dye, len(alleles)))
                cout("channel has alleles :", len(alleles))

            i = 1
            smeared_alleles = channel.smeared_alleles

            if (not args.merge) or channel.is_ladder():
                for p in alleles:
                    if args.peaks_format == 'standard':
                        out_stream.write(
                            '%6s\t%10s\t%3s\t%d\t%d\t%5i\t%3.2f\t%3.2f\n' %
                            (fsa_index, fsa.filename[:-4], color, p.rtime,
                             p.size, p.height, p.area, p.qscore))
                    else:
                        out_stream.write(
                            '"%s, %i",%s, %s, %f, %i, %i, %i, %i, %i, %i, %f, %i, %f, %i, %f, %f,,\n' %
                            (color, i, fsa.filename, p.type, p.size, p.height,
                             p.area, p.area_bp, p.area_bp_corr, p.rtime,
                             p.brtime, p.begin_bp, p.ertime, p.end_bp,
                             p.wrtime, p.width_bp, p.qscore))
                    i = i + 1
            else:
                if is_verbosity(4):
                    cout('Marker => %s | %s [%d]' %
                         (channel.marker.code, channel.dye, len(smeared_alleles)))
                    cout("channel has smeared alleles :", len(smeared_alleles))
                i = 1
                for p in smeared_alleles:
                    out_stream.write(
                        '"%s, %i", %s, %s, %f, %i, %i, %i, %i, %i, %i, %f, %i, %f, %i, %f, %f, %i,,\n' %
                        (color, i, fsa.filename, p.type, p.size, p.height,
                         p.area, p.area_bp, p.area_bp_corr, p.rtime, p.brtime,
                         p.begin_bp, p.ertime, p.end_bp, p.wrtime, p.width_bp,
                         p.qscore, p.group))
                    i = i + 1

        if args.outfile != '-':
            out_stream.close()
