def pipeline_scan(st, segments=None, cfile=None,
                  vys_timeout=vys_timeout_default, devicenum=None):
    """ Given rfpipe state, run search pipeline on all segments in a scan.
    state/preference has fftmode that will determine functions used here.
    """

    from rfpipe import candidates

    # initialize with empty cc
    candcollection = candidates.CandCollection(prefs=st.prefs,
                                               metadata=st.metadata)

    if not isinstance(segments, list):
        segments = list(range(st.nsegment))

    for segment in segments:
        candcollection += pipeline_seg(st, segment, devicenum=devicenum,
                                       cfile=cfile, vys_timeout=vys_timeout)

    return candcollection
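# Illustrative usage sketch (not part of the module): how pipeline_scan might
# be called on a single-scan SDM. The file name 'mysdm' and the scan number
# are hypothetical placeholders; State construction follows the
# sdmfile/sdmscan pattern used in oldcands_readone below. With segments=None,
# all st.nsegment segments are searched.
#
#     from rfpipe import state
#     st = state.State(sdmfile='mysdm', sdmscan=1)
#     cc = pipeline_scan(st)                 # search all segments
#     cc = pipeline_scan(st, segments=[0])   # or restrict to one segment
#     print(len(cc), "candidates found")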
def reproduce_candcollection(cc, data, wisdom=None):
    """ Calculates canddata for each cand in candcollection.
    Will look for cluster label and filter only for peak snr, if available.
    Location (e.g., integration, dm, dt) of each is used to create canddata
    for each candidate.
    """

    # set up output cc
    st = cc.state
    cc1 = candidates.CandCollection(prefs=st.prefs, metadata=st.metadata)

    if len(cc):
        candlocs = cc.locs
        snrs = cc.snrtot

        if 'cluster' in cc.array.dtype.fields:
            clusters = cc.array['cluster'].astype(int)
            cl_rank, cl_count = candidates.calc_cluster_rank(cc)
            calcinds = np.unique(np.where(cl_rank == 1)[0])
            logger.debug("Reproducing cands at {0} cluster peaks"
                         .format(len(calcinds)))
        else:
            logger.debug("No cluster field found. Reproducing all.")
            calcinds = list(range(len(cc)))

        # reproduce canddata for each
        calcinds.sort()
        for i in calcinds:
            # TODO: check on best way to find max SNR with kalman, etc
            snr = snrs[i]
            candloc = candlocs[i]

            # kwargs passed to canddata object for plotting/saving
            kwargs = {}
            if 'cluster' in cc.array.dtype.fields:
                logger.info("Cluster {0}/{1} has {2} candidates and max SNR {3} at {4}"
                            .format(clusters[i], len(calcinds)-1, cl_count[i],
                                    snr, candloc))
                # add supplementary plotting and cc info
                kwargs['cluster'] = clusters[i]
                kwargs['clustersize'] = cl_count[i]
            else:
                logger.info("Candidate {0}/{1} has SNR {2} at {3}"
                            .format(i, len(calcinds)-1, snr, candloc))

            # TODO: reproduce these here, too
            for kw in ['snrk', 'snrarms']:
                if kw in cc.array.dtype.fields:
                    kwargs[kw] = cc.array[kw][i]

            # reproduce candidate
            data_corr = rfpipe.reproduce.pipeline_datacorrect(st, candloc, data)
            cd = rfpipe.reproduce.pipeline_imdata(st, candloc, data_corr,
                                                  cpuonly=True, **kwargs)
            cc1 += candidates.save_and_plot(cd)

    # TODO: validate that reproduced features match input features?
    #    peakx, peaky = np.where(image[0] == image[0].max())
    #    l1, m1 = st.calclm(st.npixx_full, st.npixy_full,
    #                       st.uvres, peakx[0], peaky[0])
    #    immax1 = image.max()
    #    snr1 = immax1/image.std()

    return cc1
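# Minimal sketch of the cluster-peak selection used above, assuming
# calc_cluster_rank returns a per-candidate rank where 1 marks the
# highest-SNR member of its cluster. The array values are toy numbers for
# illustration only.
#
#     import numpy as np
#     cl_rank = np.array([1, 2, 1, 3, 2])              # rank within each cluster
#     calcinds = np.unique(np.where(cl_rank == 1)[0])  # -> array([0, 2])
#     # only these cluster peaks are reproduced, plotted, and saved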
def dedisperse_search_cuda(st, segment, data, devicenum=None):
    """ Run dedispersion, resample for all dm and dt.
    Grid and image on GPU. rfgpu is built from separate repo.
    Uses state to define integrations to image based on segment, dm, and dt.
    devicenum can force the gpu to use, but can be inferred via distributed.
    """

    assert st.dtarr[0] == 1, "st.dtarr[0] assumed to be 1"
    assert all([st.dtarr[dtind]*2 == st.dtarr[dtind+1]
                for dtind in range(len(st.dtarr)-1)]), ("dtarr must increase "
                                                        "by factors of 2")

    if not np.any(data):
        logger.info("Data is all zeros. Skipping search.")
        return candidates.CandCollection(prefs=st.prefs,
                                         metadata=st.metadata)

    if devicenum is None:
        # assume first gpu, but try to infer from worker name
        devicenum = 0
        try:
            from distributed import get_worker
            name = get_worker().name
            devicenum = int(name.split('g')[1])
            logger.debug("Using name {0} to set GPU devicenum to {1}"
                         .format(name, devicenum))
        except IndexError:
            logger.warn("Could not parse worker name {0}. Using default GPU devicenum {1}"
                        .format(name, devicenum))
        except ValueError:
            logger.warn("No worker found. Using default GPU devicenum {0}"
                        .format(devicenum))
        except ImportError:
            logger.warn("distributed not available. Using default GPU devicenum {0}"
                        .format(devicenum))

    rfgpu.cudaSetDevice(devicenum)

    beamnum = 0
    uvw = util.get_uvw_segment(st, segment)

    upix = st.npixx
    vpix = st.npixy//2 + 1

    grid = rfgpu.Grid(st.nbl, st.nchan, st.readints, upix, vpix)
    image = rfgpu.Image(st.npixx, st.npixy)
    image.add_stat('rms')
    image.add_stat('pix')

    # Data buffers on GPU
    vis_raw = rfgpu.GPUArrayComplex((st.nbl, st.nchan, st.readints))
    vis_grid = rfgpu.GPUArrayComplex((upix, vpix))
    img_grid = rfgpu.GPUArrayReal((st.npixx, st.npixy))

    # Convert uv from lambda to us
    u, v, w = uvw
    u_us = 1e6*u[:, 0]/(1e9*st.freq[0])
    v_us = 1e6*v[:, 0]/(1e9*st.freq[0])

    # Q: set input units to be uv (lambda), freq in GHz?
    grid.set_uv(u_us, v_us)  # u, v in us
    grid.set_freq(st.freq*1e3)  # freq in MHz
    grid.set_cell(st.uvres)  # uv cell size in wavelengths (== 1/FoV(radians))

    # Compute gridding transform
    grid.compute()

    # move Stokes I data in (assumes dual pol data)
    vis_raw.data[:] = np.rollaxis(data.mean(axis=3), 0, 3)
    vis_raw.h2d()  # Send it to GPU memory
    grid.conjugate(vis_raw)

    # some prep if kalman filter is to be applied
    if st.prefs.searchtype in ['imagek']:
        # TODO: check that this is ok if pointing at bright source
        spec_std = data.real.mean(axis=1).mean(axis=2).std(axis=0)
        sig_ts, kalman_coeffs = kalman_prepare_coeffs(spec_std)
        if not np.all(sig_ts):
            logger.info("sig_ts all zeros. Skipping search.")
            return candidates.CandCollection(prefs=st.prefs,
                                             metadata=st.metadata)

    # place to hold intermediate result lists
    canddict = {}
    canddict['candloc'] = []
    for feat in st.features:
        canddict[feat] = []

    for dtind in range(len(st.dtarr)):
        if dtind > 0:
            grid.downsample(vis_raw)

        for dmind in range(len(st.dmarr)):
            delay = util.calc_delay(st.freq, st.freq.max(), st.dmarr[dmind],
                                    st.inttime)
            grid.set_shift(delay >> dtind)  # dispersion shift per chan in samples

            integrations = st.get_search_ints(segment, dmind, dtind)
            if len(integrations) == 0:
                continue
            minint = min(integrations)
            maxint = max(integrations)

            logger.info('Imaging {0} ints ({1}-{2}) in seg {3} at DM/dt {4:.1f}/{5}'
                        ' with image {6}x{7} (uvres {8}) with gpu {9}'
                        .format(len(integrations), minint, maxint, segment,
                                st.dmarr[dmind], st.dtarr[dtind], st.npixx,
                                st.npixy, st.uvres, devicenum))

            for i in integrations:
                # grid and FFT
                grid.operate(vis_raw, vis_grid, i)
                image.operate(vis_grid, img_grid)

                # calc snr
                stats = image.stats(img_grid)
                if stats['rms'] != 0.:
                    snr1 = stats['max']/stats['rms']
                else:
                    snr1 = 0.
                    logger.warn("rfgpu rms is 0 in int {0}. Skipping.".format(i))

                # threshold image
                if snr1 > st.prefs.sigma_image1:
                    candloc = (segment, i, dmind, dtind, beamnum)

                    xpeak = stats['xpeak']
                    ypeak = stats['ypeak']
                    l1, m1 = st.pixtolm((xpeak+st.npixx//2, ypeak+st.npixy//2))

                    if st.prefs.searchtype == 'image':
                        logger.info("Got one! SNR1 {0:.1f} candidate at {1} and (l, m) = ({2},{3})"
                                    .format(snr1, candloc, l1, m1))
                        canddict['candloc'].append(candloc)
                        canddict['l1'].append(l1)
                        canddict['m1'].append(m1)
                        canddict['snr1'].append(snr1)
                        canddict['immax1'].append(stats['max'])

                    elif st.prefs.searchtype == 'imagek':
                        # TODO: implement phasing on GPU
                        data_corr = dedisperseresample(data, delay,
                                                       st.dtarr[dtind],
                                                       parallel=st.prefs.nthread > 1,
                                                       resamplefirst=st.fftmode == 'cuda')
                        spec = data_corr.take([i], axis=0)
                        util.phase_shift(spec, uvw, l1, m1)
                        spec = spec[0].real.mean(axis=2).mean(axis=0)

                        # TODO: this significance can be biased low if averaging
                        # in long baselines that are not phased well
                        # TODO: spec should be calculated from baselines used to
                        # measure l,m?
                        significance_kalman = kalman_significance(spec, spec_std,
                                                                  sig_ts=sig_ts,
                                                                  coeffs=kalman_coeffs)
                        snrk = (2*significance_kalman)**0.5
                        snrtot = (snrk**2 + snr1**2)**0.5
                        if snrtot > (st.prefs.sigma_kalman**2 + st.prefs.sigma_image1**2)**0.5:
                            logger.info("Got one! SNR1 {0:.1f} and SNRk {1:.1f} candidate at {2} and (l,m) = ({3},{4})"
                                        .format(snr1, snrk, candloc, l1, m1))
                            canddict['candloc'].append(candloc)
                            canddict['l1'].append(l1)
                            canddict['m1'].append(m1)
                            canddict['snr1'].append(snr1)
                            canddict['immax1'].append(stats['max'])
                            canddict['snrk'].append(snrk)
                    elif st.prefs.searchtype == 'armkimage':
                        raise NotImplementedError
                    elif st.prefs.searchtype == 'armk':
                        raise NotImplementedError
                    else:
                        logger.warn("searchtype {0} not recognized"
                                    .format(st.prefs.searchtype))

    cc = candidates.make_candcollection(st, **canddict)
    logger.info("First pass found {0} candidates in seg {1}."
                .format(len(cc), segment))

    if st.prefs.clustercands is not None:
        cc = candidates.cluster_candidates(cc)

    if st.prefs.savecands or st.prefs.saveplots:
        # triggers optional plotting and saving
        cc = reproduce_candcollection(cc, data)

    candidates.save_cands(st, candcollection=cc)

    return cc
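# Sketch of the two-stage ('imagek') detection rule used above: image SNR and
# kalman SNR are combined in quadrature and compared against the quadrature
# sum of the two thresholds. The numbers are illustrative only.
#
#     snr1, snrk = 7.0, 4.0
#     sigma_image1, sigma_kalman = 7.0, 3.0
#     snrtot = (snr1**2 + snrk**2)**0.5                       # ~8.06
#     threshold = (sigma_image1**2 + sigma_kalman**2)**0.5    # ~7.62
#     is_candidate = snrtot > threshold                       # True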
def dedisperse_search_fftw(st, segment, data, wisdom=None):
    """ Fuse the dedisperse, resample, search, threshold functions.
    Returns list of CandData objects that define candidates with
    candloc, image, and phased visibility data.
    Integrations can define subset of all available in data to search.
    Default will take integrations not searched in neighboring segments.

    ** only supports threshold > image max (no min)
    ** dmind, dtind, beamnum assumed to represent current state of data
    """

    if not np.any(data):
        logger.info("Data is all zeros. Skipping search.")
        return candidates.CandCollection(prefs=st.prefs,
                                         metadata=st.metadata)

    # some prep if kalman filter is to be applied
    if st.prefs.searchtype in ['imagek', 'armk', 'armkimage']:
        # TODO: check that this is ok if pointing at bright source
        spec_std = data.real.mean(axis=1).mean(axis=2).std(axis=0)
        sig_ts, kalman_coeffs = kalman_prepare_coeffs(spec_std)

    beamnum = 0
    uvw = util.get_uvw_segment(st, segment)

    # place to hold intermediate result lists
    canddict = {}
    canddict['candloc'] = []
    for feat in st.features:
        canddict[feat] = []

    for dtind in range(len(st.dtarr)):
        for dmind in range(len(st.dmarr)):
            # set search integrations
            integrations = st.get_search_ints(segment, dmind, dtind)
            if len(integrations) == 0:
                continue
            minint = min(integrations)
            maxint = max(integrations)

            logger.info('{0} search of {1} ints ({2}-{3}) in seg {4} at DM/dt '
                        '{5:.1f}/{6} with image {7}x{8} (uvres {9}) with fftw'
                        .format(st.prefs.searchtype, len(integrations), minint,
                                maxint, segment, st.dmarr[dmind],
                                st.dtarr[dtind], st.npixx, st.npixy, st.uvres))

            # correct data
            delay = util.calc_delay(st.freq, st.freq.max(), st.dmarr[dmind],
                                    st.inttime)
            data_corr = dedisperseresample(data, delay, st.dtarr[dtind],
                                           parallel=st.prefs.nthread > 1,
                                           resamplefirst=st.fftmode == 'cuda')

            # run search
            if st.prefs.searchtype in ['image', 'imagek']:
                images = grid_image(data_corr, uvw, st.npixx, st.npixy,
                                    st.uvres, 'fftw', st.prefs.nthread,
                                    wisdom=wisdom, integrations=integrations)

                for i, image in enumerate(images):
                    immax1 = image.max()
                    snr1 = immax1/image.std()
                    if snr1 > st.prefs.sigma_image1:
                        candloc = (segment, integrations[i], dmind, dtind,
                                   beamnum)
                        l1, m1 = st.pixtolm(np.where(image == immax1))

                        # if set, use sigma_kalman as second stage filter
                        if st.prefs.searchtype == 'imagek':
                            spec = data_corr.take([integrations[i]], axis=0)
                            util.phase_shift(spec, uvw, l1, m1)
                            spec = spec[0].real.mean(axis=2).mean(axis=0)

                            # TODO: this significance can be biased low if
                            # averaging in long baselines that are not phased well
                            # TODO: spec should be calculated from baselines
                            # used to measure l,m?
                            significance_kalman = kalman_significance(spec, spec_std,
                                                                      sig_ts=sig_ts,
                                                                      coeffs=kalman_coeffs)
                            snrk = (2*significance_kalman)**0.5
                            snrtot = (snrk**2 + snr1**2)**0.5
                            if snrtot > (st.prefs.sigma_kalman**2 + st.prefs.sigma_image1**2)**0.5:
                                logger.info("Got one! SNR1 {0:.1f} and SNRk {1:.1f} candidate at {2} and (l,m) = ({3},{4})"
                                            .format(snr1, snrk, candloc, l1, m1))
                                canddict['candloc'].append(candloc)
                                canddict['l1'].append(l1)
                                canddict['m1'].append(m1)
                                canddict['snr1'].append(snr1)
                                canddict['immax1'].append(immax1)
                                canddict['snrk'].append(snrk)
                        elif st.prefs.searchtype == 'image':
                            logger.info("Got one! SNR1 {0:.1f} candidate at {1} and (l, m) = ({2},{3})"
                                        .format(snr1, candloc, l1, m1))
                            canddict['candloc'].append(candloc)
                            canddict['l1'].append(l1)
                            canddict['m1'].append(m1)
                            canddict['snr1'].append(snr1)
                            canddict['immax1'].append(immax1)

            elif st.prefs.searchtype in ['armkimage', 'armk']:
                armk_candidates = search_thresh_armk(st, data_corr, uvw,
                                                     integrations=integrations,
                                                     spec_std=spec_std,
                                                     sig_ts=sig_ts,
                                                     coeffs=kalman_coeffs)

                for candind, snrarms, snrk, armloc, peakxy, lm in armk_candidates:
                    candloc = (segment, candind, dmind, dtind, beamnum)

                    # if set, use sigma_kalman as second stage filter
                    if st.prefs.searchtype == 'armkimage':
                        image = grid_image(data_corr, uvw, st.npixx_full,
                                           st.npixy_full, st.uvres, 'fftw',
                                           st.prefs.nthread, wisdom=wisdom,
                                           integrations=candind)
                        peakx, peaky = np.where(image[0] == image[0].max())
                        l1, m1 = st.calclm(st.npixx_full, st.npixy_full,
                                           st.uvres, peakx[0], peaky[0])
                        immax1 = image.max()
                        snr1 = immax1/image.std()
                        if snr1 > st.prefs.sigma_image1:
                            logger.info("Got one! SNRarms {0:.1f} and SNRk "
                                        "{1:.1f} and SNR1 {2:.1f} candidate at"
                                        " {3} and (l,m) = ({4},{5})"
                                        .format(snrarms, snrk, snr1,
                                                candloc, l1, m1))
                            canddict['candloc'].append(candloc)
                            canddict['l1'].append(l1)
                            canddict['m1'].append(m1)
                            canddict['snrarms'].append(snrarms)
                            canddict['snrk'].append(snrk)
                            canddict['snr1'].append(snr1)
                            canddict['immax1'].append(immax1)

                    elif st.prefs.searchtype == 'armk':
                        l1, m1 = lm
                        logger.info("Got one! SNRarms {0:.1f} and SNRk {1:.1f} "
                                    "candidate at {2} and (l,m) = ({3},{4})"
                                    .format(snrarms, snrk, candloc, l1, m1))
                        canddict['candloc'].append(candloc)
                        canddict['l1'].append(l1)
                        canddict['m1'].append(m1)
                        canddict['snrarms'].append(snrarms)
                        canddict['snrk'].append(snrk)
            else:
                raise NotImplementedError("only searchtype=image, imagek, armk, armkimage implemented")

    cc = candidates.make_candcollection(st, **canddict)
    logger.info("First pass found {0} candidates in seg {1}."
                .format(len(cc), segment))

    if st.prefs.clustercands is not None:
        cc = candidates.cluster_candidates(cc)

    if st.prefs.savecands or st.prefs.saveplots:
        # triggers optional plotting and saving
        cc = reproduce_candcollection(cc, data)

    candidates.save_cands(st, candcollection=cc)

    return cc
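# Illustrative call pattern for the FFTW path (a sketch, not a test). It
# assumes `data` is the prepared complex visibility array for this segment,
# shaped (readints, nbl, nchan, npol) as implied by the axis operations above,
# and that st.fftmode is set to 'fftw'. The segment value is a placeholder.
#
#     segment = 0
#     cc = dedisperse_search_fftw(st, segment, data)
#     print("found {0} candidates".format(len(cc)))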
def oldcands_readone(candsfile, scan=None):
    """ Reads old-style candidate files to create new state and candidate
    collection for a given scan.
    Parsing merged cands file requires sdm locally with bdf for given scan.
    If no scan provided, assumes candsfile is from single scan not merged.
    """

    from rfpipe import preferences, metadata, state, candidates

    with open(candsfile, 'rb') as pkl:
        try:
            d = pickle.load(pkl)
            ret = pickle.load(pkl)
        except UnicodeDecodeError:
            d = pickle.load(pkl, encoding='latin-1')
            ret = pickle.load(pkl, encoding='latin-1')
        if isinstance(ret, tuple):
            loc, prop = ret
        elif isinstance(ret, dict):
            loc = np.array(list(ret.keys()))
            prop = np.array(list(ret.values()))
        else:
            logger.warning("Not sure what we've got in this here cands pkl file...")

    # detect merged vs nonmerged
    if 'scan' in d['featureind']:
        locind0 = 1
    else:
        locind0 = 0

    # merged candsfiles must be called with scan arg
    if scan is None:
        assert locind0 == 0, "Set scan if candsfile has multiple scans."

    inprefs = preferences.oldstate_preferences(d, scan=scan)
    inprefs.pop('gainfile')
    inprefs.pop('workdir')
    inprefs.pop('fileroot')
    inprefs['segmenttimes'] = inprefs['segmenttimes']
    sdmfile = os.path.basename(d['filename'])

    try:
        assert scan is not None
        st = state.State(sdmfile=sdmfile, sdmscan=scan, inprefs=inprefs)
    except:
        meta = metadata.oldstate_metadata(d, scan=scan)
        st = state.State(inmeta=meta, inprefs=inprefs, showsummary=False)

    if 'rtpipe_version' in d:
        st.rtpipe_version = float(d['rtpipe_version'])  # TODO: test this
        if st.rtpipe_version <= 1.54:
            logger.info('Candidates detected with rtpipe version {0}. All '
                        'versions <=1.54 used incorrect DM scaling.'
                        .format(st.rtpipe_version))

    if scan is None:
        assert locind0 == 0, "Set scan if candsfile has multiple scans."
        scan = d['scan']

    logger.info('Calculating candidate properties for scan {0}'.format(scan))

    if locind0 == 1:
        loc = loc[np.where(loc[:, 0] == scan)][:, locind0:]

    print(st.features, st.prefs.searchtype)
    fields = [str(ff) for ff in st.search_dimensions + st.features]
    types = [str(tt) for tt in len(st.search_dimensions)*['<i4'] +
             len(st.features)*['<f4']]
    dtype = np.dtype({'names': fields, 'formats': types})
    features = np.zeros(len(loc), dtype=dtype)
    for i in range(len(loc)):
        features[i] = tuple(list(loc[i]) + list(prop[i]))

    cc = candidates.CandCollection(features, st.prefs, st.metadata)

    return st, cc
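# Usage sketch for reading an old-style (rtpipe) candidate pickle; the file
# names below are placeholders. For a merged candsfile the scan argument is
# required, as enforced above; a single-scan file can omit it.
#
#     st, cc = oldcands_readone('cands_merged.pkl', scan=7)
#     st, cc = oldcands_readone('cands_singlescan.pkl')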
def reproduce_candcollection(cc, data=None, wisdom=None, spec_std=None,
                             sig_ts=[], kalman_coeffs=[]):
    """ Uses candcollection to make new candcollection with required info.
    Will look for cluster label and filter only for peak snr, if available.
    Location (e.g., integration, dm, dt) of each is used to create canddata
    for each candidate, if required.
    Can calculate features not used directly for search (as defined in
    state.prefs.calcfeatures).
    """

    from rfpipe import candidates, util

    # set up output cc
    st = cc.state
    cc1 = candidates.CandCollection(prefs=st.prefs, metadata=st.metadata)

    if len(cc):
        if 'cluster' in cc.array.dtype.fields:
            clusters = cc.array['cluster'].astype(int)
            cl_rank, cl_count = candidates.calc_cluster_rank(cc)
            calcinds = np.unique(np.where(cl_rank == 1)[0]).tolist()
            logger.debug("Reproducing cands at {0} cluster peaks"
                         .format(len(calcinds)))
        else:
            logger.debug("No cluster field found. Reproducing all.")
            calcinds = list(range(len(cc)))

        # if candidates that need new feature calculations
        if not all([f in cc.array.dtype.fields for f in st.features]):
            logger.info("Generating canddata for {0} candidates"
                        .format(len(calcinds)))

            candlocs = cc.locs
            snrs = cc.snrtot
            normprob = candidates.normprob(snrs, st.ntrials)
            snrmax = snrs.max()
            logger.info('Zscore/SNR for strongest candidate: {0}/{1}'
                        .format(normprob[np.where(snrs == snrmax)[0]][0],
                                snrmax))

            if ('snrk' in st.features and
                'snrk' not in cc.array.dtype.fields and
                (spec_std is None or not len(sig_ts) or not len(kalman_coeffs))):
                # TODO: use same kalman calc for search as reproduce?
                spec_std, sig_ts, kalman_coeffs = util.kalman_prep(data)

            # reproduce canddata for each
            for i in calcinds:
                # TODO: check on best way to find max SNR with kalman, etc
                snr = snrs[i]
                candloc = candlocs[i]

                # kwargs passed to canddata object for plotting/saving
                kwargs = {}
                if 'cluster' in cc.array.dtype.fields:
                    logger.info("Cluster {0}/{1} has {2} candidates and max detected SNR {3:.1f} at {4}"
                                .format(calcinds.index(i), len(calcinds)-1,
                                        cl_count[i], snr, candloc))
                    # add supplementary plotting and cc info
                    kwargs['cluster'] = clusters[i]
                    kwargs['clustersize'] = cl_count[i]
                else:
                    logger.info("Candidate {0}/{1} has detected SNR {2:.1f} at {3}"
                                .format(calcinds.index(i), len(calcinds)-1,
                                        snr, candloc))

                # reproduce candidate and get/calc features
                data_corr = pipeline_datacorrect(st, candloc, data_prep=data)

                for feature in st.features:
                    if feature in cc.array.dtype.fields:  # if already calculated
                        kwargs[feature] = cc.array[feature][i]
                    else:  # if desired, but not calculated here or from canddata
                        if feature == 'snrk':
                            if 'snrk' not in cc.array.dtype.fields:
                                spec = data_corr.real.mean(axis=3).mean(axis=1)[candloc[1]]
                                if np.count_nonzero(spec)/len(spec) > 1-st.prefs.max_zerofrac:
                                    significance_kalman = -kalman_significance(spec, spec_std,
                                                                               sig_ts=sig_ts,
                                                                               coeffs=kalman_coeffs)
                                    snrk = (2*significance_kalman)**0.5
                                else:
                                    logger.warning("snrk set to 0, since {0}/{1} are zeroed"
                                                   .format(len(spec)-np.count_nonzero(spec),
                                                           len(spec)))
                                    snrk = 0.
                                logger.info("Calculated snrk of {0} after detection. "
                                            "Adding it to CandData.".format(snrk))
                                kwargs[feature] = snrk

                cd = pipeline_canddata(st, candloc, data_corr, spec_std=spec_std,
                                       sig_ts=sig_ts, kalman_coeffs=kalman_coeffs,
                                       **kwargs)

                if st.prefs.saveplots:
                    candidates.candplot(cd, snrs=snrs)  # snrs before clustering

                # regenerate cc with extra features in cd
                cc1 += candidates.cd_to_cc(cd)

        # if candidates that do not need new features, just select peaks
        else:
            logger.info("Using clustering info to select {0} candidates"
                        .format(len(calcinds)))
            cc1.array = cc.array.take(calcinds)

    return cc1