def get_varfeatures(lcfile,
                    outdir,
                    timecols=None,
                    magcols=None,
                    errcols=None,
                    mindet=1000,
                    lcformat='hat-sql',
                    lcformatdir=None):
    '''This runs
    :py:func:`astrobase.varclass.varfeatures.all_nonperiodic_features` on a
    single LC file and writes the results to a pickle in `outdir`.

    Parameters
    ----------

    lcfile : str
        The input light curve to process.

    outdir : str
        The directory where the output variable features pickle will be
        written. The pickle is named `varfeatures-<objectid>.pkl`, with
        spaces in the object ID replaced by dashes.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.
        If None, the defaults of the light curve format are used.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.
        If None, the defaults of the light curve format are used.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.
        If None, the defaults of the light curve format are used.

    mindet : int
        The minimum number of finite LC points required to generate
        variability features for a magcol.

    lcformat : str
        This is the `formatkey` associated with your light curve format,
        which you previously passed in to the `lcproc.register_lcformat`
        function. This is used to look up how to read the light curve.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in. Use this along with `lcformat` to
        specify an LC format JSON file that's not currently registered with
        lcproc.

    Returns
    -------

    str or None
        The path of the generated variability features pickle for the input
        LC. The pickled dict has the keys 'objectid', 'info', 'lcfbasename',
        one key per processed magcol (None if that magcol had fewer than
        `mindet` finite points), and 'bestmagcol' (the magcol with the
        smallest 'mad' feature, or None if this could not be determined).
        Returns None if the LC format can't be resolved or if anything fails
        while processing the light curve.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    try:

        # get the LC into a dict
        lcdict = readerfunc(lcfile)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if (isinstance(lcdict, (list, tuple)) and
                isinstance(lcdict[0], dict)):
            lcdict = lcdict[0]

        resultdict = {'objectid': lcdict['objectid'],
                      'info': lcdict['objectinfo'],
                      'lcfbasename': os.path.basename(lcfile)}

        # normalize using the special function if specified
        if normfunc is not None:
            lcdict = normfunc(lcdict)

        for tcol, mcol, ecol in zip(timecols, magcols, errcols):

            # dereference the (possibly dotted, i.e. nested) column keys;
            # str.split returns a one-element list when there is no '.'
            times = _dict_get(lcdict, tcol.split('.'))
            mags = _dict_get(lcdict, mcol.split('.'))
            errs = _dict_get(lcdict, ecol.split('.'))

            # normalize here if not using special normalization
            if normfunc is None:
                times, mags = normalize_magseries(
                    times, mags,
                    magsarefluxes=magsarefluxes
                )

            # make sure we have finite values
            finind = (np.isfinite(times) &
                      np.isfinite(mags) &
                      np.isfinite(errs))
            nfinite = mags[finind].size

            # make sure we have enough finite values
            if nfinite < mindet:

                LOGINFO('not enough LC points: %s in normalized %s LC: %s' %
                        (nfinite, mcol, os.path.basename(lcfile)))
                resultdict[mcol] = None

            else:

                # get the features for this magcol
                resultdict[mcol] = varfeatures.all_nonperiodic_features(
                    times, mags, errs
                )

        # now that we've collected all the magcols, we can choose which is the
        # "best" magcol. this is defined as the magcol that gives us the
        # smallest LC MAD.
        #
        # NOTE: the index must be a plain int. Indexing the magcols list with
        # the array returned by np.where raises a TypeError, which used to be
        # swallowed below and left 'bestmagcol' as None every time.
        try:
            magmads = np.array(
                [resultdict[mcol]['mad'] for mcol in magcols],
                dtype=float
            )
            resultdict['bestmagcol'] = magcols[int(np.nanargmin(magmads))]
        except Exception:
            # deliberate best-effort: a magcol without features (None), a
            # missing 'mad' key, or all-NaN MADs all mean there's no best
            # magcol to report
            resultdict['bestmagcol'] = None

        outfile = os.path.join(
            outdir,
            'varfeatures-%s.pkl' %
            squeeze(resultdict['objectid']).replace(' ', '-')
        )

        with open(outfile, 'wb') as outfd:
            pickle.dump(resultdict, outfd, protocol=4)

        return outfile

    except Exception as e:

        LOGEXCEPTION('failed to get LC features for %s because: %s' %
                     (os.path.basename(lcfile), e))
        return None
def parallel_periodicfeatures_lcdir(
        pfpkl_dir,
        lcbasedir,
        outdir,
        pfpkl_glob='periodfinding-*.pkl*',
        starfeaturesdir=None,
        fourierorder=5,
        # these are depth, duration, ingress duration
        transitparams=(-0.01, 0.1, 0.1),
        # these are depth, duration, depth ratio, secphase
        ebparams=(-0.2, 0.3, 0.7, 0.5),
        pdiff_threshold=1.0e-4,
        sidereal_threshold=1.0e-4,
        sampling_peak_multiplier=5.0,
        sampling_startp=None,
        sampling_endp=None,
        timecols=None,
        magcols=None,
        errcols=None,
        lcformat='hat-sql',
        lcformatdir=None,
        sigclip=10.0,
        verbose=False,
        maxobjects=None,
        nworkers=NCPUS,
        recursive=True,
):
    '''Finds period-finding result pickles in a directory and hands them to
    `parallel_periodicfeatures` for periodic feature extraction.

    Apart from `pfpkl_dir`, `pfpkl_glob`, and `recursive`, which control the
    file search done here, all arguments are passed through unchanged to
    `parallel_periodicfeatures`.

    Parameters
    ----------

    pfpkl_dir : str
        The directory containing the period-finding pickles to process.

    lcbasedir : str
        The directory where all of the associated light curve files are
        located.

    outdir : str
        The directory where all the output will be written.

    pfpkl_glob : str
        The UNIX file glob used to find period-finder result pickles in
        `pfpkl_dir`.

    starfeaturesdir : str or None
        The directory containing the `starfeatures-<objectid>.pkl` files for
        each object, used for neighbor proximity light curve features.

    fourierorder : int
        The Fourier order to use when fitting a sinusoidal function to the
        phased light curve.

    transitparams : list of floats
        The transit depth, duration, and ingress duration used for the
        trapezoid planet transit model fit to the phased light curve.

    ebparams : list of floats
        The primary eclipse depth, eclipse duration, primary-secondary depth
        ratio, and secondary eclipse phase used for the eclipsing binary
        model fit to the phased light curve.

    pdiff_threshold : float
        The max difference between periods to consider them the same.

    sidereal_threshold : float
        The max difference between any of the 'best' periods and the
        sidereal day periods to consider them the same.

    sampling_peak_multiplier : float
        The minimum multiplicative factor of a 'best' period's normalized
        periodogram peak over the sampling periodogram peak at the same
        period required to accept the 'best' period as possibly real.

    sampling_startp, sampling_endp : float
        The minimum and maximum period interval to search when a
        time-sampling Lomb-Scargle periodogram has to be generated.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        The `formatkey` of your light curve format, as previously passed to
        `lcproc.register_lcformat`. It is checked here before any files are
        searched for.

    lcformatdir : str or None
        A directory containing lcformat description JSONs, if these are
        stored outside the usual directories lcproc searches.

    sigclip : float or int or sequence of two floats/ints or None
        The sigma-clip specification: a single number for a symmetric clip,
        a two-element sequence for an asymmetric clip (fainter, brighter),
        or None for no sigma-clipping.

    verbose : bool
        If True, will indicate progress while working.

    maxobjects : int
        The total number of objects to process.

    nworkers : int
        The number of parallel workers to launch to process the input.

    recursive : bool
        If this is exactly False, only `pfpkl_dir` itself is searched using
        `pfpkl_glob`. Otherwise, the search uses a `**` path component with
        `glob.glob(..., recursive=True)` so sub-directories are included.

    Returns
    -------

    dict or None
        Whatever `parallel_periodicfeatures` returns for the pickles found.
        Returns None if the light curve format can't be figured out or if no
        period-finding pickles match.

    '''

    try:

        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)

        if not formatinfo:
            LOGERROR("can't figure out the light curve format")
            return None

        # none of these are used below; unpacking just confirms that the
        # format description has the expected seven elements
        (_fglob, _reader,
         _tcols, _mcols, _ecols,
         _fluxflag, _norm) = formatinfo

    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # look for the period-finding pickles
    LOGINFO('searching for periodfinding pickles in %s ...' % pfpkl_dir)

    if recursive is False:
        searchpattern = os.path.join(pfpkl_dir, pfpkl_glob)
        pfpickles = glob.glob(searchpattern)
    else:
        searchpattern = os.path.join(pfpkl_dir, '**', pfpkl_glob)
        pfpickles = glob.glob(searchpattern, recursive=True)

    # nothing found, nothing to do
    if not pfpickles:
        LOGERROR('no periodfinding pickles found in %s' % (pfpkl_dir))
        return None

    LOGINFO('found %s periodfinding pickles, getting periodicfeatures...' %
            len(pfpickles))

    return parallel_periodicfeatures(
        pfpickles,
        lcbasedir,
        outdir,
        starfeaturesdir=starfeaturesdir,
        fourierorder=fourierorder,
        transitparams=transitparams,
        ebparams=ebparams,
        pdiff_threshold=pdiff_threshold,
        sidereal_threshold=sidereal_threshold,
        sampling_peak_multiplier=sampling_peak_multiplier,
        sampling_startp=sampling_startp,
        sampling_endp=sampling_endp,
        timecols=timecols,
        magcols=magcols,
        errcols=errcols,
        lcformat=lcformat,
        lcformatdir=lcformatdir,
        sigclip=sigclip,
        verbose=verbose,
        maxobjects=maxobjects,
        nworkers=nworkers,
    )
def parallel_varfeatures_lcdir(lcdir,
                               outdir,
                               fileglob=None,
                               maxobjects=None,
                               timecols=None,
                               magcols=None,
                               errcols=None,
                               recursive=True,
                               mindet=1000,
                               lcformat='hat-sql',
                               lcformatdir=None,
                               nworkers=NCPUS):
    '''Finds light curves in a directory and hands them to
    `parallel_varfeatures` for variable feature extraction.

    Parameters
    ----------

    lcdir : str
        The directory of light curve files to process.

    outdir : str
        The directory where the output varfeatures pickle files will be
        written.

    fileglob : str or None
        The file glob to use when looking for light curve files in `lcdir`.
        If this is None (or otherwise empty), the default file glob of the
        light curve format is used.

    maxobjects : int
        The number of LCs to process; passed through to
        `parallel_varfeatures`.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    recursive : bool
        If this is exactly False, only `lcdir` itself is searched using the
        file glob. Otherwise, the search uses a `**` path component with
        `glob.glob(..., recursive=True)` so sub-directories are included.

    mindet : int
        The minimum number of LC points required to generate variability
        features.

    lcformat : str
        The `formatkey` of your light curve format, as previously passed to
        `lcproc.register_lcformat`. Used here to get the default file glob.

    lcformatdir : str or None
        A directory containing lcformat description JSONs, if these are
        stored outside the usual directories lcproc searches.

    nworkers : int
        The number of parallel workers to launch.

    Returns
    -------

    dict or None
        Whatever `parallel_varfeatures` returns for the light curves found.
        Returns None if the light curve format can't be figured out or if no
        light curve files match.

    '''

    try:

        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)

        if not formatinfo:
            LOGERROR("can't figure out the light curve format")
            return None

        # only the default file glob is needed here; the full unpack also
        # confirms the format description has the expected seven elements
        (default_glob, _reader,
         _tcols, _mcols, _ecols,
         _fluxflag, _norm) = formatinfo

    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # fall back to the format's own glob if none was provided
    lcglob = fileglob if fileglob else default_glob

    # look for the light curves
    LOGINFO('searching for %s light curves in %s ...' % (lcformat, lcdir))

    if recursive is False:
        searchpattern = os.path.join(lcdir, lcglob)
        lcfiles = glob.glob(searchpattern)
    else:
        searchpattern = os.path.join(lcdir, '**', lcglob)
        lcfiles = glob.glob(searchpattern, recursive=True)

    # nothing found, nothing to do
    if not lcfiles:
        LOGERROR('no light curve files in %s format found in %s' %
                 (lcformat, lcdir))
        return None

    LOGINFO('found %s light curves, getting varfeatures...' % len(lcfiles))

    return parallel_varfeatures(lcfiles,
                                outdir,
                                maxobjects=maxobjects,
                                timecols=timecols,
                                magcols=magcols,
                                errcols=errcols,
                                mindet=mindet,
                                lcformat=lcformat,
                                lcformatdir=lcformatdir,
                                nworkers=nworkers)
def serial_periodicfeatures(
        pfpkl_list,
        lcbasedir,
        outdir,
        starfeaturesdir=None,
        fourierorder=5,
        # these are depth, duration, ingress duration
        transitparams=(-0.01, 0.1, 0.1),
        # these are depth, duration, depth ratio, secphase
        ebparams=(-0.2, 0.3, 0.7, 0.5),
        pdiff_threshold=1.0e-4,
        sidereal_threshold=1.0e-4,
        sampling_peak_multiplier=5.0,
        sampling_startp=None,
        sampling_endp=None,
        starfeatures=None,
        timecols=None,
        magcols=None,
        errcols=None,
        lcformat='hat-sql',
        lcformatdir=None,
        sigclip=10.0,
        verbose=False,
        maxobjects=None):
    '''This drives the periodicfeatures collection for a list of periodfinding
    pickles, processing them one after the other.

    Parameters
    ----------

    pfpkl_list : list of str
        The list of period-finding pickles to use.

    lcbasedir : str
        The base directory where the associated light curves are located.

    outdir : str
        The directory where the results will be written. This is created if
        it doesn't exist.

    starfeaturesdir : str or None
        The directory containing the `starfeatures-<objectid>.pkl` files for
        each object to use calculate neighbor proximity light curve features.
        For each period-finding pickle, the matching starfeatures pickle is
        looked up here by replacing 'periodfinding' with 'starfeatures' in
        the pickle's basename (with and without a '.gz' extension).

    fourierorder : int
        The Fourier order to use to generate sinusoidal function and fit that
        to the phased light curve.

    transitparams : list of floats
        The transit depth, duration, and ingress duration to use to generate a
        trapezoid planet transit model fit to the phased light curve.

    ebparams : list of floats
        The primary eclipse depth, eclipse duration, the primary-secondary
        depth ratio, and the phase of the secondary eclipse to use to generate
        an eclipsing binary model fit to the phased light curve.

    pdiff_threshold : float
        This is the max difference between periods to consider them the same.

    sidereal_threshold : float
        This is the max difference between any of the 'best' periods and the
        sidereal day periods to consider them the same.

    sampling_peak_multiplier : float
        This is the minimum multiplicative factor of a 'best' period's
        normalized periodogram peak over the sampling periodogram peak at the
        same period required to accept the 'best' period as possibly real.

    sampling_startp, sampling_endp : float
        If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
        periodogram, it will be obtained automatically. Use these kwargs to
        control the minimum and maximum period interval to be searched when
        generating this periodogram.

    starfeatures : str or None
        Not used by this function; the per-object starfeatures pickles are
        found via `starfeaturesdir` instead. Kept in the signature for
        backwards compatibility.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format,
        which you previously passed in to the `lcproc.register_lcformat`
        function.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed
        using the number provided as the sigma-multiplier to cut out from the
        input time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values.

        If `sigclip` is None, no sigma-clipping will be performed.

    verbose : bool
        If True, will indicate progress while working.

    maxobjects : int
        The total number of objects to process from `pfpkl_list`.

    Returns
    -------

    Nothing.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            # nothing from the format description is used directly here;
            # the unpack confirms it has the expected seven elements
            (_fileglob, _readerfunc,
             _dtimecols, _dmagcols, _derrcols,
             _magsarefluxes, _normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # make sure to make the output directory if it doesn't exist
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if maxobjects:
        pfpkl_list = pfpkl_list[:maxobjects]

    LOGINFO('%s periodfinding pickles to process' % len(pfpkl_list))

    # if the starfeaturedir is provided, try to find a starfeatures pickle for
    # each periodfinding pickle in pfpkl_list
    if starfeaturesdir and os.path.exists(starfeaturesdir):

        starfeatures_list = []

        LOGINFO('collecting starfeatures pickles...')

        for pfpkl in pfpkl_list:

            sfpkl1 = os.path.basename(pfpkl).replace('periodfinding',
                                                     'starfeatures')
            sfpkl2 = sfpkl1.replace('.gz', '')

            sfpath1 = os.path.join(starfeaturesdir, sfpkl1)
            sfpath2 = os.path.join(starfeaturesdir, sfpkl2)

            # store the path that was actually checked, not just the
            # basename: the task below doesn't carry starfeaturesdir, so a
            # bare basename could only be found if the current working
            # directory happened to be starfeaturesdir
            if os.path.exists(sfpath1):
                starfeatures_list.append(sfpath1)
            elif os.path.exists(sfpath2):
                starfeatures_list.append(sfpath2)
            else:
                starfeatures_list.append(None)

    else:

        starfeatures_list = [None] * len(pfpkl_list)

    # generate the task list
    kwargs = {'fourierorder': fourierorder,
              'transitparams': transitparams,
              'ebparams': ebparams,
              'pdiff_threshold': pdiff_threshold,
              'sidereal_threshold': sidereal_threshold,
              'sampling_peak_multiplier': sampling_peak_multiplier,
              'sampling_startp': sampling_startp,
              'sampling_endp': sampling_endp,
              'timecols': timecols,
              'magcols': magcols,
              'errcols': errcols,
              'lcformat': lcformat,
              'lcformatdir': lcformatdir,
              'sigclip': sigclip,
              'verbose': verbose}

    tasks = [(pfpkl, lcbasedir, outdir, sfpkl, kwargs)
             for pfpkl, sfpkl in zip(pfpkl_list, starfeatures_list)]

    LOGINFO('processing periodfinding pickles...')

    for task in tqdm(tasks):
        _periodicfeatures_worker(task)
def get_periodicfeatures(
        pfpickle,
        lcbasedir,
        outdir,
        fourierorder=5,
        # these are depth, duration, ingress duration
        transitparams=(-0.01, 0.1, 0.1),
        # these are depth, duration, depth ratio, secphase
        ebparams=(-0.2, 0.3, 0.7, 0.5),
        pdiff_threshold=1.0e-4,
        sidereal_threshold=1.0e-4,
        sampling_peak_multiplier=5.0,
        sampling_startp=None,
        sampling_endp=None,
        starfeatures=None,
        timecols=None,
        magcols=None,
        errcols=None,
        lcformat='hat-sql',
        lcformatdir=None,
        sigclip=10.0,
        verbose=True,
        raiseonfail=False
):
    '''This gets all periodic features for the object.

    Parameters
    ----------

    pfpickle : str
        The period-finding result pickle containing period-finder results to
        use for the calculation of LC fit, periodogram, and phased LC
        features. May be gzipped (detected by a '.gz' extension). Only load
        pickles from sources you trust.

    lcbasedir : str
        The base directory where the light curve for the current object is
        located.

    outdir : str
        The output directory where the results will be written.

    fourierorder : int
        The Fourier order to use to generate sinusoidal function and fit that
        to the phased light curve.

    transitparams : list of floats
        The transit depth, duration, and ingress duration to use to generate a
        trapezoid planet transit model fit to the phased light curve.

    ebparams : list of floats
        The primary eclipse depth, eclipse duration, the primary-secondary
        depth ratio, and the phase of the secondary eclipse to use to generate
        an eclipsing binary model fit to the phased light curve.

    pdiff_threshold : float
        This is the max difference between periods to consider them the same.

    sidereal_threshold : float
        This is the max difference between any of the 'best' periods and the
        sidereal day periods to consider them the same.

    sampling_peak_multiplier : float
        This is the minimum multiplicative factor of a 'best' period's
        normalized periodogram peak over the sampling periodogram peak at the
        same period required to accept the 'best' period as possibly real.

    sampling_startp, sampling_endp : float
        If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
        periodogram, it will be obtained automatically. Use these kwargs to
        control the minimum and maximum period interval to be searched when
        generating this periodogram.

    starfeatures : str or None
        If not None, this should be the filename of the
        `starfeatures-<objectid>.pkl` created by
        :py:func:`astrobase.lcproc.lcsfeatures.get_starfeatures` for this
        object. This is used to get the neighbor's light curve and phase it
        with this object's period to see if this object is blended.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.
        If None, the timecols recorded in the period-finding pickle's
        'kwargs' are used; if those are unavailable, the defaults of the
        light curve format are used. The same applies to `magcols` and
        `errcols`.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format,
        which you previously passed in to the `lcproc.register_lcformat`
        function.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed
        using the number provided as the sigma-multiplier to cut out from the
        input time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values.

        If `sigclip` is None, no sigma-clipping will be performed.

    verbose : bool
        If True, will indicate progress while working.

    raiseonfail : bool
        If True, will re-raise any Exception encountered during feature
        extraction instead of returning None.

    Returns
    -------

    str or None
        Returns a filename for the output pickle containing all of the
        periodic features for the input object's LC. The pickled dict has one
        `periodicfeatures-<magcol>` key per processed magcol (None if that
        magcol had fewer than 1000 usable measurements). Returns None if the
        LC format can't be resolved, the light curve file can't be found, or
        if feature extraction fails and `raiseonfail` is False.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (fileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # open the pfpickle. pickles must be read in binary mode, and the
    # context manager makes sure the file is closed even if loading fails.
    # SECURITY: unpickling runs arbitrary code; only use trusted pickles.
    pfopener = gzip.open if pfpickle.endswith('.gz') else open
    with pfopener(pfpickle, 'rb') as infd:
        pf = pickle.load(infd)

    lcfile = os.path.join(lcbasedir, pf['lcfbasename'])
    objectid = pf['objectid']

    pfkwargs = pf['kwargs'] if 'kwargs' in pf else None

    # column selection priority: explicitly provided columns, then the ones
    # that were given to the period-finder, then the lcformat defaults. the
    # final fallback applies whenever nothing usable was found, including
    # when the pickle's kwargs lack the key or recorded it as None.
    if timecols is None and pfkwargs and 'timecols' in pfkwargs:
        timecols = pfkwargs['timecols']
    if not timecols:
        timecols = dtimecols

    if magcols is None and pfkwargs and 'magcols' in pfkwargs:
        magcols = pfkwargs['magcols']
    if not magcols:
        magcols = dmagcols

    if errcols is None and pfkwargs and 'errcols' in pfkwargs:
        errcols = pfkwargs['errcols']
    if not errcols:
        errcols = derrcols

    # check if the light curve file exists
    if not os.path.exists(lcfile):
        LOGERROR("can't find LC %s for object %s" % (lcfile, objectid))
        return None

    # check if we have neighbors we can get the LCs for
    nbrlcf = None

    if starfeatures is not None and os.path.exists(starfeatures):

        # SECURITY: same caveat as above, only use trusted pickles
        with open(starfeatures, 'rb') as infd:
            starfeat = pickle.load(infd)

        if starfeat['closestnbrlcfname'].size > 0:

            nbr_full_lcf = starfeat['closestnbrlcfname'][0]
            nbr_local_lcf = os.path.join(lcbasedir,
                                         os.path.basename(nbr_full_lcf))

            # check for this LC in the lcbasedir
            if os.path.exists(nbr_local_lcf):
                nbrlcf = nbr_local_lcf

            # if it's not there, check for this file at the full LC location
            elif os.path.exists(nbr_full_lcf):
                nbrlcf = nbr_full_lcf

            # otherwise, we can't find it, so complain
            else:
                LOGWARNING("can't find neighbor light curve file: %s in "
                           "its original directory: %s, or in this object's "
                           "lcbasedir: %s, skipping neighbor processing..." %
                           (os.path.basename(nbr_full_lcf),
                            os.path.dirname(nbr_full_lcf),
                            lcbasedir))

    have_nbr = nbrlcf is not None

    # now, start processing for periodic feature extraction
    try:

        # get the object LC into a dict
        lcdict = readerfunc(lcfile)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if (isinstance(lcdict, (list, tuple)) and
                isinstance(lcdict[0], dict)):
            lcdict = lcdict[0]

        # get the nbr object LC into a dict if there is one
        if have_nbr:

            nbrlcdict = readerfunc(nbrlcf)

            if (isinstance(nbrlcdict, (list, tuple)) and
                    isinstance(nbrlcdict[0], dict)):
                nbrlcdict = nbrlcdict[0]

        # this will be the output file
        outfile = os.path.join(
            outdir,
            'periodicfeatures-%s.pkl' % squeeze(objectid).replace(' ', '-')
        )

        # normalize using the special function if specified
        if normfunc is not None:
            lcdict = normfunc(lcdict)
            if have_nbr:
                nbrlcdict = normfunc(nbrlcdict)

        resultdict = {}

        for tcol, mcol, ecol in zip(timecols, magcols, errcols):

            # dereference the (possibly dotted, i.e. nested) column keys;
            # str.split returns a one-element list when there is no '.'
            tcolget = tcol.split('.')
            mcolget = mcol.split('.')
            ecolget = ecol.split('.')

            times = _dict_get(lcdict, tcolget)
            mags = _dict_get(lcdict, mcolget)
            errs = _dict_get(lcdict, ecolget)

            #
            # filter out nans, etc. from the object and any neighbor LC
            #

            # get the finite values
            finind = (np.isfinite(times) &
                      np.isfinite(mags) &
                      np.isfinite(errs))
            ftimes, fmags, ferrs = times[finind], mags[finind], errs[finind]

            # get nonzero errors
            nzind = np.nonzero(ferrs)
            ftimes, fmags, ferrs = ftimes[nzind], fmags[nzind], ferrs[nzind]

            # normalize here if not using special normalization
            if normfunc is None:
                times, mags = normalize_magseries(
                    ftimes, fmags,
                    magsarefluxes=magsarefluxes
                )
                errs = ferrs
            else:
                times, mags, errs = ftimes, fmags, ferrs

            # apply the same treatment to the neighbor LC if there is one
            nbrtimes, nbrmags, nbrerrs = None, None, None

            if have_nbr:

                nbrtimes = _dict_get(nbrlcdict, tcolget)
                nbrmags = _dict_get(nbrlcdict, mcolget)
                nbrerrs = _dict_get(nbrlcdict, ecolget)

                nfinind = (np.isfinite(nbrtimes) &
                           np.isfinite(nbrmags) &
                           np.isfinite(nbrerrs))
                nbrftimes, nbrfmags, nbrferrs = (nbrtimes[nfinind],
                                                 nbrmags[nfinind],
                                                 nbrerrs[nfinind])

                nnzind = np.nonzero(nbrferrs)
                nbrftimes, nbrfmags, nbrferrs = (nbrftimes[nnzind],
                                                 nbrfmags[nnzind],
                                                 nbrferrs[nnzind])

                if normfunc is None:
                    nbrtimes, nbrmags = normalize_magseries(
                        nbrftimes, nbrfmags,
                        magsarefluxes=magsarefluxes
                    )
                    nbrerrs = nbrferrs
                else:
                    # use the filtered neighbor arrays here as well, so the
                    # neighbor LC gets the same finite/nonzero-error
                    # filtering as the object LC when a custom normfunc is
                    # in use
                    nbrtimes, nbrmags, nbrerrs = (nbrftimes,
                                                  nbrfmags,
                                                  nbrferrs)

            featkey = 'periodicfeatures-%s' % mcol

            if times.size > 999:

                #
                # now we have times, mags, errs (and nbrtimes, nbrmags,
                # nbrerrs)
                #
                available_pfmethods = []
                available_pgrams = []
                available_bestperiods = []

                for k in pf[mcol].keys():

                    if k in PFMETHODS:

                        available_pgrams.append(pf[mcol][k])

                        # the 'win' entry contributes its periodogram only,
                        # not a method/best period to fit
                        if k != 'win':
                            available_pfmethods.append(
                                pf[mcol][k]['method']
                            )
                            available_bestperiods.append(
                                pf[mcol][k]['bestperiod']
                            )

                #
                # process periodic features for this magcol
                #
                resultdict[featkey] = {}

                # first, handle the periodogram features
                pgramfeat = periodicfeatures.periodogram_features(
                    available_pgrams, times, mags, errs,
                    sigclip=sigclip,
                    pdiff_threshold=pdiff_threshold,
                    sidereal_threshold=sidereal_threshold,
                    sampling_peak_multiplier=sampling_peak_multiplier,
                    sampling_startp=sampling_startp,
                    sampling_endp=sampling_endp,
                    verbose=verbose
                )
                resultdict[featkey].update(pgramfeat)
                resultdict[featkey]['pfmethods'] = available_pfmethods

                # then for each bestperiod, get phasedlc and lcfit features
                for pfm, bp in zip(available_pfmethods,
                                   available_bestperiods):

                    resultdict[featkey][pfm] = periodicfeatures.lcfit_features(
                        times, mags, errs, bp,
                        fourierorder=fourierorder,
                        transitparams=transitparams,
                        ebparams=ebparams,
                        sigclip=sigclip,
                        magsarefluxes=magsarefluxes,
                        verbose=verbose
                    )

                    phasedlcfeat = periodicfeatures.phasedlc_features(
                        times, mags, errs, bp,
                        nbrtimes=nbrtimes,
                        nbrmags=nbrmags,
                        nbrerrs=nbrerrs
                    )

                    resultdict[featkey][pfm].update(phasedlcfeat)

            else:

                LOGERROR('not enough finite measurements in magcol: %s, for '
                         'pfpickle: %s, skipping this magcol'
                         % (mcol, pfpickle))
                resultdict[featkey] = None

        #
        # end of per magcol processing
        #

        # write resultdict to pickle
        with open(outfile, 'wb') as outfd:
            pickle.dump(resultdict, outfd, pickle.HIGHEST_PROTOCOL)

        return outfile

    except Exception:

        LOGEXCEPTION('failed to run for pf: %s, lcfile: %s' %
                     (pfpickle, lcfile))
        if raiseonfail:
            raise
        else:
            return None
def parallel_epd_lcdir(lcdir,
                       externalparams,
                       lcfileglob=None,
                       timecols=None,
                       magcols=None,
                       errcols=None,
                       lcformat='hat-sql',
                       lcformatdir=None,
                       epdsmooth_sigclip=3.0,
                       epdsmooth_windowsize=21,
                       epdsmooth_func=smooth_magseries_savgol,
                       epdsmooth_extraparams=None,
                       nworkers=NCPUS,
                       maxworkertasks=1000):
    '''This applies EPD in parallel to all LCs in a directory.

    The light curves are found by globbing `lcdir` (non-recursively) and the
    sorted list of matches is handed to `parallel_epd_lclist` together with all
    of the other arguments.

    Parameters
    ----------

    lcdir : str
        The light curve directory to process.

    externalparams : dict or None
        This is a dict that indicates which keys in the lcdict obtained from the
        lcfile correspond to the required external parameters. As with timecol,
        magcol, and errcol, these can be simple keys (e.g. 'rjd') or compound
        keys ('magaperture1.mags'). The dict should look something like::

          {'fsv':'<lcdict key>' array: S values for each observation,
           'fdv':'<lcdict key>' array: D values for each observation,
           'fkv':'<lcdict key>' array: K values for each observation,
           'xcc':'<lcdict key>' array: x coords for each observation,
           'ycc':'<lcdict key>' array: y coords for each observation,
           'bgv':'<lcdict key>' array: sky background for each observation,
           'bge':'<lcdict key>' array: sky background err for each observation,
           'iha':'<lcdict key>' array: hour angle for each observation,
           'izd':'<lcdict key>' array: zenith distance for each observation}

    lcfileglob : str or None
        A UNIX fileglob to use to select light curve files in `lcdir`. If this
        is not None, the value provided will override the default fileglob for
        your light curve format.

    timecols,magcols,errcols : lists of str
        The keys in the lcdict produced by your light curve reader function that
        correspond to the times, mags/fluxes, and associated measurement errors
        that will be used as inputs to the EPD process. If these are None, the
        default values for `timecols`, `magcols`, and `errcols` for your light
        curve format will be used here.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up the default file glob used to
        find the light curves in `lcdir`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    epdsmooth_sigclip : float or int or sequence of two floats/ints or None
        This specifies how to sigma-clip the input LC before fitting the EPD
        function to it.

        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    epdsmooth_windowsize : int
        This is the number of LC points to smooth over to generate a smoothed
        light curve that will be used to fit the EPD function.

    epdsmooth_func : Python function
        This sets the smoothing filter function to use. A Savitzky-Golay filter
        is used to smooth the light curve by default. The functions that can be
        used with this kwarg are listed in `varbase.trends`. If you want to use
        your own function, it MUST have the following signature::

                def smoothfunc(mags_array, window_size, **extraparams)

        and return a numpy array of the same size as `mags_array` with the
        smoothed time-series. Any extra params can be provided using the
        `extraparams` dict.

    epdsmooth_extraparams : dict
        This is a dict of any extra filter params to supply to the smoothing
        function.

    nworkers : int
        The number of parallel workers to launch when processing the LCs.

    maxworkertasks : int
        The maximum number of tasks a parallel worker will complete before it is
        replaced with a new one (sometimes helps with memory-leaks).

    Returns
    -------

    dict or None
        Whatever `parallel_epd_lclist` returns for the matched light curves
        (a dict organized by all the keys in the input `magcols` list,
        containing lists of EPD pickle light curves for that `magcol`), or None
        if the light curve format could not be resolved.

    Notes
    -----

    - S -> measure of PSF sharpness (~1/sigma^2 so smaller S = wider PSF)
    - D -> measure of PSF ellipticity in xy direction
    - K -> measure of PSF ellipticity in cross direction

    S, D, K are related to the PSF's variance and covariance, see eqn 30-33 in
    A. Pal's thesis: https://arxiv.org/abs/0906.3486

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            # only the default file glob (first element) is needed here; the
            # reader function and default columns are looked up again
            # downstream from `lcformat`
            fileglob = formatinfo[0]
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # find all the files matching the lcglob in lcdir
    if lcfileglob is None:
        lcfileglob = fileglob

    # sorted so the processing order is deterministic
    lclist = sorted(glob.glob(os.path.join(lcdir, lcfileglob)))

    # lcformatdir is forwarded too, otherwise a format JSON living in a custom
    # directory resolves here but not in the per-LC workers.
    # NOTE(review): assumes parallel_epd_lclist accepts `lcformatdir` like the
    # other list-based drivers in this package -- confirm against its signature.
    return parallel_epd_lclist(
        lclist,
        externalparams,
        timecols=timecols,
        magcols=magcols,
        errcols=errcols,
        lcformat=lcformat,
        lcformatdir=lcformatdir,
        epdsmooth_sigclip=epdsmooth_sigclip,
        epdsmooth_windowsize=epdsmooth_windowsize,
        epdsmooth_func=epdsmooth_func,
        epdsmooth_extraparams=epdsmooth_extraparams,
        nworkers=nworkers,
        maxworkertasks=maxworkertasks
    )
def _epd_lcdict_column(lcdict, key):
    '''Looks up a simple or compound (dotted) key in an lcdict.

    The key is first tried as-is, so flat keys (including flat keys that happen
    to contain a '.') keep working exactly as a plain `lcdict[key]` lookup. Only
    if that fails and the key is a dotted string is it treated as a path into
    nested dicts, e.g. 'magaperture1.mags' -> lcdict['magaperture1']['mags'].

    Raises KeyError if the key cannot be resolved either way.

    '''

    try:
        return lcdict[key]
    except KeyError:
        if not isinstance(key, str) or '.' not in key:
            raise

    item = lcdict
    for part in key.split('.'):
        item = item[part]
    return item


def apply_epd_magseries(lcfile,
                        timecol,
                        magcol,
                        errcol,
                        externalparams,
                        lcformat='hat-sql',
                        lcformatdir=None,
                        epdsmooth_sigclip=3.0,
                        epdsmooth_windowsize=21,
                        epdsmooth_func=smooth_magseries_savgol,
                        epdsmooth_extraparams=None):
    '''This applies external parameter decorrelation (EPD) to a light curve.

    Parameters
    ----------

    lcfile : str
        The filename of the light curve file to process.

    timecol,magcol,errcol : str
        The keys in the lcdict produced by your light curve reader function that
        correspond to the times, mags/fluxes, and associated measurement errors
        that will be used as input to the EPD process. These can be simple keys
        (e.g. 'rjd') or compound keys ('magaperture1.mags').

    externalparams : dict or None
        This is a dict that indicates which keys in the lcdict obtained from the
        lcfile correspond to the required external parameters. As with timecol,
        magcol, and errcol, these can be simple keys (e.g. 'rjd') or compound
        keys ('magaperture1.mags'). The dict should look something like::

          {'fsv':'<lcdict key>' array: S values for each observation,
           'fdv':'<lcdict key>' array: D values for each observation,
           'fkv':'<lcdict key>' array: K values for each observation,
           'xcc':'<lcdict key>' array: x coords for each observation,
           'ycc':'<lcdict key>' array: y coords for each observation,
           'bgv':'<lcdict key>' array: sky background for each observation,
           'bge':'<lcdict key>' array: sky background err for each observation,
           'iha':'<lcdict key>' array: hour angle for each observation,
           'izd':'<lcdict key>' array: zenith distance for each observation}

        Alternatively, if these exact keys are already present in the lcdict,
        indicate this by setting externalparams to None.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to read `lcfile`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    epdsmooth_sigclip : float or int or sequence of two floats/ints or None
        This specifies how to sigma-clip the input LC before fitting the EPD
        function to it.

        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    epdsmooth_windowsize : int
        This is the number of LC points to smooth over to generate a smoothed
        light curve that will be used to fit the EPD function.

    epdsmooth_func : Python function
        This sets the smoothing filter function to use. A Savitzky-Golay filter
        is used to smooth the light curve by default. The functions that can be
        used with this kwarg are listed in `varbase.trends`. If you want to use
        your own function, it MUST have the following signature::

                def smoothfunc(mags_array, window_size, **extraparams)

        and return a numpy array of the same size as `mags_array` with the
        smoothed time-series. Any extra params can be provided using the
        `extraparams` dict.

    epdsmooth_extraparams : dict
        This is a dict of any extra filter params to supply to the smoothing
        function.

    Returns
    -------

    str or None
        Writes the output EPD light curve to a pickle that contains the lcdict
        with an added `lcdict['epd']` key holding the result of
        `epd_magseries`. The pickle is written next to `lcfile` as
        `<objectid>-epd-<magcol>-pklc.pkl` and its filename is returned. Returns
        None if the light curve format could not be resolved.

    Notes
    -----

    - S -> measure of PSF sharpness (~1/sigma^2 so smaller S = wider PSF)
    - D -> measure of PSF ellipticity in xy direction
    - K -> measure of PSF ellipticity in cross direction

    S, D, K are related to the PSF's variance and covariance, see eqn 30-33 in
    A. Pal's thesis: https://arxiv.org/abs/0906.3486

    The format's normalization function is not applied here; the columns are
    passed to `epd_magseries` as read from the light curve.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    lcdict = readerfunc(lcfile)

    # some readers return (lcdict, extra...) -- the first element is assumed to
    # be the actual lcdict
    if (isinstance(lcdict, (tuple, list)) and
            isinstance(lcdict[0], dict)):
        lcdict = lcdict[0]

    objectid = lcdict['objectid']

    # dereference the (possibly compound) column keys
    times = _epd_lcdict_column(lcdict, timecol)
    mags = _epd_lcdict_column(lcdict, magcol)
    errs = _epd_lcdict_column(lcdict, errcol)

    # the external parameters, in the positional order epd_magseries expects
    extkeys = ('fsv', 'fdv', 'fkv', 'xcc', 'ycc', 'bgv', 'bge', 'iha', 'izd')

    if externalparams is not None:
        extcols = [_epd_lcdict_column(lcdict, externalparams[extkey])
                   for extkey in extkeys]
    else:
        # the lcdict is expected to carry these exact keys at its top level
        extcols = [lcdict[extkey] for extkey in extkeys]

    fsv, fdv, fkv, xcc, ycc, bgv, bge, iha, izd = extcols

    # apply the corrections for EPD
    epd = epd_magseries(
        times,
        mags,
        errs,
        fsv, fdv, fkv, xcc, ycc, bgv, bge, iha, izd,
        magsarefluxes=magsarefluxes,
        epdsmooth_sigclip=epdsmooth_sigclip,
        epdsmooth_windowsize=epdsmooth_windowsize,
        epdsmooth_func=epdsmooth_func,
        epdsmooth_extraparams=epdsmooth_extraparams
    )

    # save the EPD magseries to a pickle LC
    lcdict['epd'] = epd
    outfile = os.path.join(
        os.path.dirname(lcfile),
        '%s-epd-%s-pklc.pkl' % (
            squeeze(objectid).replace(' ', '-'),
            magcol
        )
    )
    with open(outfile, 'wb') as outfd:
        pickle.dump(lcdict, outfd,
                    protocol=pickle.HIGHEST_PROTOCOL)

    return outfile
def runpf(lcfile,
          outdir,
          timecols=None,
          magcols=None,
          errcols=None,
          lcformat='hat-sql',
          lcformatdir=None,
          pfmethods=('gls','pdm','mav','win'),
          pfkwargs=({},{},{},{}),
          sigclip=10.0,
          getblssnr=False,
          nworkers=NCPUS,
          minobservations=500,
          excludeprocessed=False,
          raiseonfail=False):
    '''This runs the period-finding for a single LC.

    Parameters
    ----------

    lcfile : str
        The light curve file to run period-finding on.

    outdir : str
        The output directory where the result pickle will go.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to read `lcfile`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    pfmethods : list of str
        This is a list of period finding methods to run. Each element is a
        string matching the keys of the `PFMETHODS` dict above. By default, this
        runs GLS, PDM, AoVMH, and the spectral window Lomb-Scargle periodogram.

    pfkwargs : list of dicts
        This is used to provide any special kwargs as dicts to each
        period-finding method function specified in `pfmethods`. These dicts
        are copied before use and are never modified by this function. The
        `verbose`, `nworkers`, `magsarefluxes`, and `sigclip` kwargs are always
        set by this function and override any values given here.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    getblssnr : bool
        If this is True and BLS is one of the methods specified in `pfmethods`,
        will also calculate the stats for each best period in the BLS results:
        transit depth, duration, ingress duration, refit period and epoch, and
        the SNR of the transit.

    nworkers : int
        The number of parallel period-finding workers to launch.

    minobservations : int
        The minimum number of finite LC points required to process a light
        curve.

    excludeprocessed : bool
        If this is True, light curves that have existing period-finding result
        pickles in `outdir` will not be processed.

        FIXME: currently, this uses a dumb method of excluding already-processed
        files. A smarter way to do this is to (i) generate a SHA512 cachekey
        based on a repr of `{'lcfile', 'timecols', 'magcols', 'errcols',
        'lcformat', 'pfmethods', 'sigclip', 'getblssnr', 'pfkwargs'}`, (ii) make
        sure all list kwargs in the dict are sorted, (iii) check if the output
        file has the same cachekey in its filename (last 8 chars of cachekey
        should work), so the result was processed in exactly the same way as
        specified in the input to this function, and can therefore be
        ignored. Will implement this later.

    raiseonfail : bool
        If something fails and this is True, will raise an Exception instead of
        returning None at the end.

    Returns
    -------

    str or None
        The path to the output period-finding result pickle. Returns None if
        the light curve format could not be resolved, or if processing failed
        and `raiseonfail` is False.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    try:

        # get the LC into a dict
        lcdict = readerfunc(lcfile)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if (isinstance(lcdict, (list, tuple)) and
                isinstance(lcdict[0], dict)):
            lcdict = lcdict[0]

        outfile = os.path.join(
            outdir,
            'periodfinding-%s.pkl' %
            squeeze(lcdict['objectid']).replace(' ', '-')
        )

        # if excludeprocessed is True, return the output file if it exists and
        # has a size that is at least 100 kilobytes (this should be enough to
        # contain the minimal results of this function).
        if excludeprocessed:

            outfile_gz = outfile + '.gz'

            if (os.path.exists(outfile) and
                    os.stat(outfile).st_size > 102400):

                LOGWARNING('periodfinding result for %s already exists at %s, '
                           'skipping because excludeprocessed=True'
                           % (lcfile, outfile))
                return outfile

            elif (os.path.exists(outfile_gz) and
                  os.stat(outfile_gz).st_size > 102400):

                LOGWARNING(
                    'gzipped periodfinding result for %s already '
                    'exists at %s, skipping because excludeprocessed=True'
                    % (lcfile, outfile_gz)
                )
                return outfile_gz

        # this is the final returndict
        resultdict = {
            'objectid':lcdict['objectid'],
            'lcfbasename':os.path.basename(lcfile),
            'kwargs':{'timecols':timecols,
                      'magcols':magcols,
                      'errcols':errcols,
                      'lcformat':lcformat,
                      'lcformatdir':lcformatdir,
                      'pfmethods':pfmethods,
                      'pfkwargs':pfkwargs,
                      'sigclip':sigclip,
                      'getblssnr':getblssnr}
        }

        # normalize using the special function if specified
        if normfunc is not None:
            lcdict = normfunc(lcdict)

        for tcol, mcol, ecol in zip(timecols, magcols, errcols):

            # dereference the (possibly compound 'a.b') columns and get them
            # from the lcdict; splitting a key without a '.' yields [key]
            times = _dict_get(lcdict, tcol.split('.'))
            mags = _dict_get(lcdict, mcol.split('.'))
            errs = _dict_get(lcdict, ecol.split('.'))

            # normalize here if not using special normalization
            if normfunc is None:
                times, mags = normalize_magseries(
                    times, mags,
                    magsarefluxes=magsarefluxes
                )

            # run each of the requested period-finder functions
            resultdict[mcol] = {}

            # check if we have enough non-nan observations to proceed; if not,
            # this magcol keeps an empty result dict
            finmags = mags[np.isfinite(mags)]

            if finmags.size < minobservations:

                LOGERROR('not enough non-nan observations for '
                         'this LC. have: %s, required: %s, '
                         'magcol: %s, skipping...' %
                         (finmags.size, minobservations, mcol))
                continue

            pfmkeys = []

            for pfmind, (pfm, pfkw) in enumerate(zip(pfmethods, pfkwargs)):

                pf_func = PFMETHODS[pfm]

                # get any optional kwargs for this function. these are copied
                # so neither the caller's dicts nor the shared dicts in the
                # default `pfkwargs` argument get modified; this also keeps the
                # 'kwargs' record in resultdict equal to what was passed in
                pf_kwargs = dict(pfkw)
                pf_kwargs.update({'verbose':False,
                                  'nworkers':nworkers,
                                  'magsarefluxes':magsarefluxes,
                                  'sigclip':sigclip})

                # we'll always prefix things with their index to allow multiple
                # invocations and results from the same period-finder (for
                # different period ranges, for example).
                pfmkey = '%s-%s' % (pfmind, pfm)
                pfmkeys.append(pfmkey)

                # run this period-finder and save its results to the output dict
                resultdict[mcol][pfmkey] = pf_func(
                    times, mags, errs,
                    **pf_kwargs
                )

            #
            # done with running the period finders
            #
            # append the pfmkeys list to the magcol dict
            resultdict[mcol]['pfmethods'] = pfmkeys

            # handle the SNR-related keys for any BLS pfresults: these are
            # either calculated (getblssnr=True) or filled with nans
            if 'bls' in pfmethods:

                for pfmk in pfmkeys:

                    if 'bls' not in pfmk:
                        continue

                    # a fresh dict of fresh lists for every BLS result so
                    # results never share list objects
                    null_blssnr = {
                        'snr':[np.nan]*5,
                        'transitdepth':[np.nan]*5,
                        'transitduration':[np.nan]*5,
                    }

                    if not getblssnr:
                        resultdict[mcol][pfmk].update(null_blssnr)
                        continue

                    try:

                        bls = resultdict[mcol][pfmk]

                        # calculate the SNR for the BLS as well
                        blssnr = bls_snr(bls, times, mags, errs,
                                         magsarefluxes=magsarefluxes,
                                         verbose=False)

                        # add the SNR results to the BLS result dict and
                        # replace the best periods and epochs with the refit
                        # values from bls_snr
                        resultdict[mcol][pfmk].update({
                            'snr':blssnr['snr'],
                            'transitdepth':blssnr['transitdepth'],
                            'transitduration':blssnr['transitduration'],
                            'nbestperiods':blssnr['period'],
                            'epochs':blssnr['epoch'],
                        })

                    except Exception:

                        LOGEXCEPTION('could not calculate BLS SNR for %s' %
                                     lcfile)
                        # add the SNR null results to the BLS result dict
                        resultdict[mcol][pfmk].update(null_blssnr)

        # once all mag cols have been processed, write out the pickle
        with open(outfile, 'wb') as outfd:
            pickle.dump(resultdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

        return outfile

    except Exception as e:

        LOGEXCEPTION('failed to run for %s, because: %s' % (lcfile, e))

        if raiseonfail:
            raise

        return None
def parallel_pf_lcdir(lcdir,
                      outdir,
                      fileglob=None,
                      recursive=True,
                      timecols=None,
                      magcols=None,
                      errcols=None,
                      lcformat='hat-sql',
                      lcformatdir=None,
                      pfmethods=('gls','pdm','mav','win'),
                      pfkwargs=({},{},{},{}),
                      sigclip=10.0,
                      getblssnr=False,
                      nperiodworkers=NCPUS,
                      ncontrolworkers=1,
                      liststartindex=None,
                      listmaxobjects=None,
                      minobservations=500,
                      excludeprocessed=True):
    '''This runs parallel light curve period finding for directory of LCs.

    The light curves found in `lcdir` are sorted and handed to `parallel_pf`
    together with all of the other arguments.

    Parameters
    ----------

    lcdir : str
        The directory containing the LCs to process.

    outdir : str
        The directory where the resulting period-finding pickles will go.

    fileglob : str or None
        The UNIX file glob to use to search for LCs in `lcdir`. If None, the
        default file glob associated with the registered LC format will be used
        instead.

    recursive : bool
        If True, will search recursively in `lcdir` (i.e. in `lcdir` itself and
        in all directories below it) for light curves to process. Only an
        explicit False turns the recursive search off.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves in `lcdir`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    pfmethods : list of str
        This is a list of period finding methods to run. Each element is a
        string matching the keys of the `PFMETHODS` dict above. By default, this
        runs GLS, PDM, AoVMH, and the spectral window Lomb-Scargle periodogram.

    pfkwargs : list of dicts
        This is used to provide any special kwargs as dicts to each
        period-finding method function specified in `pfmethods`.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    getblssnr : bool
        If this is True and BLS is one of the methods specified in `pfmethods`,
        will also calculate the stats for each best period in the BLS results:
        transit depth, duration, ingress duration, refit period and epoch, and
        the SNR of the transit.

    nperiodworkers : int
        The number of parallel period-finding workers to launch per object task.

    ncontrolworkers : int
        The number of controlling processes to launch. This effectively sets how
        many of the light curves found will be processed in parallel.

    liststartindex : int or None
        This sets the index from where to start in the sorted list of light
        curves found in `lcdir`.

    listmaxobjects : int or None
        This sets the maximum number of objects in the sorted list of light
        curves to run period-finding for in this invocation. Together with
        `liststartindex`, `listmaxobjects` can be used to distribute processing
        over several independent machines if the number of light curves is very
        large.

    minobservations : int
        The minimum number of finite LC points required to process a light
        curve.

    excludeprocessed : bool
        If this is True, light curves that have existing period-finding result
        pickles in `outdir` will not be processed.

        FIXME: currently, this uses a dumb method of excluding already-processed
        files. A smarter way to do this is to (i) generate a SHA512 cachekey
        based on a repr of `{'lcfile', 'timecols', 'magcols', 'errcols',
        'lcformat', 'pfmethods', 'sigclip', 'getblssnr', 'pfkwargs'}`, (ii) make
        sure all list kwargs in the dict are sorted, (iii) check if the output
        file has the same cachekey in its filename (last 8 chars of cachekey
        should work), so the result was processed in exactly the same way as
        specified in the input to this function, and can therefore be
        ignored. Will implement this later.

    Returns
    -------

    list of str or None
        Whatever `parallel_pf` returns for the light curves found (a list of
        the period-finding pickles created for all of input LCs processed), or
        None if the light curve format could not be resolved or no light curves
        were found.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            # only the default file glob (first element) is needed here
            dfileglob = formatinfo[0]
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if not fileglob:
        fileglob = dfileglob

    # now find the files
    LOGINFO('searching for %s light curves in %s ...' % (lcformat, lcdir))

    if recursive is False:

        matching = glob.glob(os.path.join(lcdir, fileglob))

    # use recursive glob for Python 3.5+
    elif sys.version_info[:2] > (3,4):

        matching = glob.glob(os.path.join(lcdir,
                                          '**',
                                          fileglob),
                             recursive=True)

    # otherwise, use os.walk and glob
    else:

        matching = []

        # glob in every directory os.walk visits. the first `root` is `lcdir`
        # itself, so light curves sitting directly in `lcdir` are found as
        # well, which matches what the recursive '**' glob above does
        for root, _dirs, _files in os.walk(lcdir):
            matching.extend(glob.glob(os.path.join(root, fileglob)))

    # now that we have all the files, process them
    if not matching:

        LOGERROR('no light curve files in %s format found in %s' % (lcformat,
                                                                    lcdir))
        return None

    # this helps us process things in deterministic order when we distribute
    # processing over several machines
    matching = sorted(matching)

    LOGINFO('found %s light curves, running pf...' % len(matching))

    return parallel_pf(matching,
                       outdir,
                       timecols=timecols,
                       magcols=magcols,
                       errcols=errcols,
                       lcformat=lcformat,
                       lcformatdir=lcformatdir,
                       pfmethods=pfmethods,
                       pfkwargs=pfkwargs,
                       getblssnr=getblssnr,
                       sigclip=sigclip,
                       nperiodworkers=nperiodworkers,
                       ncontrolworkers=ncontrolworkers,
                       liststartindex=liststartindex,
                       listmaxobjects=listmaxobjects,
                       minobservations=minobservations,
                       excludeprocessed=excludeprocessed)
def _varthresh_stdev_multiplier(override, magcolinfo, storedkey):
    '''Picks the stdev multiplier to plot for one variability index.

    Array-like overrides are used as they are (converted to np.array). A scalar
    override is used unless it is None or zero, in which case the value stored
    under `storedkey` in the pickle's per-magcol dict is used instead.

    '''

    if override is None:
        return magcolinfo[storedkey]

    # arrays can't be truth-tested, so they're handled before the scalar case
    if np.ndim(override) > 0:
        return np.asarray(override)

    return override or magcolinfo[storedkey]


def _plot_varthresh_panel(subplot,
                          magcolinfo,
                          feature,
                          stdev_multiplier,
                          xlimits,
                          ylabel,
                          title,
                          scale=None):
    '''Draws one 'SDSS r vs. variability index' panel on the current figure.

    This plots the per-object values of `feature`, the binned median, and the
    binned median + `stdev_multiplier` x binned stdev (dashed). If `scale` is
    given, the per-object values and the binned median are multiplied by it;
    the binned stdev is never scaled.

    '''

    plt.subplot(subplot)

    binned_sdssr = magcolinfo['binned_sdssr_median']
    objectvals = magcolinfo[feature]
    binned_median = np.array(magcolinfo['binned_%s_median' % feature])
    binned_stdev = np.array(magcolinfo['binned_%s_stdev' % feature])

    if scale is not None:
        objectvals = np.asarray(objectvals) * scale
        binned_median = binned_median * scale

    plt.plot(binned_sdssr if False else magcolinfo['sdssr'],
             objectvals,
             marker='.', ms=1.0, linestyle='none', rasterized=True)
    plt.plot(binned_sdssr,
             binned_median,
             linewidth=3.0)
    plt.plot(binned_sdssr,
             binned_median + stdev_multiplier * binned_stdev,
             linewidth=3.0, linestyle='dashed')

    plt.xlim(xlimits)
    plt.xlabel('SDSS r')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.yscale('log')
    plt.tight_layout()


def plot_variability_thresholds(varthreshpkl,
                                xmin_lcmad_stdev=5.0,
                                xmin_stetj_stdev=2.0,
                                xmin_iqr_stdev=2.0,
                                xmin_inveta_stdev=2.0,
                                lcformat='hat-sql',
                                lcformatdir=None,
                                magcols=None):
    '''This makes plots for the variability threshold distributions.

    For each magcol, a 2 x 2 figure is made with SDSS r mag plotted against the
    light curve RMS (MAD x 1.483), the Stetson J index, the IQR, and 1/eta. Each
    panel shows the per-object values, the binned median, and the threshold
    (binned median + multiplier x binned stdev) as a dashed line.

    Parameters
    ----------

    varthreshpkl : str
        The variability threshold pickle to plot (e.g. the `outfile` written by
        `variability_threshold`). It must contain a `magbins` array and a dict
        per magcol with the `sdssr`, `lcmad`, `stetsonj`, `iqr`, `inveta`
        arrays and their `binned_*_median` and `binned_*_stdev` counterparts.

    xmin_lcmad_stdev,xmin_stetj_stdev,xmin_iqr_stdev,xmin_inveta_stdev : float or np.array
        Values of the threshold values to override the ones in the
        `varthreshpkl`. If provided, will plot the thresholds accordingly
        instead of using the ones in the input pickle directly. Set any of
        these to None to use the value stored in the pickle.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This is only used to look up the default `magcols`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    magcols : list of str or None
        The magcol keys to use from the lcdict. If None, the default magcols of
        the light curve format are used.

    Returns
    -------

    None
        Nothing is returned. One plot per magcol is written to the current
        working directory as
        `varfeatures-<basename of varthreshpkl>-<magcol>-distributions.png`.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if magcols is None:
        magcols = dmagcols

    # NOTE: unpickling can execute arbitrary code; only use this on threshold
    # pickles from a trusted source
    with open(varthreshpkl, 'rb') as infd:
        allobjects = pickle.load(infd)

    magbins = allobjects['magbins']
    xlimits = (magbins.min() - 0.25, magbins.max())

    # only the file name goes into the plot name; using the full path would
    # put directory separators in the middle of the output file name
    pklname = os.path.basename(varthreshpkl)

    for magcol in magcols:

        magcolinfo = allobjects[magcol]

        min_lcmad_stdev = _varthresh_stdev_multiplier(
            xmin_lcmad_stdev, magcolinfo, 'min_lcmad_stdev'
        )
        min_stetj_stdev = _varthresh_stdev_multiplier(
            xmin_stetj_stdev, magcolinfo, 'min_stetj_stdev'
        )
        min_iqr_stdev = _varthresh_stdev_multiplier(
            xmin_iqr_stdev, magcolinfo, 'min_iqr_stdev'
        )
        min_inveta_stdev = _varthresh_stdev_multiplier(
            xmin_inveta_stdev, magcolinfo, 'min_inveta_stdev'
        )

        plt.figure(figsize=(20, 16))

        # the mag vs lcmad (the MAD is scaled by 1.483 for plotting)
        _plot_varthresh_panel(
            221, magcolinfo, 'lcmad', min_lcmad_stdev, xlimits,
            r'lightcurve RMS (MAD $\times$ 1.483)',
            '%s - SDSS r vs. light curve RMS' % magcol,
            scale=1.483
        )

        # the mag vs stetsonj
        _plot_varthresh_panel(
            222, magcolinfo, 'stetsonj', min_stetj_stdev, xlimits,
            'Stetson J index',
            '%s - SDSS r vs. Stetson J index' % magcol
        )

        # the mag vs IQR
        _plot_varthresh_panel(
            223, magcolinfo, 'iqr', min_iqr_stdev, xlimits,
            'IQR',
            '%s - SDSS r vs. IQR' % magcol
        )

        # the mag vs 1/eta
        _plot_varthresh_panel(
            224, magcolinfo, 'inveta', min_inveta_stdev, xlimits,
            r'$1/\eta$',
            r'%s - SDSS r vs. $1/\eta$' % magcol
        )

        plt.savefig('varfeatures-%s-%s-distributions.png' % (pklname,
                                                             magcol),
                    bbox_inches='tight')
        plt.close('all')
def _varthresh_copy_sigma(sigma):
    '''Returns a private copy of a stdev-multiplier argument.

    Lists, tuples, and np.arrays are copied (tuples become lists) so that the
    NaN fix-ups done during thresholding never write back into the caller's
    object. Basic slicing of an np.array (`arr[::]`) only creates a view, so an
    explicit `.copy()` is needed. Anything else (i.e. a scalar) is returned
    unchanged.

    '''
    if isinstance(sigma, np.ndarray):
        return sigma.copy()
    if isinstance(sigma, (list, tuple)):
        return list(sigma)
    return sigma


def _varthresh_bin_sigma(sigma, magi, label, bin_center):
    '''Returns the stdev multiplier to apply to the magbin with index `magi`.

    Scalars (float, int, numpy scalars) are used as they are. For a list or
    np.array, element `magi` is used; if that element is not finite, it is
    replaced by 2.0 both in the return value and in `sigma` itself (which is
    the local copy made by `_varthresh_copy_sigma`), because `sigma` is later
    saved to the output dict.

    '''
    if not isinstance(sigma, (list, np.ndarray)):
        return sigma

    bin_sigma = sigma[magi]
    if not np.isfinite(bin_sigma):
        LOGWARNING('provided threshold %s stdev '
                   'for magbin: %.3f is nan, using 2.0' %
                   (label, bin_center))
        sigma[magi] = 2.0
        bin_sigma = 2.0
    return bin_sigma


def _varthresh_robust_stats(values):
    '''Returns (median, 1.483 * median absolute deviation) of `values`.'''
    median = np.median(values)
    stdev = np.median(np.abs(values - median)) * 1.483
    return median, stdev


def _varthresh_get_feature(features, magcol, key):
    '''Returns `features[magcol][key]` if present and truthy, else np.nan.'''
    if magcol in features and features[magcol] and features[magcol][key]:
        return features[magcol][key]
    return np.nan


def _varthresh_object_mag(features, magcol):
    '''Returns the magnitude used to assign an object to a magbin.

    Only attempted if the features dict has an 'info' dict with an 'sdssr'
    key. Tried in order: the 'sdssr' value itself (if > 3.0), the median of
    the `magcol` light curve (if > 3.0), then a conversion from J, H, K mags
    via `jhk_to_sdssr`. Returns np.nan if none of these is usable.

    '''
    if not ('info' in features and
            features['info'] and
            'sdssr' in features['info']):
        return np.nan

    info = features['info']

    if info['sdssr'] and info['sdssr'] > 3.0:
        return info['sdssr']

    if (magcol in features and
            features[magcol] and
            'median' in features[magcol] and
            features[magcol]['median'] > 3.0):
        return features[magcol]['median']

    # use .get() so an objectinfo dict without JHK mags yields nan instead of
    # killing the whole run with a KeyError
    if info.get('jmag') and info.get('hmag') and info.get('kmag'):
        return jhk_to_sdssr(info['jmag'], info['hmag'], info['kmag'])

    return np.nan


def _varthresh_unique_objectids(binned_objectids, label):
    '''Returns the unique object IDs over all magbins for one threshold.

    Falls back to an empty np.array (with a warning) if the per-bin arrays
    can't be concatenated, e.g. because there are no magbins at all.

    '''
    try:
        return np.unique(np.concatenate(binned_objectids))
    except ValueError:
        LOGWARNING('not enough variable objects matching %s thresholds' %
                   label)
        return np.array([])


def variability_threshold(featuresdir,
                          outfile,
                          magbins=DEFAULT_MAGBINS,
                          maxobjects=None,
                          timecols=None,
                          magcols=None,
                          errcols=None,
                          lcformat='hat-sql',
                          lcformatdir=None,
                          min_lcmad_stdev=5.0,
                          min_stetj_stdev=2.0,
                          min_iqr_stdev=2.0,
                          min_inveta_stdev=2.0,
                          verbose=True):
    '''This generates a list of objects with stetson J, IQR, and 1.0/eta
    above some threshold value to select them as potential variable stars.

    Use this to pare down the objects to review and put through
    period-finding. This does the thresholding per magnitude bin; this should
    be better than one single cut through the entire magnitude range. Set the
    magnitude bins using the magbins kwarg.

    FIXME: implement a voting classifier here. this will choose variables based
    on the thresholds in IQR, stetson, and inveta based on weighting carried
    over from the variability recovery sims.

    Parameters
    ----------

    featuresdir : str
        This is the directory containing variability feature pickles created
        by :py:func:`astrobase.lcproc.lcpfeatures.parallel_varfeatures` or
        similar. Files matching `varfeatures-*.pkl` are read with
        `pickle.load`, so only point this at pickles you trust.

    outfile : str
        This is the output pickle file that will contain all the threshold
        information.

    magbins : np.array of floats
        This sets the magnitude bin edges to use for calculating thresholds.
        Bin `i` covers `magbins[i]` to `magbins[i+1]`. Objects with magnitudes
        outside the range covered by `magbins` are not assigned to any bin.

    maxobjects : int or None
        This is the number of objects to process. If None, all objects with
        feature pickles in `featuresdir` will be processed. The feature
        pickles are sorted by filename before this limit is applied.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the
        thresholds. Accepted for interface consistency; the thresholding
        itself only uses `magcols`.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the
        thresholds. If None, the defaults for the light curve format are used.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the
        thresholds. Accepted for interface consistency; the thresholding
        itself only uses `magcols`.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. It is used here to look up the default column keys.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in. Use this along with `lcformat` to
        specify an LC format JSON file that's not currently registered with
        lcproc.

    min_lcmad_stdev,min_stetj_stdev,min_iqr_stdev,min_inveta_stdev : float or np.array
        These are all the standard deviation multiplier for the distributions
        of light curve standard deviation, Stetson J variability index, the
        light curve interquartile range, and 1/eta variability index
        respectively. These multipliers set the minimum values of these
        measures to use for selecting variable stars. If provided as scalars,
        the same value will be used for all magbins. If provided as lists or
        np.arrays of `size = magbins.size - 1`, will be used to apply possibly
        different sigma cuts for each magbin; non-finite elements are replaced
        by 2.0 in the copies stored in the output (the inputs are never
        modified). `min_lcmad_stdev` is only recorded in the output.

    verbose : bool
        If True, will report progress and warn about any problems.

    Returns
    -------

    dict or None
        Contains all of the variability threshold information along with
        the object IDs chosen as variables, keyed by magcol. All `binned_*`
        lists have one entry per magbin, in the same order as
        `binned_sdssr_median`; bins with fewer than 5 objects get np.nan for
        their medians/stdevs and empty selections. Returns None if the light
        curve format can't be resolved.

    '''
    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    # list of input pickles generated by varfeatures functions above. sorted so
    # that truncating with maxobjects picks the same objects on every run
    pklist = sorted(glob.glob(os.path.join(featuresdir, 'varfeatures-*.pkl')))

    if maxobjects:
        pklist = pklist[:maxobjects]

    allobjects = {}

    # (feature key, label used in warnings) for the thresholded indices, in
    # the order they are evaluated
    thresh_features = (('stetsonj', 'stetson J'),
                       ('iqr', 'IQR'),
                       ('inveta', 'inveta'))

    for magcol in magcols:

        # keep local copies of these so we can fix them independently in case
        # of nans
        magcol_sigmas = {
            'stetsonj': _varthresh_copy_sigma(min_stetj_stdev),
            'iqr': _varthresh_copy_sigma(min_iqr_stdev),
            'inveta': _varthresh_copy_sigma(min_inveta_stdev),
        }

        LOGINFO('getting all object sdssr, LC MAD, stet J, IQR, eta...')

        # we'll calculate the sigma per magnitude bin, so get the mags as well
        collected = {
            'objectid': [],
            'sdssr': [],
            'lcmad': [],
            'stetsonj': [],
            'iqr': [],
            'eta': []
        }

        # fancy progress bar with tqdm if present
        if TQDM and verbose:
            listiterator = tqdm(pklist)
        else:
            listiterator = pklist

        for pkl in listiterator:

            with open(pkl, 'rb') as infd:
                thisfeatures = pickle.load(infd)

            collected['objectid'].append(thisfeatures['objectid'])
            collected['sdssr'].append(
                _varthresh_object_mag(thisfeatures, magcol)
            )
            collected['lcmad'].append(
                _varthresh_get_feature(thisfeatures, magcol, 'mad')
            )
            collected['stetsonj'].append(
                _varthresh_get_feature(thisfeatures, magcol, 'stetsonj')
            )
            collected['iqr'].append(
                _varthresh_get_feature(thisfeatures, magcol, 'mag_iqr')
            )
            collected['eta'].append(
                _varthresh_get_feature(thisfeatures, magcol, 'eta_normal')
            )

        #
        # done with collection of info
        #
        LOGINFO('finding objects above thresholds per magbin...')

        # turn the info into arrays
        thismag = {key: np.ravel(np.array(vals))
                   for key, vals in collected.items()}
        allobjects[magcol] = thismag

        # only get finite elements everywhere
        thisfinind = (
            np.isfinite(thismag['sdssr']) &
            np.isfinite(thismag['lcmad']) &
            np.isfinite(thismag['stetsonj']) &
            np.isfinite(thismag['iqr']) &
            np.isfinite(thismag['eta'])
        )
        for key in ('objectid', 'sdssr', 'lcmad', 'stetsonj', 'iqr', 'eta'):
            thismag[key] = thismag[key][thisfinind]

        # invert eta so we can threshold the same way as the others
        thismag['inveta'] = 1.0/thismag['eta']

        # do the thresholding by magnitude bin
        magbininds = np.digitize(thismag['sdssr'], magbins)

        binned = {key: [] for key in (
            'binned_objectids',
            'binned_sdssr_median',
            'binned_sdssr',
            'binned_count',
            'binned_lcmad',
            'binned_lcmad_median',
            'binned_lcmad_stdev',
            'binned_stetsonj',
            'binned_stetsonj_median',
            'binned_stetsonj_stdev',
            'binned_iqr',
            'binned_iqr_median',
            'binned_iqr_stdev',
            'binned_inveta',
            'binned_inveta_median',
            'binned_inveta_stdev',
            'binned_objectids_thresh_stetsonj',
            'binned_objectids_thresh_iqr',
            'binned_objectids_thresh_inveta',
            'binned_objectids_thresh_all',
        )}

        # go through all the mag bins and get the thresholds for J, inveta,
        # IQR. np.digitize labels objects between magbins[magi] and
        # magbins[magi + 1] with index magi + 1, so derive the digitize index
        # from magi directly; this keeps every bin paired with its own objects
        # even if some bins are empty or objects fall outside magbins.
        for magi in range(len(magbins) - 1):

            thisbinind = np.where(magbininds == (magi + 1))

            thisbin_sdssr_median = (magbins[magi] + magbins[magi+1])/2.0
            thisbin_objectids = thismag['objectid'][thisbinind]
            thisbin_lcmad = thismag['lcmad'][thisbinind]

            # need more than 4 objects for the robust statistics to be useful
            enough_objects = thisbin_objectids.size > 4

            binned['binned_sdssr_median'].append(thisbin_sdssr_median)
            binned['binned_objectids'].append(thisbin_objectids)
            binned['binned_sdssr'].append(thismag['sdssr'][thisbinind])
            binned['binned_count'].append(thisbin_objectids.size)

            # the LC MAD is only characterized, not thresholded (for now)
            if enough_objects:
                lcmad_median, lcmad_stdev = _varthresh_robust_stats(
                    thisbin_lcmad
                )
            else:
                # nan placeholders keep these lists aligned with
                # binned_sdssr_median for plotting and per-bin lookups
                lcmad_median, lcmad_stdev = np.nan, np.nan

            binned['binned_lcmad'].append(thisbin_lcmad)
            binned['binned_lcmad_median'].append(lcmad_median)
            binned['binned_lcmad_stdev'].append(lcmad_stdev)

            thisbin_selections = []

            for feature, label in thresh_features:

                thisbin_values = thismag[feature][thisbinind]

                if enough_objects:

                    feat_median, feat_stdev = _varthresh_robust_stats(
                        thisbin_values
                    )
                    thisbin_sigma = _varthresh_bin_sigma(
                        magcol_sigmas[feature],
                        magi,
                        label,
                        thisbin_sdssr_median
                    )

                    # now get the objects above the required stdev threshold
                    thisbin_selected = thisbin_objectids[
                        thisbin_values > (feat_median +
                                          thisbin_sigma*feat_stdev)
                    ]

                else:

                    feat_median, feat_stdev = np.nan, np.nan
                    # an empty slice keeps the object ID dtype, so the
                    # concatenation over bins below never mixes dtypes
                    thisbin_selected = thisbin_objectids[:0]

                binned['binned_%s' % feature].append(thisbin_values)
                binned['binned_%s_median' % feature].append(feat_median)
                binned['binned_%s_stdev' % feature].append(feat_stdev)
                binned['binned_objectids_thresh_%s' % feature].append(
                    thisbin_selected
                )
                thisbin_selections.append(thisbin_selected)

            # get the intersection of all threshold objects to get objects that
            # lie above the threshold for all variable indices
            binned['binned_objectids_thresh_all'].append(
                reduce(np.intersect1d, thisbin_selections)
            )

        #
        # done with magbins
        #

        # update the output dict for this magcol
        thismag['magbins'] = magbins
        thismag.update(binned)

        # get the common selected objects thru all measures
        for outkey, binkey, label in (
                ('objectids_all_thresh_all_magbins',
                 'binned_objectids_thresh_all', 'all'),
                ('objectids_stetsonj_thresh_all_magbins',
                 'binned_objectids_thresh_stetsonj', 'stetson J'),
                ('objectids_inveta_thresh_all_magbins',
                 'binned_objectids_thresh_inveta', 'inveta'),
                ('objectids_iqr_thresh_all_magbins',
                 'binned_objectids_thresh_iqr', 'IQR'),
        ):
            thismag[outkey] = _varthresh_unique_objectids(binned[binkey],
                                                          label)

        # turn these into np.arrays for easier plotting if they're lists
        for outkey, feature in (('min_stetj_stdev', 'stetsonj'),
                                ('min_iqr_stdev', 'iqr'),
                                ('min_inveta_stdev', 'inveta')):
            sigma = magcol_sigmas[feature]
            if isinstance(sigma, list):
                thismag[outkey] = np.array(sigma)
            else:
                thismag[outkey] = sigma

        # this one doesn't get touched (for now)
        thismag['min_lcmad_stdev'] = min_lcmad_stdev

    #
    # done with all magcols
    #

    allobjects['magbins'] = magbins

    with open(outfile, 'wb') as outfd:
        pickle.dump(allobjects, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    return allobjects
def timebinlc(lcfile,
              binsizesec,
              outdir=None,
              lcformat='hat-sql',
              lcformatdir=None,
              timecols=None,
              magcols=None,
              errcols=None,
              minbinelems=7):
    '''This bins the given light curve file in time using the specified bin
    size.

    Parameters
    ----------

    lcfile : str
        The file name to process.

    binsizesec : float
        The time bin-size in seconds.

    outdir : str or None
        If this is a str, the output LC will be written to `outdir`. If this
        is None, the output LC will be written to the same directory as
        `lcfile`.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curve file.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in. Use this along with `lcformat` to
        specify an LC format JSON file that's not currently registered with
        lcproc.

    timecols,magcols,errcols : lists of str
        The keys in the lcdict produced by your light curve reader function
        that correspond to the times, mags/fluxes, and associated measurement
        errors that will be used as inputs to the binning process. If these
        are None, the default values for `timecols`, `magcols`, and `errcols`
        for your light curve format will be used here. Nested keys may be
        given in dotted form, e.g. 'key.subkey'.

    minbinelems : int
        The minimum number of time-bin elements required to accept a time-bin
        as valid for the output binned light curve.

    Returns
    -------

    str or None
        The name of the output pickle file with the binned LC. Returns None
        if the light curve format can't be resolved or if the light curve
        already has a 'binned' key.

        Writes the output binned light curve to a pickle that contains the
        lcdict with an added `lcdict['binned'][magcol]` key, which contains
        the binned times, mags/fluxes, and errs as
        `lcdict['binned'][magcol]['times']`,
        `lcdict['binned'][magcol]['mags']`, and
        `lcdict['binned'][magcol]['errs']` for each `magcol` provided in the
        input or default `magcols` value for this light curve format.

    '''
    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if not formatinfo:
            LOGERROR("can't figure out the light curve format")
            return None

        (_dfileglob, readerfunc,
         dtimecols, dmagcols, derrcols,
         magsarefluxes, normfunc) = formatinfo

    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # explicitly provided column keys take precedence over the LC format's
    # defaults
    timecols = dtimecols if timecols is None else timecols
    magcols = dmagcols if magcols is None else magcols
    errcols = derrcols if errcols is None else errcols

    # read the light curve; some reader functions return a list/tuple whose
    # first element is the actual lcdict
    # FIXME: figure out how to not need this assumption
    lcdict = readerfunc(lcfile)
    if isinstance(lcdict, (list, tuple)) and isinstance(lcdict[0], dict):
        lcdict = lcdict[0]

    # refuse to bin a light curve twice
    if 'binned' in lcdict:
        LOGERROR('this light curve appears to be binned already, skipping...')
        return None

    lcdict['binned'] = {}

    for tcol, mcol, ecol in zip(timecols, magcols, errcols):

        # dotted column keys address nested dicts inside the lcdict
        times, mags, errs = [
            _dict_get(lcdict, col.split('.') if '.' in col else [col])
            for col in (tcol, mcol, ecol)
        ]

        # only the generic normalization is applied here.
        # NOTE(review): when the LC format defines its own `normfunc`, no
        # normalization at all happens in this function -- confirm that this
        # is intended.
        if normfunc is None:
            times, mags = normalize_magseries(
                times, mags,
                magsarefluxes=magsarefluxes
            )

        # bin the series and file the results under this magcol
        binresult = time_bin_magseries_with_errs(times,
                                                 mags,
                                                 errs,
                                                 binsize=binsizesec,
                                                 minbinelems=minbinelems)

        lcdict['binned'][mcol] = {
            'times': binresult['binnedtimes'],
            'mags': binresult['binnedmags'],
            'errs': binresult['binnederrs'],
            'nbins': binresult['nbins'],
            'timebins': binresult['jdbins'],
            'binsizesec': binsizesec
        }

    # all magcols are binned; the output is always written as a pickle next to
    # the input LC unless an output directory was given
    if outdir is None:
        outdir = os.path.dirname(lcfile)

    outbasename = '%s-binned%.1fsec-%s.pkl' % (
        squeeze(lcdict['objectid']).replace(' ', '-'),
        binsizesec,
        lcformat
    )
    outfile = os.path.join(outdir, outbasename)

    with open(outfile, 'wb') as outfd:
        pickle.dump(lcdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    return outfile
def parallel_timebin_lcdir(lcdir,
                           binsizesec,
                           maxobjects=None,
                           outdir=None,
                           lcformat='hat-sql',
                           lcformatdir=None,
                           timecols=None,
                           magcols=None,
                           errcols=None,
                           minbinelems=7,
                           nworkers=NCPUS,
                           maxworkertasks=1000):
    '''This time bins all the light curves in the specified directory.

    The light curve files are found by globbing `lcdir` with the file glob
    registered for `lcformat`, sorted by name, and handed to
    `parallel_timebin`.

    Parameters
    ----------

    lcdir : str
        Directory containing the input LCs to process.

    binsizesec : float
        The time bin size to use in seconds.

    maxobjects : int or None
        If provided, LC processing will stop at `lclist[maxobjects]`.

    outdir : str or None
        The directory where output LCs will be written. If None, will write
        to the same directory as the input LCs.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curve file.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in. Use this along with `lcformat` to
        specify an LC format JSON file that's not currently registered with
        lcproc.

    timecols,magcols,errcols : lists of str
        The keys in the lcdict produced by your light curve reader function
        that correspond to the times, mags/fluxes, and associated measurement
        errors that will be used as inputs to the binning process. If these
        are None, the default values for `timecols`, `magcols`, and `errcols`
        for your light curve format will be used here.

    minbinelems : int
        The minimum number of time-bin elements required to accept a time-bin
        as valid for the output binned light curve.

    nworkers : int
        Number of parallel workers to launch.

    maxworkertasks : int
        The maximum number of tasks a parallel worker will complete before
        being replaced to guard against memory leaks.

    Returns
    -------

    dict or None
        The result of `parallel_timebin` for the light curves found: a dict
        with keys = input LCs, vals = output LCs. Returns None if the light
        curve format can't be resolved.

    '''
    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            # only the file glob is needed here; the per-LC work re-resolves
            # the format on its own
            (fileglob, _readerfunc,
             _dtimecols, _dmagcols, _derrcols,
             _magsarefluxes, _normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    lclist = sorted(glob.glob(os.path.join(lcdir, fileglob)))

    # lcformatdir must be forwarded as well: without it, an LC format that is
    # only resolvable through lcformatdir is found here but can't be looked up
    # again when the individual light curves are processed
    return parallel_timebin(lclist,
                            binsizesec,
                            maxobjects=maxobjects,
                            outdir=outdir,
                            lcformat=lcformat,
                            lcformatdir=lcformatdir,
                            timecols=timecols,
                            magcols=magcols,
                            errcols=errcols,
                            minbinelems=minbinelems,
                            nworkers=nworkers,
                            maxworkertasks=maxworkertasks)
def get_starfeatures(lcfile,
                     outdir,
                     kdtree,
                     objlist,
                     lcflist,
                     neighbor_radius_arcsec,
                     deredden=True,
                     custom_bandpasses=None,
                     lcformat='hat-sql',
                     lcformatdir=None):
    '''This runs the functions from :py:func:`astrobase.varclass.starfeatures`
    on a single light curve file.

    The coordinate, color, color-classification, and neighbor features are
    merged into a single dict along with the object's ID, objectinfo, and LC
    file basename, and written to `starfeatures-<objectid>.pkl` in `outdir`.

    Parameters
    ----------

    lcfile : str
        This is the LC file to extract star features for.

    outdir : str
        This is the directory to write the output pickle to.

    kdtree: scipy.spatial.cKDTree
        This is a `scipy.spatial.KDTree` or `cKDTree` used to calculate
        neighbor proximity features. This is for the light curve catalog this
        object is in.

    objlist : np.array
        This is a Numpy array of object IDs in the same order as the
        `kdtree.data` np.array. This is for the light curve catalog this
        object is in.

    lcflist : np.array
        This is a Numpy array of light curve filenames in the same order as
        `kdtree.data`. This is for the light curve catalog this object is in.

    neighbor_radius_arcsec : float
        This indicates the radius in arcsec to search for neighbors for this
        object using the light curve catalog's `kdtree`, `objlist`, `lcflist`,
        and in GAIA.

    deredden : bool
        This controls if the colors and any color classifications will be
        dereddened using 2MASS DUST.

    custom_bandpasses : dict or None
        This is a dict used to define any custom bandpasses in the
        `in_objectinfo` dict you want to make this function aware of and
        generate colors for. Use the format below for this dict::

            {
            '<bandpass_key_1>':{'dustkey':'<twomass_dust_key_1>',
                                'label':'<band_label_1>',
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            .
            ...
            .
            '<bandpass_key_N>':{'dustkey':'<twomass_dust_key_N>',
                                'label':'<band_label_N>',
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            }

        Where:

        `bandpass_key` is a key to use to refer to this bandpass in the
        `objectinfo` dict, e.g. 'sdssg' for SDSS g band

        `twomass_dust_key` is the key to use in the 2MASS DUST result table
        for reddening per band-pass. For example, given the following DUST
        result table (using
        http://irsa.ipac.caltech.edu/applications/DUST/)::

            |Filter_name|LamEff |A_over_E_B_V_SandF|A_SandF|A_over_E_B_V_SFD|A_SFD|
            |char       |float  |float             |float  |float           |float|
            |           |microns|                  |mags   |                |mags |
             CTIO U       0.3734              4.107   0.209            4.968 0.253
             CTIO B       0.4309              3.641   0.186            4.325 0.221
             CTIO V       0.5517              2.682   0.137            3.240 0.165
            .
            .
            ...

        The `twomass_dust_key` for 'vmag' would be 'CTIO V'. If you want to
        skip DUST lookup and want to pass in a specific reddening magnitude
        for your bandpass, use a float for the value of
        `twomass_dust_key`. If you want to skip DUST lookup entirely for this
        bandpass, use None for the value of `twomass_dust_key`.

        `band_label` is the label to use for this bandpass, e.g. 'W1' for
        WISE-1 band, 'u' for SDSS u, etc.

        The 'colors' list contains color definitions for all colors you want
        to generate using this bandpass. this list contains elements of the
        form::

            ['<bandkey1>-<bandkey2>','<BAND1> - <BAND2>']

        where the the first item is the bandpass keys making up this color,
        and the second item is the label for this color to be used by the
        frontends. An example::

            ['sdssu-sdssg','u - g']

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to read the light curve.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in. Use this along with `lcformat` to
        specify an LC format JSON file that's not currently registered with
        lcproc.

    Returns
    -------

    str or None
        Path to the output pickle containing all of the star features for
        this object. Returns None if the light curve format can't be resolved
        or if anything fails while extracting the features (the failure is
        logged).

    '''
    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if not formatinfo:
            LOGERROR("can't figure out the light curve format")
            return None

        # only the reader function is needed from the format definition
        (_dfileglob, readerfunc,
         _dtimecols, _dmagcols, _derrcols,
         _magsarefluxes, _normfunc) = formatinfo

    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    try:

        # read the light curve; some reader functions return a list/tuple
        # whose first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        lcdict = readerfunc(lcfile)
        if isinstance(lcdict, (list, tuple)) and isinstance(lcdict[0], dict):
            lcdict = lcdict[0]

        resultdict = {
            'objectid': lcdict['objectid'],
            'info': lcdict['objectinfo'],
            'lcfbasename': os.path.basename(lcfile)
        }

        # coordinate features come first, then colors, then a rough color
        # classification that needs both of them
        coord_feats = starfeatures.coord_features(lcdict['objectinfo'])
        color_feats = starfeatures.color_features(
            lcdict['objectinfo'],
            deredden=deredden,
            custom_bandpasses=custom_bandpasses
        )
        color_class = starfeatures.color_classification(color_feats,
                                                        coord_feats)

        # the neighbor features are last
        nbr_feats = starfeatures.neighbor_gaia_features(
            lcdict['objectinfo'],
            kdtree,
            neighbor_radius_arcsec
        )

        # translate neighbor indices into object IDs and LC filenames; use
        # empty arrays if there were no neighbors
        if nbr_feats['nbrindices'].size > 0:
            nbr_feats['nbrobjectids'] = objlist[nbr_feats['nbrindices']]
            closest_ind = nbr_feats['closestdistnbrind']
            nbr_feats['closestnbrobjectid'] = objlist[closest_ind]
            nbr_feats['closestnbrlcfname'] = lcflist[closest_ind]
        else:
            for nbrkey in ('nbrobjectids',
                           'closestnbrobjectid',
                           'closestnbrlcfname'):
                nbr_feats[nbrkey] = np.array([])

        # merge everything into the result dict, in this order
        for featdict in (coord_feats, color_feats, color_class, nbr_feats):
            resultdict.update(featdict)

        outbasename = 'starfeatures-%s.pkl' % (
            squeeze(resultdict['objectid']).replace(' ', '-')
        )
        outfile = os.path.join(outdir, outbasename)

        with open(outfile, 'wb') as outfd:
            pickle.dump(resultdict, outfd, protocol=4)

        return outfile

    except Exception as e:

        LOGEXCEPTION('failed to get star features for %s because: %s' %
                     (os.path.basename(lcfile), e))
        return None
def parallel_starfeatures_lcdir(lcdir,
                                outdir,
                                lc_catalog_pickle,
                                neighbor_radius_arcsec,
                                fileglob=None,
                                maxobjects=None,
                                deredden=True,
                                custom_bandpasses=None,
                                lcformat='hat-sql',
                                lcformatdir=None,
                                nworkers=NCPUS,
                                recursive=True):
    '''This runs parallel star feature extraction for a directory of LCs.

    The light curve files are found by globbing `lcdir` (recursively by
    default), sorted by name, and handed to `parallel_starfeatures`.

    Parameters
    ----------

    lcdir : str
        The directory to search for light curves.

    outdir : str
        The output directory where the results will be placed.

    lc_catalog_pickle : str
        The path to a catalog containing at a dict with least:

        - an object ID array accessible with `dict['objects']['objectid']`

        - an LC filename array accessible with `dict['objects']['lcfname']`

        - a `scipy.spatial.KDTree` or `cKDTree` object to use for finding
          neighbors for each object accessible with `dict['kdtree']`

        A catalog pickle of the form needed can be produced using
        :py:func:`astrobase.lcproc.catalogs.make_lclist` or
        :py:func:`astrobase.lcproc.catalogs.filter_lclist`.

    neighbor_radius_arcsec : float
        This indicates the radius in arcsec to search for neighbors for this
        object using the light curve catalog's `kdtree`, `objlist`, `lcflist`,
        and in GAIA.

    fileglob : str
        The UNIX file glob to use to search for the light curves in
        `lcdir`. If None, the default value for the light curve format
        specified will be used.

    maxobjects : int
        The number of objects to process from `lclist`.

    deredden : bool
        This controls if the colors and any color classifications will be
        dereddened using 2MASS DUST.

    custom_bandpasses : dict or None
        This is a dict used to define any custom bandpasses in the
        `in_objectinfo` dict you want to make this function aware of and
        generate colors for. Use the format below for this dict::

            {
            '<bandpass_key_1>':{'dustkey':'<twomass_dust_key_1>',
                                'label':'<band_label_1>',
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            .
            ...
            .
            '<bandpass_key_N>':{'dustkey':'<twomass_dust_key_N>',
                                'label':'<band_label_N>',
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            }

        Where:

        `bandpass_key` is a key to use to refer to this bandpass in the
        `objectinfo` dict, e.g. 'sdssg' for SDSS g band

        `twomass_dust_key` is the key to use in the 2MASS DUST result table
        for reddening per band-pass. For example, given the following DUST
        result table (using
        http://irsa.ipac.caltech.edu/applications/DUST/)::

            |Filter_name|LamEff |A_over_E_B_V_SandF|A_SandF|A_over_E_B_V_SFD|A_SFD|
            |char       |float  |float             |float  |float           |float|
            |           |microns|                  |mags   |                |mags |
             CTIO U       0.3734              4.107   0.209            4.968 0.253
             CTIO B       0.4309              3.641   0.186            4.325 0.221
             CTIO V       0.5517              2.682   0.137            3.240 0.165
            .
            .
            ...

        The `twomass_dust_key` for 'vmag' would be 'CTIO V'. If you want to
        skip DUST lookup and want to pass in a specific reddening magnitude
        for your bandpass, use a float for the value of
        `twomass_dust_key`. If you want to skip DUST lookup entirely for this
        bandpass, use None for the value of `twomass_dust_key`.

        `band_label` is the label to use for this bandpass, e.g. 'W1' for
        WISE-1 band, 'u' for SDSS u, etc.

        The 'colors' list contains color definitions for all colors you want
        to generate using this bandpass. this list contains elements of the
        form::

            ['<bandkey1>-<bandkey2>','<BAND1> - <BAND2>']

        where the the first item is the bandpass keys making up this color,
        and the second item is the label for this color to be used by the
        frontends. An example::

            ['sdssu-sdssg','u - g']

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves in `lcdir`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories
        lcproc knows to search for them in. Use this along with `lcformat` to
        specify an LC format JSON file that's not currently registered with
        lcproc.

    nworkers : int
        The number of parallel workers to launch.

    recursive : bool
        If this is exactly False, only `lcdir` itself is searched for light
        curves. Otherwise, `lcdir` and all of its subdirectories are searched.

    Returns
    -------

    dict or None
        The result of `parallel_starfeatures` for the light curves found: a
        dict with key:val pairs of the input light curve filename and the
        output star features pickle for each LC processed. Returns None if
        the light curve format can't be resolved or no light curves are
        found.

    '''
    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            # only the default file glob is needed here
            (dfileglob, _readerfunc,
             _dtimecols, _dmagcols, _derrcols,
             _magsarefluxes, _normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if not fileglob:
        fileglob = dfileglob

    # now find the files
    LOGINFO('searching for %s light curves in %s ...' % (lcformat, lcdir))

    if recursive is False:

        matching = glob.glob(os.path.join(lcdir, fileglob))

    # use recursive glob for Python 3.5+
    elif sys.version_info[:2] > (3, 4):

        matching = glob.glob(os.path.join(lcdir, '**', fileglob),
                             recursive=True)

    # otherwise, use os.walk and glob
    else:

        # os.walk yields lcdir itself first and then every directory below
        # it, so globbing in each visited directory covers the same tree as
        # the '**' pattern above, including LCs sitting directly in lcdir
        matching = []
        for root, _dirs, _files in os.walk(lcdir):
            matching.extend(glob.glob(os.path.join(root, fileglob)))

    # glob returns files in arbitrary order; sort so that maxobjects selects
    # the same light curves on every run
    matching = sorted(matching)

    # now that we have all the files, process them
    if matching:

        LOGINFO('found %s light curves, getting starfeatures...' %
                len(matching))

        return parallel_starfeatures(matching,
                                     outdir,
                                     lc_catalog_pickle,
                                     neighbor_radius_arcsec,
                                     deredden=deredden,
                                     custom_bandpasses=custom_bandpasses,
                                     maxobjects=maxobjects,
                                     lcformat=lcformat,
                                     lcformatdir=lcformatdir,
                                     nworkers=nworkers)

    LOGERROR('no light curve files in %s format found in %s' % (lcformat,
                                                                lcdir))
    return None
def parallel_starfeatures(lclist,
                          outdir,
                          lc_catalog_pickle,
                          neighbor_radius_arcsec,
                          maxobjects=None,
                          deredden=True,
                          custom_bandpasses=None,
                          lcformat='hat-sql',
                          lcformatdir=None,
                          nworkers=NCPUS):
    '''This generates star features in parallel for every LC in `lclist`.

    Each light curve is handed to `_starfeatures_worker` as one task tuple,
    and the tasks are distributed over a `ProcessPoolExecutor`.

    Parameters
    ----------

    lclist : list of str
        The list of light curve file names to process.

    outdir : str
        The output directory where the results will be placed. It is created
        if it does not exist yet.

    lc_catalog_pickle : str
        The path to a pickled catalog containing a dict with at least:

        - an object ID array accessible with `dict['objects']['objectid']`

        - an LC filename array accessible with `dict['objects']['lcfname']`

        - a `scipy.spatial.KDTree` or `cKDTree` object to use for finding
          neighbors for each object accessible with `dict['kdtree']`

        A catalog pickle of the form needed can be produced using
        :py:func:`astrobase.lcproc.catalogs.make_lclist` or
        :py:func:`astrobase.lcproc.catalogs.filter_lclist`.

    neighbor_radius_arcsec : float
        This indicates the radius in arcsec to search for neighbors for each
        object using the light curve catalog's `kdtree`, `objlist`,
        `lcflist`, and in GAIA.

    maxobjects : int or None
        The number of objects to process from `lclist`. If this is None (or
        otherwise falsy), all of `lclist` is processed.

    deredden : bool
        This controls if the colors and any color classifications will be
        dereddened using 2MASS DUST.

    custom_bandpasses : dict or None
        This is a dict used to define any custom bandpasses in the
        `in_objectinfo` dict you want to make this function aware of and
        generate colors for. Use the format below for this dict::

            {
            '<bandpass_key_1>':{'dustkey':'<twomass_dust_key_1>',
                                'label':'<band_label_1>',
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            .
            ...
            .
            '<bandpass_key_N>':{'dustkey':'<twomass_dust_key_N>',
                                'label':'<band_label_N>',
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            }

        Where:

        `bandpass_key` is a key to use to refer to this bandpass in the
        `objectinfo` dict, e.g. 'sdssg' for SDSS g band

        `twomass_dust_key` is the key to use in the 2MASS DUST result table
        for reddening per band-pass. For example, given the following DUST
        result table (using
        http://irsa.ipac.caltech.edu/applications/DUST/)::

            |Filter_name|LamEff |A_over_E_B_V_SandF|A_SandF|A_over_E_B_V_SFD|A_SFD|
            |char       |float  |float             |float  |float           |float|
            |           |microns|                  |mags   |                |mags |
             CTIO U       0.3734              4.107   0.209            4.968 0.253
             CTIO B       0.4309              3.641   0.186            4.325 0.221
             CTIO V       0.5517              2.682   0.137            3.240 0.165
            .
            .
            ...

        The `twomass_dust_key` for 'vmag' would be 'CTIO V'. If you want to
        skip DUST lookup and want to pass in a specific reddening magnitude
        for your bandpass, use a float for the value of
        `twomass_dust_key`. If you want to skip DUST lookup entirely for
        this bandpass, use None for the value of `twomass_dust_key`.

        `band_label` is the label to use for this bandpass, e.g. 'W1' for
        WISE-1 band, 'u' for SDSS u, etc.

        The 'colors' list contains color definitions for all colors you want
        to generate using this bandpass. This list contains elements of the
        form::

            ['<bandkey1>-<bandkey2>','<BAND1> - <BAND2>']

        where the first item is the bandpass keys making up this color, and
        the second item is the label for this color to be used by the
        frontends. An example::

            ['sdssu-sdssg','u - g']

    lcformat : str
        This is the `formatkey` associated with your light curve format,
        which you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the
        light curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've
        stored your lcformat description JSONs, other than the usual
        directories lcproc knows to search for them in. Use this along with
        `lcformat` to specify an LC format JSON file that's not currently
        registered with lcproc.

    nworkers : int
        The number of parallel workers to launch.

    Returns
    -------

    dict or None
        A dict with key:val pairs of the input light curve filename
        (basename only) and the output star features pickle for each LC
        processed, as returned by the worker. None is returned if the light
        curve format could not be figured out.

    '''

    # bail out early if the requested LC format can't be resolved
    try:

        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)

        if not formatinfo:
            LOGERROR("can't figure out the light curve format")
            return None

        # none of these are used below; the unpacking only checks that the
        # format description has the expected seven elements
        (dfileglob, readerfunc,
         dtimecols, dmagcols, derrcols,
         magsarefluxes, normfunc) = formatinfo

    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # the output directory must exist before any worker writes into it
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # optionally restrict processing to the first `maxobjects` light curves
    if maxobjects:
        lclist = lclist[:maxobjects]

    # load the LC catalog; this is unpickled as-is, so the file must come
    # from a trusted source
    with open(lc_catalog_pickle, 'rb') as catalog_fd:
        catalog = pickle.load(catalog_fd)

    kdtree = catalog['kdtree']
    catalog_objects = catalog['objects']
    objectids = catalog_objects['objectid']
    lcfnames = catalog_objects['lcfname']

    # everything except the LC filename is identical across tasks
    # NOTE(review): `lcformatdir` is used for the format lookup above but is
    # not part of the task tuple, so workers only receive `lcformat` --
    # confirm that `_starfeatures_worker` does not expect it.
    shared_args = (outdir, kdtree, objectids, lcfnames,
                   neighbor_radius_arcsec,
                   deredden, custom_bandpasses, lcformat)
    tasks = [(lcfile,) + shared_args for lcfile in lclist]

    # leaving the `with` block waits for all submitted tasks to finish
    with ProcessPoolExecutor(max_workers=nworkers) as executor:
        result_iter = executor.map(_starfeatures_worker, tasks)

    # executor.map yields results in task order, so they pair up with lclist
    results = list(result_iter)

    resdict = {}
    for lcfile, lcresult in zip(lclist, results):
        resdict[os.path.basename(lcfile)] = lcresult

    return resdict