Ejemplo n.º 1
0
def get_varfeatures(lcfile,
                    outdir,
                    timecols=None,
                    magcols=None,
                    errcols=None,
                    mindet=1000,
                    lcformat='hat-sql',
                    lcformatdir=None):
    '''This runs :py:func:`astrobase.varclass.varfeatures.all_nonperiodic_features`
    on a single LC file.

    Parameters
    ----------

    lcfile : str
        The input light curve to process.

    outfile : str
        The filename of the output variable features pickle that will be
        generated.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    mindet : int
        The minimum number of LC points required to generate variability
        features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    Returns
    -------

    str
        The generated variability features pickle for the input LC, with results
        for each magcol in the input `magcol` or light curve format's default
        `magcol` list.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    try:

        # get the LC into a dict
        lcdict = readerfunc(lcfile)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if ((isinstance(lcdict, (list, tuple)))
                and (isinstance(lcdict[0], dict))):
            lcdict = lcdict[0]

        resultdict = {
            'objectid': lcdict['objectid'],
            'info': lcdict['objectinfo'],
            'lcfbasename': os.path.basename(lcfile)
        }

        # normalize using the special function if specified
        if normfunc is not None:
            lcdict = normfunc(lcdict)

        for tcol, mcol, ecol in zip(timecols, magcols, errcols):

            # dereference the columns and get them from the lcdict
            if '.' in tcol:
                tcolget = tcol.split('.')
            else:
                tcolget = [tcol]
            times = _dict_get(lcdict, tcolget)

            if '.' in mcol:
                mcolget = mcol.split('.')
            else:
                mcolget = [mcol]
            mags = _dict_get(lcdict, mcolget)

            if '.' in ecol:
                ecolget = ecol.split('.')
            else:
                ecolget = [ecol]
            errs = _dict_get(lcdict, ecolget)

            # normalize here if not using special normalization
            if normfunc is None:
                ntimes, nmags = normalize_magseries(
                    times, mags, magsarefluxes=magsarefluxes)

                times, mags, errs = ntimes, nmags, errs

            # make sure we have finite values
            finind = np.isfinite(times) & np.isfinite(mags) & np.isfinite(errs)

            # make sure we have enough finite values
            if mags[finind].size < mindet:

                LOGINFO('not enough LC points: %s in normalized %s LC: %s' %
                        (mags[finind].size, mcol, os.path.basename(lcfile)))
                resultdict[mcol] = None

            else:

                # get the features for this magcol
                lcfeatures = varfeatures.all_nonperiodic_features(
                    times, mags, errs)
                resultdict[mcol] = lcfeatures

        # now that we've collected all the magcols, we can choose which is the
        # "best" magcol. this is defined as the magcol that gives us the
        # smallest LC MAD.

        try:
            magmads = np.zeros(len(magcols))
            for mind, mcol in enumerate(magcols):
                if '.' in mcol:
                    mcolget = mcol.split('.')
                else:
                    mcolget = [mcol]

                magmads[mind] = resultdict[mcol]['mad']

            # smallest MAD index
            bestmagcolind = np.where(magmads == np.min(magmads))[0]
            resultdict['bestmagcol'] = magcols[bestmagcolind]

        except Exception:
            resultdict['bestmagcol'] = None

        outfile = os.path.join(
            outdir, 'varfeatures-%s.pkl' %
            squeeze(resultdict['objectid']).replace(' ', '-'))

        with open(outfile, 'wb') as outfd:
            pickle.dump(resultdict, outfd, protocol=4)

        return outfile

    except Exception as e:

        LOGEXCEPTION('failed to get LC features for %s because: %s' %
                     (os.path.basename(lcfile), e))
        return None
Ejemplo n.º 2
0
def parallel_periodicfeatures_lcdir(
    pfpkl_dir,
    lcbasedir,
    outdir,
    pfpkl_glob='periodfinding-*.pkl*',
    starfeaturesdir=None,
    fourierorder=5,
    # these are depth, duration, ingress duration
    transitparams=(-0.01, 0.1, 0.1),
    # these are depth, duration, depth ratio, secphase
    ebparams=(-0.2, 0.3, 0.7, 0.5),
    pdiff_threshold=1.0e-4,
    sidereal_threshold=1.0e-4,
    sampling_peak_multiplier=5.0,
    sampling_startp=None,
    sampling_endp=None,
    timecols=None,
    magcols=None,
    errcols=None,
    lcformat='hat-sql',
    lcformatdir=None,
    sigclip=10.0,
    verbose=False,
    maxobjects=None,
    nworkers=NCPUS,
    recursive=True,
):
    '''This runs parallel periodicfeature extraction for a directory of
    periodfinding result pickles.

    Parameters
    ----------

    pfpkl_dir : str
        The directory containing the pickles to process.

    lcbasedir : str
        The directory where all of the associated light curve files are located.

    outdir : str
        The directory where all the output will be written.

    pfpkl_glob : str
        The UNIX file glob to use to search for period-finder result pickles in
        `pfpkl_dir`.

    starfeaturesdir : str or None
        The directory containing the `starfeatures-<objectid>.pkl` files for
        each object to use calculate neighbor proximity light curve features.

    fourierorder : int
        The Fourier order to use to generate sinusoidal function and fit that to
        the phased light curve.

    transitparams : list of floats
        The transit depth, duration, and ingress duration to use to generate a
        trapezoid planet transit model fit to the phased light curve. The period
        used is the one provided in `period`, while the epoch is automatically
        obtained from a spline fit to the phased light curve.

    ebparams : list of floats
        The primary eclipse depth, eclipse duration, the primary-secondary depth
        ratio, and the phase of the secondary eclipse to use to generate an
        eclipsing binary model fit to the phased light curve. The period used is
        the one provided in `period`, while the epoch is automatically obtained
        from a spline fit to the phased light curve.

    pdiff_threshold : float
        This is the max difference between periods to consider them the same.

    sidereal_threshold : float
        This is the max difference between any of the 'best' periods and the
        sidereal day periods to consider them the same.

    sampling_peak_multiplier : float
        This is the minimum multiplicative factor of a 'best' period's
        normalized periodogram peak over the sampling periodogram peak at the
        same period required to accept the 'best' period as possibly real.

    sampling_startp, sampling_endp : float
        If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
        periodogram, it will be obtained automatically. Use these kwargs to
        control the minimum and maximum period interval to be searched when
        generating this periodogram.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    verbose : bool
        If True, will indicate progress while working.

    maxobjects : int
        The total number of objects to process from `pfpkl_list`.

    nworkers : int
        The number of parallel workers to launch to process the input.

    Returns
    -------

    dict
        A dict containing key: val pairs of the input period-finder result and
        the output periodic feature result pickles for each input pickle is
        returned.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    fileglob = pfpkl_glob

    # now find the files
    LOGINFO('searching for periodfinding pickles in %s ...' % pfpkl_dir)

    if recursive is False:
        matching = glob.glob(os.path.join(pfpkl_dir, fileglob))

    else:
        matching = glob.glob(os.path.join(pfpkl_dir, '**', fileglob),
                             recursive=True)

    # now that we have all the files, process them
    if matching and len(matching) > 0:

        LOGINFO('found %s periodfinding pickles, getting periodicfeatures...' %
                len(matching))

        return parallel_periodicfeatures(
            matching,
            lcbasedir,
            outdir,
            starfeaturesdir=starfeaturesdir,
            fourierorder=fourierorder,
            transitparams=transitparams,
            ebparams=ebparams,
            pdiff_threshold=pdiff_threshold,
            sidereal_threshold=sidereal_threshold,
            sampling_peak_multiplier=sampling_peak_multiplier,
            sampling_startp=sampling_startp,
            sampling_endp=sampling_endp,
            timecols=timecols,
            magcols=magcols,
            errcols=errcols,
            lcformat=lcformat,
            lcformatdir=lcformatdir,
            sigclip=sigclip,
            verbose=verbose,
            maxobjects=maxobjects,
            nworkers=nworkers,
        )

    else:

        LOGERROR('no periodfinding pickles found in %s' % (pfpkl_dir))
        return None
Ejemplo n.º 3
0
def parallel_varfeatures_lcdir(lcdir,
                               outdir,
                               fileglob=None,
                               maxobjects=None,
                               timecols=None,
                               magcols=None,
                               errcols=None,
                               recursive=True,
                               mindet=1000,
                               lcformat='hat-sql',
                               lcformatdir=None,
                               nworkers=NCPUS):
    '''This runs parallel variable feature extraction for a directory of LCs.

    Parameters
    ----------

    lcdir : str
        The directory of light curve files to process.

    outdir : str
        The directory where the output varfeatures pickle files will be written.

    fileglob : str or None
        The file glob to use when looking for light curve files in `lcdir`. If
        None, the default file glob associated for this LC format will be used.

    maxobjects : int
        The number of LCs to process from `lclist`.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    mindet : int
        The minimum number of LC points required to generate variability
        features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    nworkers : int
        The number of parallel workers to launch.

    Returns
    -------

    dict
        A dict with key:val pairs of input LC file name : the generated
        variability features pickles for each of the input LCs, with results for
        each magcol in the input `magcol` or light curve format's default
        `magcol` list.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if not fileglob:
        fileglob = dfileglob

    # now find the files
    LOGINFO('searching for %s light curves in %s ...' % (lcformat, lcdir))

    if recursive is False:
        matching = glob.glob(os.path.join(lcdir, fileglob))

    else:
        matching = glob.glob(os.path.join(lcdir, '**', fileglob),
                             recursive=True)

    # now that we have all the files, process them
    if matching and len(matching) > 0:

        LOGINFO('found %s light curves, getting varfeatures...' %
                len(matching))

        return parallel_varfeatures(matching,
                                    outdir,
                                    maxobjects=maxobjects,
                                    timecols=timecols,
                                    magcols=magcols,
                                    errcols=errcols,
                                    mindet=mindet,
                                    lcformat=lcformat,
                                    lcformatdir=lcformatdir,
                                    nworkers=nworkers)

    else:

        LOGERROR('no light curve files in %s format found in %s' %
                 (lcformat, lcdir))
        return None
Ejemplo n.º 4
0
def serial_periodicfeatures(
        pfpkl_list,
        lcbasedir,
        outdir,
        starfeaturesdir=None,
        fourierorder=5,
        # these are depth, duration, ingress duration
        transitparams=(-0.01, 0.1, 0.1),
        # these are depth, duration, depth ratio, secphase
        ebparams=(-0.2, 0.3, 0.7, 0.5),
        pdiff_threshold=1.0e-4,
        sidereal_threshold=1.0e-4,
        sampling_peak_multiplier=5.0,
        sampling_startp=None,
        sampling_endp=None,
        starfeatures=None,
        timecols=None,
        magcols=None,
        errcols=None,
        lcformat='hat-sql',
        lcformatdir=None,
        sigclip=10.0,
        verbose=False,
        maxobjects=None):
    '''This drives the periodicfeatures collection for a list of periodfinding
    pickles.

    Parameters
    ----------

    pfpkl_list : list of str
        The list of period-finding pickles to use.

    lcbasedir : str
        The base directory where the associated light curves are located.

    outdir : str
        The directory where the results will be written.

    starfeaturesdir : str or None
        The directory containing the `starfeatures-<objectid>.pkl` files for
        each object to use calculate neighbor proximity light curve features.

    fourierorder : int
        The Fourier order to use to generate sinusoidal function and fit that to
        the phased light curve.

    transitparams : list of floats
        The transit depth, duration, and ingress duration to use to generate a
        trapezoid planet transit model fit to the phased light curve. The period
        used is the one provided in `period`, while the epoch is automatically
        obtained from a spline fit to the phased light curve.

    ebparams : list of floats
        The primary eclipse depth, eclipse duration, the primary-secondary depth
        ratio, and the phase of the secondary eclipse to use to generate an
        eclipsing binary model fit to the phased light curve. The period used is
        the one provided in `period`, while the epoch is automatically obtained
        from a spline fit to the phased light curve.

    pdiff_threshold : float
        This is the max difference between periods to consider them the same.

    sidereal_threshold : float
        This is the max difference between any of the 'best' periods and the
        sidereal day periods to consider them the same.

    sampling_peak_multiplier : float
        This is the minimum multiplicative factor of a 'best' period's
        normalized periodogram peak over the sampling periodogram peak at the
        same period required to accept the 'best' period as possibly real.

    sampling_startp, sampling_endp : float
        If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
        periodogram, it will be obtained automatically. Use these kwargs to
        control the minimum and maximum period interval to be searched when
        generating this periodogram.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    verbose : bool
        If True, will indicate progress while working.

    maxobjects : int
        The total number of objects to process from `pfpkl_list`.

    Returns
    -------

    Nothing.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (fileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # make sure to make the output directory if it doesn't exist
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if maxobjects:
        pfpkl_list = pfpkl_list[:maxobjects]

    LOGINFO('%s periodfinding pickles to process' % len(pfpkl_list))

    # if the starfeaturedir is provided, try to find a starfeatures pickle for
    # each periodfinding pickle in pfpkl_list
    if starfeaturesdir and os.path.exists(starfeaturesdir):

        starfeatures_list = []

        LOGINFO('collecting starfeatures pickles...')

        for pfpkl in pfpkl_list:

            sfpkl1 = os.path.basename(pfpkl).replace('periodfinding',
                                                     'starfeatures')
            sfpkl2 = sfpkl1.replace('.gz', '')

            sfpath1 = os.path.join(starfeaturesdir, sfpkl1)
            sfpath2 = os.path.join(starfeaturesdir, sfpkl2)

            if os.path.exists(sfpath1):
                starfeatures_list.append(sfpkl1)
            elif os.path.exists(sfpath2):
                starfeatures_list.append(sfpkl2)
            else:
                starfeatures_list.append(None)

    else:

        starfeatures_list = [None for x in pfpkl_list]

    # generate the task list
    kwargs = {
        'fourierorder': fourierorder,
        'transitparams': transitparams,
        'ebparams': ebparams,
        'pdiff_threshold': pdiff_threshold,
        'sidereal_threshold': sidereal_threshold,
        'sampling_peak_multiplier': sampling_peak_multiplier,
        'sampling_startp': sampling_startp,
        'sampling_endp': sampling_endp,
        'timecols': timecols,
        'magcols': magcols,
        'errcols': errcols,
        'lcformat': lcformat,
        'lcformatdir': lcformatdir,
        'sigclip': sigclip,
        'verbose': verbose
    }

    tasks = [(x, lcbasedir, outdir, y, kwargs)
             for (x, y) in zip(pfpkl_list, starfeatures_list)]

    LOGINFO('processing periodfinding pickles...')

    for task in tqdm(tasks):
        _periodicfeatures_worker(task)
Ejemplo n.º 5
0
def get_periodicfeatures(
        pfpickle,
        lcbasedir,
        outdir,
        fourierorder=5,
        # these are depth, duration, ingress duration
        transitparams=(-0.01, 0.1, 0.1),
        # these are depth, duration, depth ratio, secphase
        ebparams=(-0.2, 0.3, 0.7, 0.5),
        pdiff_threshold=1.0e-4,
        sidereal_threshold=1.0e-4,
        sampling_peak_multiplier=5.0,
        sampling_startp=None,
        sampling_endp=None,
        starfeatures=None,
        timecols=None,
        magcols=None,
        errcols=None,
        lcformat='hat-sql',
        lcformatdir=None,
        sigclip=10.0,
        verbose=True,
        raiseonfail=False):
    '''This gets all periodic features for the object.

    Parameters
    ----------

    pfpickle : str
        The period-finding result pickle containing period-finder results to use
        for the calculation of LC fit, periodogram, and phased LC features.

    lcbasedir : str
        The base directory where the light curve for the current object is
        located.

    outdir : str
        The output directory where the results will be written.

    fourierorder : int
        The Fourier order to use to generate sinusoidal function and fit that to
        the phased light curve.

    transitparams : list of floats
        The transit depth, duration, and ingress duration to use to generate a
        trapezoid planet transit model fit to the phased light curve. The period
        used is the one provided in `period`, while the epoch is automatically
        obtained from a spline fit to the phased light curve.

    ebparams : list of floats
        The primary eclipse depth, eclipse duration, the primary-secondary depth
        ratio, and the phase of the secondary eclipse to use to generate an
        eclipsing binary model fit to the phased light curve. The period used is
        the one provided in `period`, while the epoch is automatically obtained
        from a spline fit to the phased light curve.

    pdiff_threshold : float
        This is the max difference between periods to consider them the same.

    sidereal_threshold : float
        This is the max difference between any of the 'best' periods and the
        sidereal day periods to consider them the same.

    sampling_peak_multiplier : float
        This is the minimum multiplicative factor of a 'best' period's
        normalized periodogram peak over the sampling periodogram peak at the
        same period required to accept the 'best' period as possibly real.

    sampling_startp, sampling_endp : float
        If the `pgramlist` doesn't have a time-sampling Lomb-Scargle
        periodogram, it will be obtained automatically. Use these kwargs to
        control the minimum and maximum period interval to be searched when
        generating this periodogram.

    starfeatures : str or None
        If not None, this should be the filename of the
        `starfeatures-<objectid>.pkl` created by
        :py:func:`astrobase.lcproc.lcsfeatures.get_starfeatures` for this
        object. This is used to get the neighbor's light curve and phase it with
        this object's period to see if this object is blended.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    verbose : bool
        If True, will indicate progress while working.

    raiseonfail : bool
        If True, will raise an Exception if something goes wrong.

    Returns
    -------

    str
        Returns a filename for the output pickle containing all of the periodic
        features for the input object's LC.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (fileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # open the pfpickle
    if pfpickle.endswith('.gz'):
        infd = gzip.open(pfpickle)
    else:
        infd = open(pfpickle)
    pf = pickle.load(infd)
    infd.close()

    lcfile = os.path.join(lcbasedir, pf['lcfbasename'])
    objectid = pf['objectid']

    if 'kwargs' in pf:
        kwargs = pf['kwargs']
    else:
        kwargs = None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the periodfinder
    # if those don't exist, use the defaults from the lcformat def
    if kwargs and 'timecols' in kwargs and timecols is None:
        timecols = kwargs['timecols']
    elif not kwargs and not timecols:
        timecols = dtimecols

    if kwargs and 'magcols' in kwargs and magcols is None:
        magcols = kwargs['magcols']
    elif not kwargs and not magcols:
        magcols = dmagcols

    if kwargs and 'errcols' in kwargs and errcols is None:
        errcols = kwargs['errcols']
    elif not kwargs and not errcols:
        errcols = derrcols

    # check if the light curve file exists
    if not os.path.exists(lcfile):
        LOGERROR("can't find LC %s for object %s" % (lcfile, objectid))
        return None

    # check if we have neighbors we can get the LCs for
    if starfeatures is not None and os.path.exists(starfeatures):

        with open(starfeatures, 'rb') as infd:
            starfeat = pickle.load(infd)

        if starfeat['closestnbrlcfname'].size > 0:

            nbr_full_lcf = starfeat['closestnbrlcfname'][0]

            # check for this LC in the lcbasedir
            if os.path.exists(
                    os.path.join(lcbasedir, os.path.basename(nbr_full_lcf))):
                nbrlcf = os.path.join(lcbasedir,
                                      os.path.basename(nbr_full_lcf))
            # if it's not there, check for this file at the full LC location
            elif os.path.exists(nbr_full_lcf):
                nbrlcf = nbr_full_lcf
            # otherwise, we can't find it, so complain
            else:
                LOGWARNING("can't find neighbor light curve file: %s in "
                           "its original directory: %s, or in this object's "
                           "lcbasedir: %s, skipping neighbor processing..." %
                           (os.path.basename(nbr_full_lcf),
                            os.path.dirname(nbr_full_lcf), lcbasedir))
                nbrlcf = None

        else:
            nbrlcf = None

    else:
        nbrlcf = None

    # now, start processing for periodic feature extraction
    try:

        # get the object LC into a dict
        lcdict = readerfunc(lcfile)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if ((isinstance(lcdict, (list, tuple)))
                and (isinstance(lcdict[0], dict))):
            lcdict = lcdict[0]

        # get the nbr object LC into a dict if there is one
        if nbrlcf is not None:

            nbrlcdict = readerfunc(nbrlcf)

            # this should handle lists/tuples being returned by readerfunc
            # we assume that the first element is the actual lcdict
            # FIXME: figure out how to not need this assumption
            if ((isinstance(nbrlcdict, (list, tuple)))
                    and (isinstance(nbrlcdict[0], dict))):
                nbrlcdict = nbrlcdict[0]

        # this will be the output file
        outfile = os.path.join(
            outdir,
            'periodicfeatures-%s.pkl' % squeeze(objectid).replace(' ', '-'))

        # normalize using the special function if specified
        if normfunc is not None:
            lcdict = normfunc(lcdict)

            if nbrlcf:
                nbrlcdict = normfunc(nbrlcdict)

        resultdict = {}

        for tcol, mcol, ecol in zip(timecols, magcols, errcols):

            # dereference the columns and get them from the lcdict
            if '.' in tcol:
                tcolget = tcol.split('.')
            else:
                tcolget = [tcol]
            times = _dict_get(lcdict, tcolget)

            if nbrlcf:
                nbrtimes = _dict_get(nbrlcdict, tcolget)
            else:
                nbrtimes = None

            if '.' in mcol:
                mcolget = mcol.split('.')
            else:
                mcolget = [mcol]

            mags = _dict_get(lcdict, mcolget)

            if nbrlcf:
                nbrmags = _dict_get(nbrlcdict, mcolget)
            else:
                nbrmags = None

            if '.' in ecol:
                ecolget = ecol.split('.')
            else:
                ecolget = [ecol]

            errs = _dict_get(lcdict, ecolget)

            if nbrlcf:
                nbrerrs = _dict_get(nbrlcdict, ecolget)
            else:
                nbrerrs = None

            #
            # filter out nans, etc. from the object and any neighbor LC
            #

            # get the finite values
            finind = np.isfinite(times) & np.isfinite(mags) & np.isfinite(errs)
            ftimes, fmags, ferrs = times[finind], mags[finind], errs[finind]

            if nbrlcf:

                nfinind = (np.isfinite(nbrtimes) & np.isfinite(nbrmags)
                           & np.isfinite(nbrerrs))
                nbrftimes, nbrfmags, nbrferrs = (nbrtimes[nfinind],
                                                 nbrmags[nfinind],
                                                 nbrerrs[nfinind])

            # get nonzero errors
            nzind = np.nonzero(ferrs)
            ftimes, fmags, ferrs = ftimes[nzind], fmags[nzind], ferrs[nzind]

            if nbrlcf:

                nnzind = np.nonzero(nbrferrs)
                nbrftimes, nbrfmags, nbrferrs = (nbrftimes[nnzind],
                                                 nbrfmags[nnzind],
                                                 nbrferrs[nnzind])

            # normalize here if not using special normalization
            if normfunc is None:

                ntimes, nmags = normalize_magseries(
                    ftimes, fmags, magsarefluxes=magsarefluxes)

                times, mags, errs = ntimes, nmags, ferrs

                if nbrlcf:
                    nbrntimes, nbrnmags = normalize_magseries(
                        nbrftimes, nbrfmags, magsarefluxes=magsarefluxes)
                    nbrtimes, nbrmags, nbrerrs = nbrntimes, nbrnmags, nbrferrs
                else:
                    nbrtimes, nbrmags, nbrerrs = None, None, None

            else:
                times, mags, errs = ftimes, fmags, ferrs

            if times.size > 999:

                #
                # now we have times, mags, errs (and nbrtimes, nbrmags, nbrerrs)
                #
                available_pfmethods = []
                available_pgrams = []
                available_bestperiods = []

                for k in pf[mcol].keys():

                    if k in PFMETHODS:

                        available_pgrams.append(pf[mcol][k])

                        if k != 'win':
                            available_pfmethods.append(pf[mcol][k]['method'])
                            available_bestperiods.append(
                                pf[mcol][k]['bestperiod'])

                #
                # process periodic features for this magcol
                #
                featkey = 'periodicfeatures-%s' % mcol
                resultdict[featkey] = {}

                # first, handle the periodogram features
                pgramfeat = periodicfeatures.periodogram_features(
                    available_pgrams,
                    times,
                    mags,
                    errs,
                    sigclip=sigclip,
                    pdiff_threshold=pdiff_threshold,
                    sidereal_threshold=sidereal_threshold,
                    sampling_peak_multiplier=sampling_peak_multiplier,
                    sampling_startp=sampling_startp,
                    sampling_endp=sampling_endp,
                    verbose=verbose)
                resultdict[featkey].update(pgramfeat)

                resultdict[featkey]['pfmethods'] = available_pfmethods

                # then for each bestperiod, get phasedlc and lcfit features
                for _ind, pfm, bp in zip(range(len(available_bestperiods)),
                                         available_pfmethods,
                                         available_bestperiods):

                    resultdict[featkey][pfm] = periodicfeatures.lcfit_features(
                        times,
                        mags,
                        errs,
                        bp,
                        fourierorder=fourierorder,
                        transitparams=transitparams,
                        ebparams=ebparams,
                        sigclip=sigclip,
                        magsarefluxes=magsarefluxes,
                        verbose=verbose)

                    phasedlcfeat = periodicfeatures.phasedlc_features(
                        times,
                        mags,
                        errs,
                        bp,
                        nbrtimes=nbrtimes,
                        nbrmags=nbrmags,
                        nbrerrs=nbrerrs)

                    resultdict[featkey][pfm].update(phasedlcfeat)

            else:

                LOGERROR('not enough finite measurements in magcol: %s, for '
                         'pfpickle: %s, skipping this magcol' %
                         (mcol, pfpickle))
                featkey = 'periodicfeatures-%s' % mcol
                resultdict[featkey] = None

        #
        # end of per magcol processing
        #
        # write resultdict to pickle
        outfile = os.path.join(
            outdir,
            'periodicfeatures-%s.pkl' % squeeze(objectid).replace(' ', '-'))
        with open(outfile, 'wb') as outfd:
            pickle.dump(resultdict, outfd, pickle.HIGHEST_PROTOCOL)

        return outfile

    except Exception:

        LOGEXCEPTION('failed to run for pf: %s, lcfile: %s' %
                     (pfpickle, lcfile))
        if raiseonfail:
            raise
        else:
            return None
Ejemplo n.º 6
0
def parallel_epd_lcdir(lcdir,
                       externalparams,
                       lcfileglob=None,
                       timecols=None,
                       magcols=None,
                       errcols=None,
                       lcformat='hat-sql',
                       lcformatdir=None,
                       epdsmooth_sigclip=3.0,
                       epdsmooth_windowsize=21,
                       epdsmooth_func=smooth_magseries_savgol,
                       epdsmooth_extraparams=None,
                       nworkers=NCPUS,
                       maxworkertasks=1000):
    '''This applies EPD in parallel to all LCs in a directory.

    Parameters
    ----------

    lcdir : str
        The light curve directory to process.

    externalparams : dict or None
        This is a dict that indicates which keys in the lcdict obtained from the
        lcfile correspond to the required external parameters. As with timecol,
        magcol, and errcol, these can be simple keys (e.g. 'rjd') or compound
        keys ('magaperture1.mags'). The dict should look something like::

          {'fsv':'<lcdict key>' array: S values for each observation,
           'fdv':'<lcdict key>' array: D values for each observation,
           'fkv':'<lcdict key>' array: K values for each observation,
           'xcc':'<lcdict key>' array: x coords for each observation,
           'ycc':'<lcdict key>' array: y coords for each observation,
           'bgv':'<lcdict key>' array: sky background for each observation,
           'bge':'<lcdict key>' array: sky background err for each observation,
           'iha':'<lcdict key>' array: hour angle for each observation,
           'izd':'<lcdict key>' array: zenith distance for each observation}

    lcfileglob : str or None
        A UNIX fileglob to use to select light curve files in `lcdir`. If this
        is not None, the value provided will override the default fileglob for
        your light curve format.

    timecols,magcols,errcols : lists of str
        The keys in the lcdict produced by your light curve reader function that
        correspond to the times, mags/fluxes, and associated measurement errors
        that will be used as inputs to the EPD process. If these are None, the
        default values for `timecols`, `magcols`, and `errcols` for your light
        curve format will be used here.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    epdsmooth_sigclip : float or int or sequence of two floats/ints or None
        This specifies how to sigma-clip the input LC before fitting the EPD
        function to it.

        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    epdsmooth_windowsize : int
        This is the number of LC points to smooth over to generate a smoothed
        light curve that will be used to fit the EPD function.

    epdsmooth_func : Python function
        This sets the smoothing filter function to use. A Savitsky-Golay filter
        is used to smooth the light curve by default. The functions that can be
        used with this kwarg are listed in `varbase.trends`. If you want to use
        your own function, it MUST have the following signature::

                def smoothfunc(mags_array, window_size, **extraparams)

        and return a numpy array of the same size as `mags_array` with the
        smoothed time-series. Any extra params can be provided using the
        `extraparams` dict.

    epdsmooth_extraparams : dict
        This is a dict of any extra filter params to supply to the smoothing
        function.

    nworkers : int
        The number of parallel workers to launch when processing the LCs.

    maxworkertasks : int
        The maximum number of tasks a parallel worker will complete before it is
        replaced with a new one (sometimes helps with memory-leaks).

    Returns
    -------

    dict
        Returns a dict organized by all the keys in the input `magcols` list,
        containing lists of EPD pickle light curves for that `magcol`.

    Notes
    -----

    - S -> measure of PSF sharpness (~1/sigma^2 sosmaller S = wider PSF)
    - D -> measure of PSF ellipticity in xy direction
    - K -> measure of PSF ellipticity in cross direction

    S, D, K are related to the PSF's variance and covariance, see eqn 30-33 in
    A. Pal's thesis: https://arxiv.org/abs/0906.3486

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (fileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # find all the files matching the lcglob in lcdir
    if lcfileglob is None:
        lcfileglob = fileglob

    lclist = sorted(glob.glob(os.path.join(lcdir, lcfileglob)))

    return parallel_epd_lclist(lclist,
                               externalparams,
                               timecols=timecols,
                               magcols=magcols,
                               errcols=errcols,
                               lcformat=lcformat,
                               epdsmooth_sigclip=epdsmooth_sigclip,
                               epdsmooth_windowsize=epdsmooth_windowsize,
                               epdsmooth_func=epdsmooth_func,
                               epdsmooth_extraparams=epdsmooth_extraparams,
                               nworkers=nworkers,
                               maxworkertasks=maxworkertasks)
Ejemplo n.º 7
0
def apply_epd_magseries(lcfile,
                        timecol,
                        magcol,
                        errcol,
                        externalparams,
                        lcformat='hat-sql',
                        lcformatdir=None,
                        epdsmooth_sigclip=3.0,
                        epdsmooth_windowsize=21,
                        epdsmooth_func=smooth_magseries_savgol,
                        epdsmooth_extraparams=None):
    '''This applies external parameter decorrelation (EPD) to a light curve.

    Parameters
    ----------

    lcfile : str
        The filename of the light curve file to process.

    timecol,magcol,errcol : str
        The keys in the lcdict produced by your light curve reader function that
        correspond to the times, mags/fluxes, and associated measurement errors
        that will be used as input to the EPD process.

    externalparams : dict or None
        This is a dict that indicates which keys in the lcdict obtained from the
        lcfile correspond to the required external parameters. As with timecol,
        magcol, and errcol, these can be simple keys (e.g. 'rjd') or compound
        keys ('magaperture1.mags'). The dict should look something like::

          {'fsv':'<lcdict key>' array: S values for each observation,
           'fdv':'<lcdict key>' array: D values for each observation,
           'fkv':'<lcdict key>' array: K values for each observation,
           'xcc':'<lcdict key>' array: x coords for each observation,
           'ycc':'<lcdict key>' array: y coords for each observation,
           'bgv':'<lcdict key>' array: sky background for each observation,
           'bge':'<lcdict key>' array: sky background err for each observation,
           'iha':'<lcdict key>' array: hour angle for each observation,
           'izd':'<lcdict key>' array: zenith distance for each observation}

        Alternatively, if these exact keys are already present in the lcdict,
        indicate this by setting externalparams to None.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    epdsmooth_sigclip : float or int or sequence of two floats/ints or None
        This specifies how to sigma-clip the input LC before fitting the EPD
        function to it.

        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    epdsmooth_windowsize : int
        This is the number of LC points to smooth over to generate a smoothed
        light curve that will be used to fit the EPD function.

    epdsmooth_func : Python function
        This sets the smoothing filter function to use. A Savitsky-Golay filter
        is used to smooth the light curve by default. The functions that can be
        used with this kwarg are listed in `varbase.trends`. If you want to use
        your own function, it MUST have the following signature::

                def smoothfunc(mags_array, window_size, **extraparams)

        and return a numpy array of the same size as `mags_array` with the
        smoothed time-series. Any extra params can be provided using the
        `extraparams` dict.

    epdsmooth_extraparams : dict
        This is a dict of any extra filter params to supply to the smoothing
        function.

    Returns
    -------

    str
        Writes the output EPD light curve to a pickle that contains the lcdict
        with an added `lcdict['epd']` key, which contains the EPD times,
        mags/fluxes, and errs as `lcdict['epd']['times']`,
        `lcdict['epd']['mags']`, and `lcdict['epd']['errs']`. Returns the
        filename of this generated EPD LC pickle file.

    Notes
    -----

    - S -> measure of PSF sharpness (~1/sigma^2 sosmaller S = wider PSF)
    - D -> measure of PSF ellipticity in xy direction
    - K -> measure of PSF ellipticity in cross direction

    S, D, K are related to the PSF's variance and covariance, see eqn 30-33 in
    A. Pal's thesis: https://arxiv.org/abs/0906.3486

    '''
    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    lcdict = readerfunc(lcfile)
    if ((isinstance(lcdict, (tuple, list))) and isinstance(lcdict[0], dict)):
        lcdict = lcdict[0]

    objectid = lcdict['objectid']
    times, mags, errs = lcdict[timecol], lcdict[magcol], lcdict[errcol]

    if externalparams is not None:

        fsv = lcdict[externalparams['fsv']]
        fdv = lcdict[externalparams['fdv']]
        fkv = lcdict[externalparams['fkv']]

        xcc = lcdict[externalparams['xcc']]
        ycc = lcdict[externalparams['ycc']]

        bgv = lcdict[externalparams['bgv']]
        bge = lcdict[externalparams['bge']]

        iha = lcdict[externalparams['iha']]
        izd = lcdict[externalparams['izd']]

    else:

        fsv = lcdict['fsv']
        fdv = lcdict['fdv']
        fkv = lcdict['fkv']

        xcc = lcdict['xcc']
        ycc = lcdict['ycc']

        bgv = lcdict['bgv']
        bge = lcdict['bge']

        iha = lcdict['iha']
        izd = lcdict['izd']

    # apply the corrections for EPD
    epd = epd_magseries(times,
                        mags,
                        errs,
                        fsv,
                        fdv,
                        fkv,
                        xcc,
                        ycc,
                        bgv,
                        bge,
                        iha,
                        izd,
                        magsarefluxes=magsarefluxes,
                        epdsmooth_sigclip=epdsmooth_sigclip,
                        epdsmooth_windowsize=epdsmooth_windowsize,
                        epdsmooth_func=epdsmooth_func,
                        epdsmooth_extraparams=epdsmooth_extraparams)

    # save the EPD magseries to a pickle LC
    lcdict['epd'] = epd
    outfile = os.path.join(
        os.path.dirname(lcfile),
        '%s-epd-%s-pklc.pkl' % (squeeze(objectid).replace(' ', '-'), magcol))
    with open(outfile, 'wb') as outfd:
        pickle.dump(lcdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    return outfile
Ejemplo n.º 8
0
def runpf(lcfile,
          outdir,
          timecols=None,
          magcols=None,
          errcols=None,
          lcformat='hat-sql',
          lcformatdir=None,
          pfmethods=('gls','pdm','mav','win'),
          pfkwargs=({},{},{},{}),
          sigclip=10.0,
          getblssnr=False,
          nworkers=NCPUS,
          minobservations=500,
          excludeprocessed=False,
          raiseonfail=False):
    '''This runs the period-finding for a single LC.

    Parameters
    ----------

    lcfile : str
        The light curve file to run period-finding on.

    outdir : str
        The output directory where the result pickle will go.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    pfmethods : list of str
        This is a list of period finding methods to run. Each element is a
        string matching the keys of the `PFMETHODS` dict above. By default, this
        runs GLS, PDM, AoVMH, and the spectral window Lomb-Scargle periodogram.

    pfkwargs : list of dicts
        This is used to provide any special kwargs as dicts to each
        period-finding method function specified in `pfmethods`.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    getblssnr : bool
        If this is True and BLS is one of the methods specified in `pfmethods`,
        will also calculate the stats for each best period in the BLS results:
        transit depth, duration, ingress duration, refit period and epoch, and
        the SNR of the transit.

    nworkers : int
        The number of parallel period-finding workers to launch.

    minobservations : int
        The minimum number of finite LC points required to process a light
        curve.

    excludeprocessed : bool
        If this is True, light curves that have existing period-finding result
        pickles in `outdir` will not be processed.

        FIXME: currently, this uses a dumb method of excluding already-processed
        files. A smarter way to do this is to (i) generate a SHA512 cachekey
        based on a repr of `{'lcfile', 'timecols', 'magcols', 'errcols',
        'lcformat', 'pfmethods', 'sigclip', 'getblssnr', 'pfkwargs'}`, (ii) make
        sure all list kwargs in the dict are sorted, (iii) check if the output
        file has the same cachekey in its filename (last 8 chars of cachekey
        should work), so the result was processed in exactly the same way as
        specifed in the input to this function, and can therefore be
        ignored. Will implement this later.

    raiseonfail : bool
        If something fails and this is True, will raise an Exception instead of
        returning None at the end.

    Returns
    -------

    str
        The path to the output period-finding result pickle.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    try:

        # get the LC into a dict
        lcdict = readerfunc(lcfile)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if ( (isinstance(lcdict, (list, tuple))) and
             (isinstance(lcdict[0], dict)) ):
            lcdict = lcdict[0]

        outfile = os.path.join(outdir, 'periodfinding-%s.pkl' %
                               squeeze(lcdict['objectid']).replace(' ', '-'))

        # if excludeprocessed is True, return the output file if it exists and
        # has a size that is at least 100 kilobytes (this should be enough to
        # contain the minimal results of this function).
        if excludeprocessed:

            test_outfile = os.path.exists(outfile)
            test_outfile_gz = os.path.exists(outfile+'.gz')

            if (test_outfile and os.stat(outfile).st_size > 102400):

                LOGWARNING('periodfinding result for %s already exists at %s, '
                           'skipping because excludeprocessed=True'
                           % (lcfile, outfile))
                return outfile

            elif (test_outfile_gz and os.stat(outfile+'.gz').st_size > 102400):

                LOGWARNING(
                    'gzipped periodfinding result for %s already '
                    'exists at %s, skipping because excludeprocessed=True'
                    % (lcfile, outfile+'.gz')
                )
                return outfile+'.gz'


        # this is the final returndict
        resultdict = {
            'objectid':lcdict['objectid'],
            'lcfbasename':os.path.basename(lcfile),
            'kwargs':{'timecols':timecols,
                      'magcols':magcols,
                      'errcols':errcols,
                      'lcformat':lcformat,
                      'lcformatdir':lcformatdir,
                      'pfmethods':pfmethods,
                      'pfkwargs':pfkwargs,
                      'sigclip':sigclip,
                      'getblssnr':getblssnr}
        }

        # normalize using the special function if specified
        if normfunc is not None:
            lcdict = normfunc(lcdict)

        for tcol, mcol, ecol in zip(timecols, magcols, errcols):

            # dereference the columns and get them from the lcdict
            if '.' in tcol:
                tcolget = tcol.split('.')
            else:
                tcolget = [tcol]
            times = _dict_get(lcdict, tcolget)

            if '.' in mcol:
                mcolget = mcol.split('.')
            else:
                mcolget = [mcol]
            mags = _dict_get(lcdict, mcolget)

            if '.' in ecol:
                ecolget = ecol.split('.')
            else:
                ecolget = [ecol]
            errs = _dict_get(lcdict, ecolget)


            # normalize here if not using special normalization
            if normfunc is None:
                ntimes, nmags = normalize_magseries(
                    times, mags,
                    magsarefluxes=magsarefluxes
                )

                times, mags, errs = ntimes, nmags, errs


            # run each of the requested period-finder functions
            resultdict[mcol] = {}

            # check if we have enough non-nan observations to proceed
            finmags = mags[np.isfinite(mags)]

            if finmags.size < minobservations:

                LOGERROR('not enough non-nan observations for '
                         'this LC. have: %s, required: %s, '
                         'magcol: %s, skipping...' %
                         (finmags.size, minobservations, mcol))
                continue

            pfmkeys = []

            for pfmind, pfm, pfkw in zip(range(len(pfmethods)),
                                         pfmethods,
                                         pfkwargs):

                pf_func = PFMETHODS[pfm]

                # get any optional kwargs for this function
                pf_kwargs = pfkw
                pf_kwargs.update({'verbose':False,
                                  'nworkers':nworkers,
                                  'magsarefluxes':magsarefluxes,
                                  'sigclip':sigclip})

                # we'll always prefix things with their index to allow multiple
                # invocations and results from the same period-finder (for
                # different period ranges, for example).
                pfmkey = '%s-%s' % (pfmind, pfm)
                pfmkeys.append(pfmkey)

                # run this period-finder and save its results to the output dict
                resultdict[mcol][pfmkey] = pf_func(
                    times, mags, errs,
                    **pf_kwargs
                )


            #
            # done with running the period finders
            #
            # append the pfmkeys list to the magcol dict
            resultdict[mcol]['pfmethods'] = pfmkeys

            # check if we need to get the SNR from any BLS pfresults
            if 'bls' in pfmethods and getblssnr:

                # we need to scan thru the pfmethods to get to any BLS pfresults
                for pfmk in resultdict[mcol]['pfmethods']:

                    if 'bls' in pfmk:

                        try:

                            bls = resultdict[mcol][pfmk]

                            # calculate the SNR for the BLS as well
                            blssnr = bls_snr(bls, times, mags, errs,
                                             magsarefluxes=magsarefluxes,
                                             verbose=False)

                            # add the SNR results to the BLS result dict
                            resultdict[mcol][pfmk].update({
                                'snr':blssnr['snr'],
                                'transitdepth':blssnr['transitdepth'],
                                'transitduration':blssnr['transitduration'],
                            })

                            # update the BLS result dict with the refit periods
                            # and epochs using the results from bls_snr
                            resultdict[mcol][pfmk].update({
                                'nbestperiods':blssnr['period'],
                                'epochs':blssnr['epoch']
                            })

                        except Exception as e:

                            LOGEXCEPTION('could not calculate BLS SNR for %s' %
                                         lcfile)
                            # add the SNR null results to the BLS result dict
                            resultdict[mcol][pfmk].update({
                                'snr':[np.nan,np.nan,np.nan,np.nan,np.nan],
                                'transitdepth':[np.nan,np.nan,np.nan,
                                                np.nan,np.nan],
                                'transitduration':[np.nan,np.nan,np.nan,
                                                   np.nan,np.nan],
                            })

            elif 'bls' in pfmethods:

                # we need to scan thru the pfmethods to get to any BLS pfresults
                for pfmk in resultdict[mcol]['pfmethods']:

                    if 'bls' in pfmk:

                        # add the SNR null results to the BLS result dict
                        resultdict[mcol][pfmk].update({
                            'snr':[np.nan,np.nan,np.nan,np.nan,np.nan],
                            'transitdepth':[np.nan,np.nan,np.nan,
                                            np.nan,np.nan],
                            'transitduration':[np.nan,np.nan,np.nan,
                                               np.nan,np.nan],
                        })


        # once all mag cols have been processed, write out the pickle
        with open(outfile, 'wb') as outfd:
            pickle.dump(resultdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

        return outfile

    except Exception as e:

        LOGEXCEPTION('failed to run for %s, because: %s' % (lcfile, e))

        if raiseonfail:
            raise

        return None
Ejemplo n.º 9
0
def parallel_pf_lcdir(lcdir,
                      outdir,
                      fileglob=None,
                      recursive=True,
                      timecols=None,
                      magcols=None,
                      errcols=None,
                      lcformat='hat-sql',
                      lcformatdir=None,
                      pfmethods=('gls','pdm','mav','win'),
                      pfkwargs=({},{},{},{}),
                      sigclip=10.0,
                      getblssnr=False,
                      nperiodworkers=NCPUS,
                      ncontrolworkers=1,
                      liststartindex=None,
                      listmaxobjects=None,
                      minobservations=500,
                      excludeprocessed=True):
    '''This runs parallel light curve period finding for directory of LCs.

    Parameters
    ----------

    lcdir : str
        The directory containing the LCs to process.

    outdir : str
        The directory where the resulting period-finding pickles will go.

    fileglob : str or None
        The UNIX file glob to use to search for LCs in `lcdir`. If None, the
        default file glob associated with the registered LC format will be used
        instead.

    recursive : bool
        If True, will search recursively in `lcdir` for light curves to process.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the features.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the features.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the features.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    pfmethods : list of str
        This is a list of period finding methods to run. Each element is a
        string matching the keys of the `PFMETHODS` dict above. By default, this
        runs GLS, PDM, AoVMH, and the spectral window Lomb-Scargle periodogram.

    pfkwargs : list of dicts
        This is used to provide any special kwargs as dicts to each
        period-finding method function specified in `pfmethods`.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    getblssnr : bool
        If this is True and BLS is one of the methods specified in `pfmethods`,
        will also calculate the stats for each best period in the BLS results:
        transit depth, duration, ingress duration, refit period and epoch, and
        the SNR of the transit.

    nperiodworkers : int
        The number of parallel period-finding workers to launch per object task.

    ncontrolworkers : int
        The number of controlling processes to launch. This effectively sets how
        many objects from `lclist` will be processed in parallel.

    liststartindex : int or None
        This sets the index from where to start in `lclist`.

    listmaxobjects : int or None
        This sets the maximum number of objects in `lclist` to run
        period-finding for in this invocation. Together with `liststartindex`,
        `listmaxobjects` can be used to distribute processing over several
        independent machines if the number of light curves is very large.

    minobservations : int
        The minimum number of finite LC points required to process a light
        curve.

    excludeprocessed : bool
        If this is True, light curves that have existing period-finding result
        pickles in `outdir` will not be processed.

        FIXME: currently, this uses a dumb method of excluding already-processed
        files. A smarter way to do this is to (i) generate a SHA512 cachekey
        based on a repr of `{'lcfile', 'timecols', 'magcols', 'errcols',
        'lcformat', 'pfmethods', 'sigclip', 'getblssnr', 'pfkwargs'}`, (ii) make
        sure all list kwargs in the dict are sorted, (iii) check if the output
        file has the same cachekey in its filename (last 8 chars of cachekey
        should work), so the result was processed in exactly the same way as
        specifed in the input to this function, and can therefore be
        ignored. Will implement this later.

    Returns
    -------

    list of str
        A list of the period-finding pickles created for all of input LCs
        processed.

    '''

    try:
        formatinfo = get_lcformat(lcformat,
                                  use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc,
             dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if not fileglob:
        fileglob = dfileglob

    # now find the files
    LOGINFO('searching for %s light curves in %s ...' % (lcformat, lcdir))

    if recursive is False:
        matching = glob.glob(os.path.join(lcdir, fileglob))

    else:
        # use recursive glob for Python 3.5+
        if sys.version_info[:2] > (3,4):

            matching = glob.glob(os.path.join(lcdir,
                                              '**',
                                              fileglob),recursive=True)

        # otherwise, use os.walk and glob
        else:

            # use os.walk to go through the directories
            walker = os.walk(lcdir)
            matching = []

            for root, dirs, _files in walker:
                for sdir in dirs:
                    searchpath = os.path.join(root,
                                              sdir,
                                              fileglob)
                    foundfiles = glob.glob(searchpath)

                    if foundfiles:
                        matching.extend(foundfiles)


    # now that we have all the files, process them
    if matching and len(matching) > 0:

        # this helps us process things in deterministic order when we distribute
        # processing over several machines
        matching = sorted(matching)

        LOGINFO('found %s light curves, running pf...' % len(matching))

        return parallel_pf(matching,
                           outdir,
                           timecols=timecols,
                           magcols=magcols,
                           errcols=errcols,
                           lcformat=lcformat,
                           lcformatdir=lcformatdir,
                           pfmethods=pfmethods,
                           pfkwargs=pfkwargs,
                           getblssnr=getblssnr,
                           sigclip=sigclip,
                           nperiodworkers=nperiodworkers,
                           ncontrolworkers=ncontrolworkers,
                           liststartindex=liststartindex,
                           listmaxobjects=listmaxobjects,
                           minobservations=minobservations,
                           excludeprocessed=excludeprocessed)

    else:

        LOGERROR('no light curve files in %s format found in %s' % (lcformat,
                                                                    lcdir))
        return None
Ejemplo n.º 10
0
def plot_variability_thresholds(varthreshpkl,
                                xmin_lcmad_stdev=5.0,
                                xmin_stetj_stdev=2.0,
                                xmin_iqr_stdev=2.0,
                                xmin_inveta_stdev=2.0,
                                lcformat='hat-sql',
                                lcformatdir=None,
                                magcols=None):
    '''This makes plots for the variability threshold distributions.

    Parameters
    ----------

    varthreshpkl : str
        The pickle produced by the function above.

    xmin_lcmad_stdev,xmin_stetj_stdev,xmin_iqr_stdev,xmin_inveta_stdev : float or np.array
        Values of the threshold values to override the ones in the
        `vartresholdpkl`. If provided, will plot the thresholds accordingly
        instead of using the ones in the input pickle directly.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    magcols : list of str or None
        The magcol keys to use from the lcdict.

    Returns
    -------

    str
        The file name of the threshold plot generated.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if magcols is None:
        magcols = dmagcols

    with open(varthreshpkl, 'rb') as infd:
        allobjects = pickle.load(infd)

    magbins = allobjects['magbins']

    for magcol in magcols:

        min_lcmad_stdev = (xmin_lcmad_stdev
                           or allobjects[magcol]['min_lcmad_stdev'])
        min_stetj_stdev = (xmin_stetj_stdev
                           or allobjects[magcol]['min_stetj_stdev'])
        min_iqr_stdev = (xmin_iqr_stdev or allobjects[magcol]['min_iqr_stdev'])
        min_inveta_stdev = (xmin_inveta_stdev
                            or allobjects[magcol]['min_inveta_stdev'])

        fig = plt.figure(figsize=(20, 16))

        # the mag vs lcmad
        plt.subplot(221)
        plt.plot(allobjects[magcol]['sdssr'],
                 allobjects[magcol]['lcmad'] * 1.483,
                 marker='.',
                 ms=1.0,
                 linestyle='none',
                 rasterized=True)
        plt.plot(allobjects[magcol]['binned_sdssr_median'],
                 np.array(allobjects[magcol]['binned_lcmad_median']) * 1.483,
                 linewidth=3.0)
        plt.plot(allobjects[magcol]['binned_sdssr_median'],
                 np.array(allobjects[magcol]['binned_lcmad_median']) * 1.483 +
                 min_lcmad_stdev *
                 np.array(allobjects[magcol]['binned_lcmad_stdev']),
                 linewidth=3.0,
                 linestyle='dashed')
        plt.xlim((magbins.min() - 0.25, magbins.max()))
        plt.xlabel('SDSS r')
        plt.ylabel(r'lightcurve RMS (MAD $\times$ 1.483)')
        plt.title('%s - SDSS r vs. light curve RMS' % magcol)
        plt.yscale('log')
        plt.tight_layout()

        # the mag vs stetsonj
        plt.subplot(222)
        plt.plot(allobjects[magcol]['sdssr'],
                 allobjects[magcol]['stetsonj'],
                 marker='.',
                 ms=1.0,
                 linestyle='none',
                 rasterized=True)
        plt.plot(allobjects[magcol]['binned_sdssr_median'],
                 allobjects[magcol]['binned_stetsonj_median'],
                 linewidth=3.0)
        plt.plot(allobjects[magcol]['binned_sdssr_median'],
                 np.array(allobjects[magcol]['binned_stetsonj_median']) +
                 min_stetj_stdev *
                 np.array(allobjects[magcol]['binned_stetsonj_stdev']),
                 linewidth=3.0,
                 linestyle='dashed')
        plt.xlim((magbins.min() - 0.25, magbins.max()))
        plt.xlabel('SDSS r')
        plt.ylabel('Stetson J index')
        plt.title('%s - SDSS r vs. Stetson J index' % magcol)
        plt.yscale('log')
        plt.tight_layout()

        # the mag vs IQR
        plt.subplot(223)
        plt.plot(allobjects[magcol]['sdssr'],
                 allobjects[magcol]['iqr'],
                 marker='.',
                 ms=1.0,
                 linestyle='none',
                 rasterized=True)
        plt.plot(allobjects[magcol]['binned_sdssr_median'],
                 allobjects[magcol]['binned_iqr_median'],
                 linewidth=3.0)
        plt.plot(
            allobjects[magcol]['binned_sdssr_median'],
            np.array(allobjects[magcol]['binned_iqr_median']) +
            min_iqr_stdev * np.array(allobjects[magcol]['binned_iqr_stdev']),
            linewidth=3.0,
            linestyle='dashed')
        plt.xlabel('SDSS r')
        plt.ylabel('IQR')
        plt.title('%s - SDSS r vs. IQR' % magcol)
        plt.xlim((magbins.min() - 0.25, magbins.max()))
        plt.yscale('log')
        plt.tight_layout()

        # the mag vs IQR
        plt.subplot(224)
        plt.plot(allobjects[magcol]['sdssr'],
                 allobjects[magcol]['inveta'],
                 marker='.',
                 ms=1.0,
                 linestyle='none',
                 rasterized=True)
        plt.plot(allobjects[magcol]['binned_sdssr_median'],
                 allobjects[magcol]['binned_inveta_median'],
                 linewidth=3.0)
        plt.plot(allobjects[magcol]['binned_sdssr_median'],
                 np.array(allobjects[magcol]['binned_inveta_median']) +
                 min_inveta_stdev *
                 np.array(allobjects[magcol]['binned_inveta_stdev']),
                 linewidth=3.0,
                 linestyle='dashed')
        plt.xlabel('SDSS r')
        plt.ylabel(r'$1/\eta$')
        plt.title(r'%s - SDSS r vs. $1/\eta$' % magcol)
        plt.xlim((magbins.min() - 0.25, magbins.max()))
        plt.yscale('log')
        plt.tight_layout()

        plt.savefig('varfeatures-%s-%s-distributions.png' %
                    (varthreshpkl, magcol),
                    bbox_inches='tight')
        plt.close('all')
Ejemplo n.º 11
0
def variability_threshold(featuresdir,
                          outfile,
                          magbins=DEFAULT_MAGBINS,
                          maxobjects=None,
                          timecols=None,
                          magcols=None,
                          errcols=None,
                          lcformat='hat-sql',
                          lcformatdir=None,
                          min_lcmad_stdev=5.0,
                          min_stetj_stdev=2.0,
                          min_iqr_stdev=2.0,
                          min_inveta_stdev=2.0,
                          verbose=True):
    '''This generates a list of objects with stetson J, IQR, and 1.0/eta
    above some threshold value to select them as potential variable stars.

    Use this to pare down the objects to review and put through
    period-finding. This does the thresholding per magnitude bin; this should be
    better than one single cut through the entire magnitude range. Set the
    magnitude bins using the magbins kwarg.

    FIXME: implement a voting classifier here. this will choose variables based
    on the thresholds in IQR, stetson, and inveta based on weighting carried
    over from the variability recovery sims.

    Parameters
    ----------

    featuresdir : str
        This is the directory containing variability feature pickles created by
        :py:func:`astrobase.lcproc.lcpfeatures.parallel_varfeatures` or similar.

    outfile : str
        This is the output pickle file that will contain all the threshold
        information.

    magbins : np.array of floats
        This sets the magnitude bins to use for calculating thresholds.

    maxobjects : int or None
        This is the number of objects to process. If None, all objects with
        feature pickles in `featuresdir` will be processed.

    timecols : list of str or None
        The timecol keys to use from the lcdict in calculating the thresholds.

    magcols : list of str or None
        The magcol keys to use from the lcdict in calculating the thresholds.

    errcols : list of str or None
        The errcol keys to use from the lcdict in calculating the thresholds.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    min_lcmad_stdev,min_stetj_stdev,min_iqr_stdev,min_inveta_stdev : float or np.array
        These are all the standard deviation multiplier for the distributions of
        light curve standard deviation, Stetson J variability index, the light
        curve interquartile range, and 1/eta variability index
        respectively. These multipliers set the minimum values of these measures
        to use for selecting variable stars. If provided as floats, the same
        value will be used for all magbins. If provided as np.arrays of `size =
        magbins.size - 1`, will be used to apply possibly different sigma cuts
        for each magbin.

    verbose : bool
        If True, will report progress and warn about any problems.

    Returns
    -------

    dict
        Contains all of the variability threshold information along with indices
        into the array of the object IDs chosen as variables.

    '''
    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    # list of input pickles generated by varfeatures functions above
    pklist = glob.glob(os.path.join(featuresdir, 'varfeatures-*.pkl'))

    if maxobjects:
        pklist = pklist[:maxobjects]

    allobjects = {}

    for magcol in magcols:

        # keep local copies of these so we can fix them independently in case of
        # nans
        if (isinstance(min_stetj_stdev, list)
                or isinstance(min_stetj_stdev, np.ndarray)):
            magcol_min_stetj_stdev = min_stetj_stdev[::]
        else:
            magcol_min_stetj_stdev = min_stetj_stdev

        if (isinstance(min_iqr_stdev, list)
                or isinstance(min_iqr_stdev, np.ndarray)):
            magcol_min_iqr_stdev = min_iqr_stdev[::]
        else:
            magcol_min_iqr_stdev = min_iqr_stdev

        if (isinstance(min_inveta_stdev, list)
                or isinstance(min_inveta_stdev, np.ndarray)):
            magcol_min_inveta_stdev = min_inveta_stdev[::]
        else:
            magcol_min_inveta_stdev = min_inveta_stdev

        LOGINFO('getting all object sdssr, LC MAD, stet J, IQR, eta...')

        # we'll calculate the sigma per magnitude bin, so get the mags as well
        allobjects[magcol] = {
            'objectid': [],
            'sdssr': [],
            'lcmad': [],
            'stetsonj': [],
            'iqr': [],
            'eta': []
        }

        # fancy progress bar with tqdm if present
        if TQDM and verbose:
            listiterator = tqdm(pklist)
        else:
            listiterator = pklist

        for pkl in listiterator:

            with open(pkl, 'rb') as infd:
                thisfeatures = pickle.load(infd)

            objectid = thisfeatures['objectid']

            # the object magnitude
            if ('info' in thisfeatures and thisfeatures['info']
                    and 'sdssr' in thisfeatures['info']):

                if (thisfeatures['info']['sdssr']
                        and thisfeatures['info']['sdssr'] > 3.0):

                    sdssr = thisfeatures['info']['sdssr']

                elif (magcol in thisfeatures and thisfeatures[magcol]
                      and 'median' in thisfeatures[magcol]
                      and thisfeatures[magcol]['median'] > 3.0):

                    sdssr = thisfeatures[magcol]['median']

                elif (thisfeatures['info']['jmag']
                      and thisfeatures['info']['hmag']
                      and thisfeatures['info']['kmag']):

                    sdssr = jhk_to_sdssr(thisfeatures['info']['jmag'],
                                         thisfeatures['info']['hmag'],
                                         thisfeatures['info']['kmag'])

                else:
                    sdssr = np.nan

            else:
                sdssr = np.nan

            # the MAD of the light curve
            if (magcol in thisfeatures and thisfeatures[magcol]
                    and thisfeatures[magcol]['mad']):
                lcmad = thisfeatures[magcol]['mad']
            else:
                lcmad = np.nan

            # stetson index
            if (magcol in thisfeatures and thisfeatures[magcol]
                    and thisfeatures[magcol]['stetsonj']):
                stetsonj = thisfeatures[magcol]['stetsonj']
            else:
                stetsonj = np.nan

            # IQR
            if (magcol in thisfeatures and thisfeatures[magcol]
                    and thisfeatures[magcol]['mag_iqr']):
                iqr = thisfeatures[magcol]['mag_iqr']
            else:
                iqr = np.nan

            # eta
            if (magcol in thisfeatures and thisfeatures[magcol]
                    and thisfeatures[magcol]['eta_normal']):
                eta = thisfeatures[magcol]['eta_normal']
            else:
                eta = np.nan

            allobjects[magcol]['objectid'].append(objectid)
            allobjects[magcol]['sdssr'].append(sdssr)
            allobjects[magcol]['lcmad'].append(lcmad)
            allobjects[magcol]['stetsonj'].append(stetsonj)
            allobjects[magcol]['iqr'].append(iqr)
            allobjects[magcol]['eta'].append(eta)

        #
        # done with collection of info
        #
        LOGINFO('finding objects above thresholds per magbin...')

        # turn the info into arrays
        allobjects[magcol]['objectid'] = np.ravel(
            np.array(allobjects[magcol]['objectid']))
        allobjects[magcol]['sdssr'] = np.ravel(
            np.array(allobjects[magcol]['sdssr']))
        allobjects[magcol]['lcmad'] = np.ravel(
            np.array(allobjects[magcol]['lcmad']))
        allobjects[magcol]['stetsonj'] = np.ravel(
            np.array(allobjects[magcol]['stetsonj']))
        allobjects[magcol]['iqr'] = np.ravel(
            np.array(allobjects[magcol]['iqr']))
        allobjects[magcol]['eta'] = np.ravel(
            np.array(allobjects[magcol]['eta']))

        # only get finite elements everywhere
        thisfinind = (np.isfinite(allobjects[magcol]['sdssr'])
                      & np.isfinite(allobjects[magcol]['lcmad'])
                      & np.isfinite(allobjects[magcol]['stetsonj'])
                      & np.isfinite(allobjects[magcol]['iqr'])
                      & np.isfinite(allobjects[magcol]['eta']))
        allobjects[magcol]['objectid'] = allobjects[magcol]['objectid'][
            thisfinind]
        allobjects[magcol]['sdssr'] = allobjects[magcol]['sdssr'][thisfinind]
        allobjects[magcol]['lcmad'] = allobjects[magcol]['lcmad'][thisfinind]
        allobjects[magcol]['stetsonj'] = allobjects[magcol]['stetsonj'][
            thisfinind]
        allobjects[magcol]['iqr'] = allobjects[magcol]['iqr'][thisfinind]
        allobjects[magcol]['eta'] = allobjects[magcol]['eta'][thisfinind]

        # invert eta so we can threshold the same way as the others
        allobjects[magcol]['inveta'] = 1.0 / allobjects[magcol]['eta']

        # do the thresholding by magnitude bin
        magbininds = np.digitize(allobjects[magcol]['sdssr'], magbins)

        binned_objectids = []
        binned_sdssr = []
        binned_sdssr_median = []

        binned_lcmad = []
        binned_stetsonj = []
        binned_iqr = []
        binned_inveta = []
        binned_count = []

        binned_objectids_thresh_stetsonj = []
        binned_objectids_thresh_iqr = []
        binned_objectids_thresh_inveta = []
        binned_objectids_thresh_all = []

        binned_lcmad_median = []
        binned_lcmad_stdev = []

        binned_stetsonj_median = []
        binned_stetsonj_stdev = []

        binned_inveta_median = []
        binned_inveta_stdev = []

        binned_iqr_median = []
        binned_iqr_stdev = []

        # go through all the mag bins and get the thresholds for J, inveta, IQR
        for mbinind, magi in zip(np.unique(magbininds),
                                 range(len(magbins) - 1)):

            thisbinind = np.where(magbininds == mbinind)
            thisbin_sdssr_median = (magbins[magi] + magbins[magi + 1]) / 2.0
            binned_sdssr_median.append(thisbin_sdssr_median)

            thisbin_objectids = allobjects[magcol]['objectid'][thisbinind]
            thisbin_sdssr = allobjects[magcol]['sdssr'][thisbinind]
            thisbin_lcmad = allobjects[magcol]['lcmad'][thisbinind]
            thisbin_stetsonj = allobjects[magcol]['stetsonj'][thisbinind]
            thisbin_iqr = allobjects[magcol]['iqr'][thisbinind]
            thisbin_inveta = allobjects[magcol]['inveta'][thisbinind]
            thisbin_count = thisbin_objectids.size

            if thisbin_count > 4:

                thisbin_lcmad_median = np.median(thisbin_lcmad)
                thisbin_lcmad_stdev = np.median(
                    np.abs(thisbin_lcmad - thisbin_lcmad_median)) * 1.483
                binned_lcmad_median.append(thisbin_lcmad_median)
                binned_lcmad_stdev.append(thisbin_lcmad_stdev)

                thisbin_stetsonj_median = np.median(thisbin_stetsonj)
                thisbin_stetsonj_stdev = np.median(
                    np.abs(thisbin_stetsonj - thisbin_stetsonj_median)) * 1.483
                binned_stetsonj_median.append(thisbin_stetsonj_median)
                binned_stetsonj_stdev.append(thisbin_stetsonj_stdev)

                # now get the objects above the required stdev threshold
                if isinstance(magcol_min_stetj_stdev, float):

                    thisbin_objectids_thresh_stetsonj = thisbin_objectids[
                        thisbin_stetsonj > (
                            thisbin_stetsonj_median +
                            magcol_min_stetj_stdev * thisbin_stetsonj_stdev)]

                elif (isinstance(magcol_min_stetj_stdev, np.ndarray)
                      or isinstance(magcol_min_stetj_stdev, list)):

                    thisbin_min_stetj_stdev = magcol_min_stetj_stdev[magi]

                    if not np.isfinite(thisbin_min_stetj_stdev):
                        LOGWARNING('provided threshold stetson J stdev '
                                   'for magbin: %.3f is nan, using 2.0' %
                                   thisbin_sdssr_median)
                        thisbin_min_stetj_stdev = 2.0
                        # update the input list/array as well, since we'll be
                        # saving it to the output dict and using it to plot the
                        # variability thresholds
                        magcol_min_stetj_stdev[magi] = 2.0

                    thisbin_objectids_thresh_stetsonj = thisbin_objectids[
                        thisbin_stetsonj > (
                            thisbin_stetsonj_median +
                            thisbin_min_stetj_stdev * thisbin_stetsonj_stdev)]

                thisbin_iqr_median = np.median(thisbin_iqr)
                thisbin_iqr_stdev = np.median(
                    np.abs(thisbin_iqr - thisbin_iqr_median)) * 1.483
                binned_iqr_median.append(thisbin_iqr_median)
                binned_iqr_stdev.append(thisbin_iqr_stdev)

                # get the objects above the required stdev threshold
                if isinstance(magcol_min_iqr_stdev, float):

                    thisbin_objectids_thresh_iqr = thisbin_objectids[
                        thisbin_iqr > (
                            thisbin_iqr_median +
                            magcol_min_iqr_stdev * thisbin_iqr_stdev)]

                elif (isinstance(magcol_min_iqr_stdev, np.ndarray)
                      or isinstance(magcol_min_iqr_stdev, list)):

                    thisbin_min_iqr_stdev = magcol_min_iqr_stdev[magi]

                    if not np.isfinite(thisbin_min_iqr_stdev):
                        LOGWARNING('provided threshold IQR stdev '
                                   'for magbin: %.3f is nan, using 2.0' %
                                   thisbin_sdssr_median)
                        thisbin_min_iqr_stdev = 2.0
                        # update the input list/array as well, since we'll be
                        # saving it to the output dict and using it to plot the
                        # variability thresholds
                        magcol_min_iqr_stdev[magi] = 2.0

                    thisbin_objectids_thresh_iqr = thisbin_objectids[
                        thisbin_iqr > (
                            thisbin_iqr_median +
                            thisbin_min_iqr_stdev * thisbin_iqr_stdev)]

                thisbin_inveta_median = np.median(thisbin_inveta)
                thisbin_inveta_stdev = np.median(
                    np.abs(thisbin_inveta - thisbin_inveta_median)) * 1.483
                binned_inveta_median.append(thisbin_inveta_median)
                binned_inveta_stdev.append(thisbin_inveta_stdev)

                if isinstance(magcol_min_inveta_stdev, float):

                    thisbin_objectids_thresh_inveta = thisbin_objectids[
                        thisbin_inveta > (
                            thisbin_inveta_median +
                            magcol_min_inveta_stdev * thisbin_inveta_stdev)]

                elif (isinstance(magcol_min_inveta_stdev, np.ndarray)
                      or isinstance(magcol_min_inveta_stdev, list)):

                    thisbin_min_inveta_stdev = magcol_min_inveta_stdev[magi]

                    if not np.isfinite(thisbin_min_inveta_stdev):
                        LOGWARNING('provided threshold inveta stdev '
                                   'for magbin: %.3f is nan, using 2.0' %
                                   thisbin_sdssr_median)

                        thisbin_min_inveta_stdev = 2.0
                        # update the input list/array as well, since we'll be
                        # saving it to the output dict and using it to plot the
                        # variability thresholds
                        magcol_min_inveta_stdev[magi] = 2.0

                    thisbin_objectids_thresh_inveta = thisbin_objectids[
                        thisbin_inveta > (
                            thisbin_inveta_median +
                            thisbin_min_inveta_stdev * thisbin_inveta_stdev)]

            else:

                thisbin_objectids_thresh_stetsonj = (np.array(
                    [], dtype=np.unicode_))
                thisbin_objectids_thresh_iqr = (np.array([],
                                                         dtype=np.unicode_))
                thisbin_objectids_thresh_inveta = (np.array([],
                                                            dtype=np.unicode_))

            #
            # done with check for enough objects in the bin
            #

            # get the intersection of all threshold objects to get objects that
            # lie above the threshold for all variable indices
            thisbin_objectids_thresh_all = reduce(
                np.intersect1d, (thisbin_objectids_thresh_stetsonj,
                                 thisbin_objectids_thresh_iqr,
                                 thisbin_objectids_thresh_inveta))

            binned_objectids.append(thisbin_objectids)
            binned_sdssr.append(thisbin_sdssr)
            binned_lcmad.append(thisbin_lcmad)
            binned_stetsonj.append(thisbin_stetsonj)
            binned_iqr.append(thisbin_iqr)
            binned_inveta.append(thisbin_inveta)
            binned_count.append(thisbin_objectids.size)

            binned_objectids_thresh_stetsonj.append(
                thisbin_objectids_thresh_stetsonj)
            binned_objectids_thresh_iqr.append(thisbin_objectids_thresh_iqr)
            binned_objectids_thresh_inveta.append(
                thisbin_objectids_thresh_inveta)
            binned_objectids_thresh_all.append(thisbin_objectids_thresh_all)

        #
        # done with magbins
        #

        # update the output dict for this magcol
        allobjects[magcol]['magbins'] = magbins
        allobjects[magcol]['binned_objectids'] = binned_objectids
        allobjects[magcol]['binned_sdssr_median'] = binned_sdssr_median
        allobjects[magcol]['binned_sdssr'] = binned_sdssr
        allobjects[magcol]['binned_count'] = binned_count

        allobjects[magcol]['binned_lcmad'] = binned_lcmad
        allobjects[magcol]['binned_lcmad_median'] = binned_lcmad_median
        allobjects[magcol]['binned_lcmad_stdev'] = binned_lcmad_stdev

        allobjects[magcol]['binned_stetsonj'] = binned_stetsonj
        allobjects[magcol]['binned_stetsonj_median'] = binned_stetsonj_median
        allobjects[magcol]['binned_stetsonj_stdev'] = binned_stetsonj_stdev

        allobjects[magcol]['binned_iqr'] = binned_iqr
        allobjects[magcol]['binned_iqr_median'] = binned_iqr_median
        allobjects[magcol]['binned_iqr_stdev'] = binned_iqr_stdev

        allobjects[magcol]['binned_inveta'] = binned_inveta
        allobjects[magcol]['binned_inveta_median'] = binned_inveta_median
        allobjects[magcol]['binned_inveta_stdev'] = binned_inveta_stdev

        allobjects[magcol]['binned_objectids_thresh_stetsonj'] = (
            binned_objectids_thresh_stetsonj)
        allobjects[magcol]['binned_objectids_thresh_iqr'] = (
            binned_objectids_thresh_iqr)
        allobjects[magcol]['binned_objectids_thresh_inveta'] = (
            binned_objectids_thresh_inveta)
        allobjects[magcol]['binned_objectids_thresh_all'] = (
            binned_objectids_thresh_all)

        # get the common selected objects thru all measures
        try:
            allobjects[magcol]['objectids_all_thresh_all_magbins'] = np.unique(
                np.concatenate(
                    allobjects[magcol]['binned_objectids_thresh_all']))
        except ValueError:
            LOGWARNING('not enough variable objects matching all thresholds')
            allobjects[magcol]['objectids_all_thresh_all_magbins'] = (np.array(
                []))

        allobjects[magcol][
            'objectids_stetsonj_thresh_all_magbins'] = np.unique(
                np.concatenate(
                    allobjects[magcol]['binned_objectids_thresh_stetsonj']))
        allobjects[magcol]['objectids_inveta_thresh_all_magbins'] = np.unique(
            np.concatenate(
                allobjects[magcol]['binned_objectids_thresh_inveta']))
        allobjects[magcol]['objectids_iqr_thresh_all_magbins'] = np.unique(
            np.concatenate(allobjects[magcol]['binned_objectids_thresh_iqr']))

        # turn these into np.arrays for easier plotting if they're lists
        if isinstance(min_stetj_stdev, list):
            allobjects[magcol]['min_stetj_stdev'] = np.array(
                magcol_min_stetj_stdev)
        else:
            allobjects[magcol]['min_stetj_stdev'] = magcol_min_stetj_stdev

        if isinstance(min_iqr_stdev, list):
            allobjects[magcol]['min_iqr_stdev'] = np.array(
                magcol_min_iqr_stdev)
        else:
            allobjects[magcol]['min_iqr_stdev'] = magcol_min_iqr_stdev

        if isinstance(min_inveta_stdev, list):
            allobjects[magcol]['min_inveta_stdev'] = np.array(
                magcol_min_inveta_stdev)
        else:
            allobjects[magcol]['min_inveta_stdev'] = magcol_min_inveta_stdev

        # this one doesn't get touched (for now)
        allobjects[magcol]['min_lcmad_stdev'] = min_lcmad_stdev

    #
    # done with all magcols
    #

    allobjects['magbins'] = magbins

    with open(outfile, 'wb') as outfd:
        pickle.dump(allobjects, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    return allobjects
Ejemplo n.º 12
0
def timebinlc(lcfile,
              binsizesec,
              outdir=None,
              lcformat='hat-sql',
              lcformatdir=None,
              timecols=None,
              magcols=None,
              errcols=None,
              minbinelems=7):
    '''This bins the given light curve file in time using the specified bin size.

    Parameters
    ----------

    lcfile : str
        The file name to process.

    binsizesec : float
        The time bin-size in seconds.

    outdir : str or None
        If this is a str, the output LC will be written to `outdir`. If this is
        None, the output LC will be written to the same directory as `lcfile`.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curve file.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    timecols,magcols,errcols : lists of str
        The keys in the lcdict produced by your light curve reader function that
        correspond to the times, mags/fluxes, and associated measurement errors
        that will be used as inputs to the binning process. If these are None,
        the default values for `timecols`, `magcols`, and `errcols` for your
        light curve format will be used here.

    minbinelems : int
        The minimum number of time-bin elements required to accept a time-bin as
        valid for the output binned light curve.

    Returns
    -------

    str
        The name of the output pickle file with the binned LC.

        Writes the output binned light curve to a pickle that contains the
        lcdict with an added `lcdict['binned'][magcol]` key, which contains the
        binned times, mags/fluxes, and errs as
        `lcdict['binned'][magcol]['times']`, `lcdict['binned'][magcol]['mags']`,
        and `lcdict['epd'][magcol]['errs']` for each `magcol` provided in the
        input or default `magcols` value for this light curve format.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # override the default timecols, magcols, and errcols
    # using the ones provided to the function
    if timecols is None:
        timecols = dtimecols
    if magcols is None:
        magcols = dmagcols
    if errcols is None:
        errcols = derrcols

    # get the LC into a dict
    lcdict = readerfunc(lcfile)

    # this should handle lists/tuples being returned by readerfunc
    # we assume that the first element is the actual lcdict
    # FIXME: figure out how to not need this assumption
    if ((isinstance(lcdict, (list, tuple))) and (isinstance(lcdict[0], dict))):
        lcdict = lcdict[0]

    # skip already binned light curves
    if 'binned' in lcdict:
        LOGERROR('this light curve appears to be binned already, skipping...')
        return None

    lcdict['binned'] = {}

    for tcol, mcol, ecol in zip(timecols, magcols, errcols):

        # dereference the columns and get them from the lcdict
        if '.' in tcol:
            tcolget = tcol.split('.')
        else:
            tcolget = [tcol]
        times = _dict_get(lcdict, tcolget)

        if '.' in mcol:
            mcolget = mcol.split('.')
        else:
            mcolget = [mcol]
        mags = _dict_get(lcdict, mcolget)

        if '.' in ecol:
            ecolget = ecol.split('.')
        else:
            ecolget = [ecol]
        errs = _dict_get(lcdict, ecolget)

        # normalize here if not using special normalization
        if normfunc is None:
            ntimes, nmags = normalize_magseries(times,
                                                mags,
                                                magsarefluxes=magsarefluxes)

            times, mags, errs = ntimes, nmags, errs

        # now bin the mag series as requested
        binned = time_bin_magseries_with_errs(times,
                                              mags,
                                              errs,
                                              binsize=binsizesec,
                                              minbinelems=minbinelems)

        # put this into the special binned key of the lcdict
        lcdict['binned'][mcol] = {
            'times': binned['binnedtimes'],
            'mags': binned['binnedmags'],
            'errs': binned['binnederrs'],
            'nbins': binned['nbins'],
            'timebins': binned['jdbins'],
            'binsizesec': binsizesec
        }

    # done with binning for all magcols, now generate the output file
    # this will always be a pickle

    if outdir is None:
        outdir = os.path.dirname(lcfile)

    outfile = os.path.join(
        outdir, '%s-binned%.1fsec-%s.pkl' %
        (squeeze(lcdict['objectid']).replace(' ', '-'), binsizesec, lcformat))

    with open(outfile, 'wb') as outfd:
        pickle.dump(lcdict, outfd, protocol=pickle.HIGHEST_PROTOCOL)

    return outfile
Ejemplo n.º 13
0
def parallel_timebin_lcdir(lcdir,
                           binsizesec,
                           maxobjects=None,
                           outdir=None,
                           lcformat='hat-sql',
                           lcformatdir=None,
                           timecols=None,
                           magcols=None,
                           errcols=None,
                           minbinelems=7,
                           nworkers=NCPUS,
                           maxworkertasks=1000):
    '''
    This time bins all the light curves in the specified directory.

    Parameters
    ----------

    lcdir : list of str
        Directory containing the input LCs to process.

    binsizesec : float
        The time bin size to use in seconds.

    maxobjects : int or None
        If provided, LC processing will stop at `lclist[maxobjects]`.

    outdir : str or None
        The directory where output LCs will be written. If None, will write to
        the same directory as the input LCs.

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curve file.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    timecols,magcols,errcols : lists of str
        The keys in the lcdict produced by your light curve reader function that
        correspond to the times, mags/fluxes, and associated measurement errors
        that will be used as inputs to the binning process. If these are None,
        the default values for `timecols`, `magcols`, and `errcols` for your
        light curve format will be used here.

    minbinelems : int
        The minimum number of time-bin elements required to accept a time-bin as
        valid for the output binned light curve.

    nworkers : int
        Number of parallel workers to launch.

    maxworkertasks : int
        The maximum number of tasks a parallel worker will complete before being
        replaced to guard against memory leaks.

    Returns
    -------

    dict
        The returned dict contains keys = input LCs, vals = output LCs.

    '''
    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (fileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    lclist = sorted(glob.glob(os.path.join(lcdir, fileglob)))

    return parallel_timebin(lclist,
                            binsizesec,
                            maxobjects=maxobjects,
                            outdir=outdir,
                            lcformat=lcformat,
                            timecols=timecols,
                            magcols=magcols,
                            errcols=errcols,
                            minbinelems=minbinelems,
                            nworkers=nworkers,
                            maxworkertasks=maxworkertasks)
Ejemplo n.º 14
0
def get_starfeatures(lcfile,
                     outdir,
                     kdtree,
                     objlist,
                     lcflist,
                     neighbor_radius_arcsec,
                     deredden=True,
                     custom_bandpasses=None,
                     lcformat='hat-sql',
                     lcformatdir=None):
    '''This runs the functions from :py:func:`astrobase.varclass.starfeatures`
    on a single light curve file.

    Parameters
    ----------

    lcfile : str
        This is the LC file to extract star features for.

    outdir : str
        This is the directory to write the output pickle to.

    kdtree: scipy.spatial.cKDTree
        This is a `scipy.spatial.KDTree` or `cKDTree` used to calculate neighbor
        proximity features. This is for the light curve catalog this object is
        in.

    objlist : np.array
        This is a Numpy array of object IDs in the same order as the
        `kdtree.data` np.array. This is for the light curve catalog this object
        is in.

    lcflist : np.array
        This is a Numpy array of light curve filenames in the same order as
        `kdtree.data`. This is for the light curve catalog this object is in.

    neighbor_radius_arcsec : float
        This indicates the radius in arcsec to search for neighbors for this
        object using the light curve catalog's `kdtree`, `objlist`, `lcflist`,
        and in GAIA.

    deredden : bool
        This controls if the colors and any color classifications will be
        dereddened using 2MASS DUST.

    custom_bandpasses : dict or None
        This is a dict used to define any custom bandpasses in the
        `in_objectinfo` dict you want to make this function aware of and
        generate colors for. Use the format below for this dict::

            {
            '<bandpass_key_1>':{'dustkey':'<twomass_dust_key_1>',
                                'label':'<band_label_1>'
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            .
            ...
            .
            '<bandpass_key_N>':{'dustkey':'<twomass_dust_key_N>',
                                'label':'<band_label_N>'
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            }

        Where:

        `bandpass_key` is a key to use to refer to this bandpass in the
        `objectinfo` dict, e.g. 'sdssg' for SDSS g band

        `twomass_dust_key` is the key to use in the 2MASS DUST result table for
        reddening per band-pass. For example, given the following DUST result
        table (using http://irsa.ipac.caltech.edu/applications/DUST/)::

            |Filter_name|LamEff |A_over_E_B_V_SandF|A_SandF|A_over_E_B_V_SFD|A_SFD|
            |char       |float  |float             |float  |float           |float|
            |           |microns|                  |mags   |                |mags |
             CTIO U       0.3734              4.107   0.209            4.968 0.253
             CTIO B       0.4309              3.641   0.186            4.325 0.221
             CTIO V       0.5517              2.682   0.137            3.240 0.165
            .
            .
            ...

        The `twomass_dust_key` for 'vmag' would be 'CTIO V'. If you want to
        skip DUST lookup and want to pass in a specific reddening magnitude
        for your bandpass, use a float for the value of
        `twomass_dust_key`. If you want to skip DUST lookup entirely for
        this bandpass, use None for the value of `twomass_dust_key`.

        `band_label` is the label to use for this bandpass, e.g. 'W1' for
        WISE-1 band, 'u' for SDSS u, etc.

        The 'colors' list contains color definitions for all colors you want
        to generate using this bandpass. this list contains elements of the
        form::

            ['<bandkey1>-<bandkey2>','<BAND1> - <BAND2>']

        where the the first item is the bandpass keys making up this color,
        and the second item is the label for this color to be used by the
        frontends. An example::

            ['sdssu-sdssg','u - g']

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    Returns
    -------

    str
        Path to the output pickle containing all of the star features for this
        object.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    try:

        # get the LC into a dict
        lcdict = readerfunc(lcfile)

        # this should handle lists/tuples being returned by readerfunc
        # we assume that the first element is the actual lcdict
        # FIXME: figure out how to not need this assumption
        if ((isinstance(lcdict, (list, tuple)))
                and (isinstance(lcdict[0], dict))):
            lcdict = lcdict[0]

        resultdict = {
            'objectid': lcdict['objectid'],
            'info': lcdict['objectinfo'],
            'lcfbasename': os.path.basename(lcfile)
        }

        # run the coord features first
        coordfeat = starfeatures.coord_features(lcdict['objectinfo'])

        # next, run the color features
        colorfeat = starfeatures.color_features(
            lcdict['objectinfo'],
            deredden=deredden,
            custom_bandpasses=custom_bandpasses)

        # run a rough color classification
        colorclass = starfeatures.color_classification(colorfeat, coordfeat)

        # finally, run the neighbor features
        nbrfeat = starfeatures.neighbor_gaia_features(lcdict['objectinfo'],
                                                      kdtree,
                                                      neighbor_radius_arcsec)

        # get the objectids of the neighbors found if any
        if nbrfeat['nbrindices'].size > 0:
            nbrfeat['nbrobjectids'] = objlist[nbrfeat['nbrindices']]
            nbrfeat['closestnbrobjectid'] = objlist[
                nbrfeat['closestdistnbrind']]
            nbrfeat['closestnbrlcfname'] = lcflist[
                nbrfeat['closestdistnbrind']]

        else:
            nbrfeat['nbrobjectids'] = np.array([])
            nbrfeat['closestnbrobjectid'] = np.array([])
            nbrfeat['closestnbrlcfname'] = np.array([])

        # update the result dict
        resultdict.update(coordfeat)
        resultdict.update(colorfeat)
        resultdict.update(colorclass)
        resultdict.update(nbrfeat)

        outfile = os.path.join(
            outdir, 'starfeatures-%s.pkl' %
            squeeze(resultdict['objectid']).replace(' ', '-'))

        with open(outfile, 'wb') as outfd:
            pickle.dump(resultdict, outfd, protocol=4)

        return outfile

    except Exception as e:

        LOGEXCEPTION('failed to get star features for %s because: %s' %
                     (os.path.basename(lcfile), e))
        return None
Ejemplo n.º 15
0
def parallel_starfeatures_lcdir(lcdir,
                                outdir,
                                lc_catalog_pickle,
                                neighbor_radius_arcsec,
                                fileglob=None,
                                maxobjects=None,
                                deredden=True,
                                custom_bandpasses=None,
                                lcformat='hat-sql',
                                lcformatdir=None,
                                nworkers=NCPUS,
                                recursive=True):
    '''This runs parallel star feature extraction for a directory of LCs.

    Parameters
    ----------

    lcdir : list of str
        The directory to search for light curves.

    outdir : str
        The output directory where the results will be placed.

    lc_catalog_pickle : str
        The path to a catalog containing at a dict with least:

        - an object ID array accessible with `dict['objects']['objectid']`

        - an LC filename array accessible with `dict['objects']['lcfname']`

        - a `scipy.spatial.KDTree` or `cKDTree` object to use for finding
          neighbors for each object accessible with `dict['kdtree']`

        A catalog pickle of the form needed can be produced using
        :py:func:`astrobase.lcproc.catalogs.make_lclist` or
        :py:func:`astrobase.lcproc.catalogs.filter_lclist`.

    neighbor_radius_arcsec : float
        This indicates the radius in arcsec to search for neighbors for this
        object using the light curve catalog's `kdtree`, `objlist`, `lcflist`,
        and in GAIA.

    fileglob : str
        The UNIX file glob to use to search for the light curves in `lcdir`. If
        None, the default value for the light curve format specified will be
        used.

    maxobjects : int
        The number of objects to process from `lclist`.

    deredden : bool
        This controls if the colors and any color classifications will be
        dereddened using 2MASS DUST.

    custom_bandpasses : dict or None
        This is a dict used to define any custom bandpasses in the
        `in_objectinfo` dict you want to make this function aware of and
        generate colors for. Use the format below for this dict::

            {
            '<bandpass_key_1>':{'dustkey':'<twomass_dust_key_1>',
                                'label':'<band_label_1>'
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            .
            ...
            .
            '<bandpass_key_N>':{'dustkey':'<twomass_dust_key_N>',
                                'label':'<band_label_N>'
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            }

        Where:

        `bandpass_key` is a key to use to refer to this bandpass in the
        `objectinfo` dict, e.g. 'sdssg' for SDSS g band

        `twomass_dust_key` is the key to use in the 2MASS DUST result table for
        reddening per band-pass. For example, given the following DUST result
        table (using http://irsa.ipac.caltech.edu/applications/DUST/)::

            |Filter_name|LamEff |A_over_E_B_V_SandF|A_SandF|A_over_E_B_V_SFD|A_SFD|
            |char       |float  |float             |float  |float           |float|
            |           |microns|                  |mags   |                |mags |
             CTIO U       0.3734              4.107   0.209            4.968 0.253
             CTIO B       0.4309              3.641   0.186            4.325 0.221
             CTIO V       0.5517              2.682   0.137            3.240 0.165
            .
            .
            ...

        The `twomass_dust_key` for 'vmag' would be 'CTIO V'. If you want to
        skip DUST lookup and want to pass in a specific reddening magnitude
        for your bandpass, use a float for the value of
        `twomass_dust_key`. If you want to skip DUST lookup entirely for
        this bandpass, use None for the value of `twomass_dust_key`.

        `band_label` is the label to use for this bandpass, e.g. 'W1' for
        WISE-1 band, 'u' for SDSS u, etc.

        The 'colors' list contains color definitions for all colors you want
        to generate using this bandpass. this list contains elements of the
        form::

            ['<bandkey1>-<bandkey2>','<BAND1> - <BAND2>']

        where the the first item is the bandpass keys making up this color,
        and the second item is the label for this color to be used by the
        frontends. An example::

            ['sdssu-sdssg','u - g']

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    nworkers : int
        The number of parallel workers to launch.

    Returns
    -------

    dict
        A dict with key:val pairs of the input light curve filename and the
        output star features pickle for each LC processed.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    if not fileglob:
        fileglob = dfileglob

    # now find the files
    LOGINFO('searching for %s light curves in %s ...' % (lcformat, lcdir))

    if recursive is False:
        matching = glob.glob(os.path.join(lcdir, fileglob))

    else:
        # use recursive glob for Python 3.5+
        if sys.version_info[:2] > (3, 4):

            matching = glob.glob(os.path.join(lcdir, '**', fileglob),
                                 recursive=True)

        # otherwise, use os.walk and glob
        else:

            # use os.walk to go through the directories
            walker = os.walk(lcdir)
            matching = []

            for root, dirs, _files in walker:
                for sdir in dirs:
                    searchpath = os.path.join(root, sdir, fileglob)
                    foundfiles = glob.glob(searchpath)

                    if foundfiles:
                        matching.extend(foundfiles)

    # now that we have all the files, process them
    if matching and len(matching) > 0:

        LOGINFO('found %s light curves, getting starfeatures...' %
                len(matching))

        return parallel_starfeatures(matching,
                                     outdir,
                                     lc_catalog_pickle,
                                     neighbor_radius_arcsec,
                                     deredden=deredden,
                                     custom_bandpasses=custom_bandpasses,
                                     maxobjects=maxobjects,
                                     lcformat=lcformat,
                                     lcformatdir=lcformatdir,
                                     nworkers=nworkers)

    else:

        LOGERROR('no light curve files in %s format found in %s' %
                 (lcformat, lcdir))
        return None
Ejemplo n.º 16
0
def parallel_starfeatures(lclist,
                          outdir,
                          lc_catalog_pickle,
                          neighbor_radius_arcsec,
                          maxobjects=None,
                          deredden=True,
                          custom_bandpasses=None,
                          lcformat='hat-sql',
                          lcformatdir=None,
                          nworkers=NCPUS):
    '''This runs `get_starfeatures` in parallel for all light curves in `lclist`.

    Parameters
    ----------

    lclist : list of str
        The list of light curve file names to process.

    outdir : str
        The output directory where the results will be placed.

    lc_catalog_pickle : str
        The path to a catalog containing at a dict with least:

        - an object ID array accessible with `dict['objects']['objectid']`

        - an LC filename array accessible with `dict['objects']['lcfname']`

        - a `scipy.spatial.KDTree` or `cKDTree` object to use for finding
          neighbors for each object accessible with `dict['kdtree']`

        A catalog pickle of the form needed can be produced using
        :py:func:`astrobase.lcproc.catalogs.make_lclist` or
        :py:func:`astrobase.lcproc.catalogs.filter_lclist`.

    neighbor_radius_arcsec : float
        This indicates the radius in arcsec to search for neighbors for this
        object using the light curve catalog's `kdtree`, `objlist`, `lcflist`,
        and in GAIA.

    maxobjects : int
        The number of objects to process from `lclist`.

    deredden : bool
        This controls if the colors and any color classifications will be
        dereddened using 2MASS DUST.

    custom_bandpasses : dict or None
        This is a dict used to define any custom bandpasses in the
        `in_objectinfo` dict you want to make this function aware of and
        generate colors for. Use the format below for this dict::

            {
            '<bandpass_key_1>':{'dustkey':'<twomass_dust_key_1>',
                                'label':'<band_label_1>'
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            .
            ...
            .
            '<bandpass_key_N>':{'dustkey':'<twomass_dust_key_N>',
                                'label':'<band_label_N>'
                                'colors':[['<bandkey1>-<bandkey2>',
                                           '<BAND1> - <BAND2>'],
                                          ['<bandkey3>-<bandkey4>',
                                           '<BAND3> - <BAND4>']]},
            }

        Where:

        `bandpass_key` is a key to use to refer to this bandpass in the
        `objectinfo` dict, e.g. 'sdssg' for SDSS g band

        `twomass_dust_key` is the key to use in the 2MASS DUST result table for
        reddening per band-pass. For example, given the following DUST result
        table (using http://irsa.ipac.caltech.edu/applications/DUST/)::

            |Filter_name|LamEff |A_over_E_B_V_SandF|A_SandF|A_over_E_B_V_SFD|A_SFD|
            |char       |float  |float             |float  |float           |float|
            |           |microns|                  |mags   |                |mags |
             CTIO U       0.3734              4.107   0.209            4.968 0.253
             CTIO B       0.4309              3.641   0.186            4.325 0.221
             CTIO V       0.5517              2.682   0.137            3.240 0.165
            .
            .
            ...

        The `twomass_dust_key` for 'vmag' would be 'CTIO V'. If you want to
        skip DUST lookup and want to pass in a specific reddening magnitude
        for your bandpass, use a float for the value of
        `twomass_dust_key`. If you want to skip DUST lookup entirely for
        this bandpass, use None for the value of `twomass_dust_key`.

        `band_label` is the label to use for this bandpass, e.g. 'W1' for
        WISE-1 band, 'u' for SDSS u, etc.

        The 'colors' list contains color definitions for all colors you want
        to generate using this bandpass. this list contains elements of the
        form::

            ['<bandkey1>-<bandkey2>','<BAND1> - <BAND2>']

        where the the first item is the bandpass keys making up this color,
        and the second item is the label for this color to be used by the
        frontends. An example::

            ['sdssu-sdssg','u - g']

    lcformat : str
        This is the `formatkey` associated with your light curve format, which
        you previously passed in to the `lcproc.register_lcformat`
        function. This will be used to look up how to find and read the light
        curves specified in `basedir` or `use_list_of_filenames`.

    lcformatdir : str or None
        If this is provided, gives the path to a directory when you've stored
        your lcformat description JSONs, other than the usual directories lcproc
        knows to search for them in. Use this along with `lcformat` to specify
        an LC format JSON file that's not currently registered with lcproc.

    nworkers : int
        The number of parallel workers to launch.

    Returns
    -------

    dict
        A dict with key:val pairs of the input light curve filename and the
        output star features pickle for each LC processed.

    '''

    try:
        formatinfo = get_lcformat(lcformat, use_lcformat_dir=lcformatdir)
        if formatinfo:
            (dfileglob, readerfunc, dtimecols, dmagcols, derrcols,
             magsarefluxes, normfunc) = formatinfo
        else:
            LOGERROR("can't figure out the light curve format")
            return None
    except Exception as e:
        LOGEXCEPTION("can't figure out the light curve format")
        return None

    # make sure to make the output directory if it doesn't exist
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if maxobjects:
        lclist = lclist[:maxobjects]

    # read in the kdtree pickle
    with open(lc_catalog_pickle, 'rb') as infd:
        kdt_dict = pickle.load(infd)

    kdt = kdt_dict['kdtree']
    objlist = kdt_dict['objects']['objectid']
    objlcfl = kdt_dict['objects']['lcfname']

    tasks = [(x, outdir, kdt, objlist, objlcfl, neighbor_radius_arcsec,
              deredden, custom_bandpasses, lcformat) for x in lclist]

    with ProcessPoolExecutor(max_workers=nworkers) as executor:
        resultfutures = executor.map(_starfeatures_worker, tasks)

    results = [x for x in resultfutures]
    resdict = {os.path.basename(x): y for (x, y) in zip(lclist, results)}

    return resdict