Ejemplo n.º 1
0
 def __init__(self, headername, liststations, filename, outputdir):
     """Read a UKMO file and write one netCDF file per station.

     The data returned by ``read_ukmo`` is split by the unique values of
     its ``'ID'`` column. Stations that have no entry in ``stationdata``
     or whose longitude/latitude/elevation equals -99999 are skipped.

     :param headername: passed through to ``read_ukmo``
     :param liststations: passed through to ``read_ukmo``
     :param filename: input file; stored on the instance and passed to
         ``read_ukmo``
     :param outputdir: stored on the instance as ``self.outputdir``
     """
     self.filename = filename
     self.outputdir = outputdir
     aa = read_ukmo(headername, liststations, filename)
     station_ids = aa.csvdata['ID']
     # keys that are passed separately (or used for the file name) and
     # therefore must not be written as data variables
     excluded_keys = ('longitude', 'latitude', 'elevation', 'ID')
     for identifier in npunique(station_ids):
         try:
             # look the station record up once instead of once per field
             station = aa.stationdata[identifier.strip()]
             lon = station['longitude']
             lat = station['latitude']
             elevation = station['elevation']
         except KeyError:
             # no (complete) metadata for this identifier
             continue
         # -99999 is presumably the missing-value sentinel -- TODO confirm
         if (lon == -99999) or (lat == -99999) or (elevation == -99999):
             continue
         # list of indices belonging to this station
         idx = npwhere(station_ids == identifier)[0]
         # extract all keys for selected station identifier
         dataout = dict(
             (k, nparray(aa.csvdata[k])[idx]) for k in aa.csvdata.keys())
         stationid = dataout['ID'][0]
         # remove variables from dictionary
         for key in excluded_keys:
             dataout.pop(key, None)
         # create netcdf file
         filename = self.define_output_file(stationid)
         self.write_netcdf(filename, dataout, lon, lat, elevation)
Ejemplo n.º 2
0
def GetProbTot(H, G, Reads, error_rate, plog = False, Qscores = None, usecounts=False):
	"""Enumerate the distinct extensions of H and score each of them.

	Every permutation of the genotype G is appended to H; duplicates are
	dropped. For each distinct extension Hp the function collects the read
	probability returned by GetProbReads and a prior weight, and the weights
	are normalised to sum to one before returning.

	If usecounts is True, GetProbReads is asked for per-homologue counts as
	well and the minimum count of each extension is stored in Mins, so that
	upstream functions may threshold on it; otherwise Mins stays empty.

	Returns the tuple (Uniques, weights, probs, Mins).
	"""
	probs = []    # probability of the reads conditional on each Hp
	weights = []  # prior weight of each Hp
	Uniques = []  # distinct Hp's, in order of first appearance
	Mins = []     # minimum compatible read count per Hp (usecounts only)
	for P in makePermutation(G):
		Hp = H + P
		if Hp in Uniques:
			continue
		Uniques.append(Hp)
		if usecounts:
			read_prob, read_counts = GetProbReads(Reads, Hp, error_rate, plog, Qscores, True)
			probs.append(read_prob)
			Mins.append(min(read_counts))
		else:
			probs.append(GetProbReads(Reads, Hp, error_rate, plog, Qscores))
		# drop the '-' entries from every element returned by GetVS()
		trimmed = []
		for entry in Hp.GetVS():
			entry_arr = array(entry)
			missing = npwhere(entry_arr == '-')
			trimmed.append(npdel(entry_arr, missing).tolist())
		try:
			# only the last two remaining entries of each element are used
			tail_pairs = [(item[-2], item[-1]) for item in trimmed]
			n_orderings = len(set(itertools.permutations(tuple(tail_pairs))))
			weights.append(exp(GetLogProbH(Haplotypes(
				1, 2, 1, log(n_orderings), None, None, *tail_pairs))))
		except IndexError:
			# fewer than two entries left somewhere: fall back to weight 1
			weights.append(1)
	total = float(sum(weights))
	weights = [w / total for w in weights]
	return Uniques, weights, probs, Mins
Ejemplo n.º 3
0
def add_uniprot_descriptors(df, debug):
    '''
    Adds uniprot category flags from preprocessed uniprot feature set
    Can handle holder since it just relies on residue having a uniprot position

    The uniprot accession is taken from the first row of df only, so df is
    presumably expected to hold residues of a single uniprot entry.

    df: DataFrame with at least 'uniprot' and 'uniprot_position' columns
    debug: if true, progress messages are printed
    returns: df left-merged with the uniprot feature columns, missing values
             replaced by the HOLDERS defaults, duplicate rows dropped
    '''
    global UNP_DF
    current_unp = df.iloc[0].uniprot
    if UNP_DF is None:
        UNP_DF = load_uniprot_df() #TODO: move this to do at the beginning in main

    debug_head = "DEBUG: descriptors: add_uniprot_descriptors: "
    # print(...) with one argument behaves the same on Python 2 and Python 3;
    # the former print statements were a SyntaxError on Python 3
    if debug:
        print(debug_head+"Adding uniprot category descriptors")
    unpdf = UNP_DF[UNP_DF.uniprot==current_unp].drop(['uniprot'],axis=1)
    if unpdf.empty: # Probably don't need to do this as nulls are filled in at the end
        #No features assigned to this uniprot, need to use holders
        if debug:
            print(debug_head+"No uniprot features found for uniprot {}, using holders".format(current_unp))
        holderdict = {'uniprot_position': list(df.uniprot_position.unique())}
        totalres = len(holderdict['uniprot_position'])
        for h in HEADERS['unp']:
            holderdict[h] = [HOLDERS[h]]*totalres
        unpdf = pd.DataFrame.from_dict(holderdict)
    elif debug:
        print(debug_head+"Uniprot features found for {} residues in {}".format(len(unpdf.index),current_unp))
    ndf = pd.merge(df,unpdf,how="left",on='uniprot_position')
    # residues without a feature row come out of the left merge as nulls
    for h in HEADERS['unp']:
        ndf[h] = npwhere(ndf[h].isnull(),HOLDERS[h],ndf[h])
    return ndf.drop_duplicates()
Ejemplo n.º 4
0
def find_carryover_indexes(high_cup_name, low_cup_name, analysis_cups):
    """
    Finds the indexes of the high and the low carryover samples in one call
    :param high_cup_name: cup type label marking the high carryover sample
    :param low_cup_name: cup type label marking the low carryover samples
    :param analysis_cups: sequence of cup type labels to search
    :return: (clean_high, clean_low), two lists of plain int indexes
    """
    # TODO: could get rid of this function and just use find_cup_indexes twice for both of the carryover samples...
    a_c = nparray(analysis_cups)

    # convert to plain ints, as find_cup_indexes does, so callers do not
    # receive numpy integer scalars from one function and ints from the other
    clean_high = [int(x) for x in npwhere(a_c == high_cup_name)[0]]
    clean_low = [int(x) for x in npwhere(a_c == low_cup_name)[0]]

    return clean_high, clean_low
 def get_start_stop_interval_idxs(timestamps_, new_timestamps_):
     """
     returns the new_timestamps start and stop indexes inside the timestamps interval

     The interval is [timestamps_[0], timestamps_[-1]] (both ends inclusive),
     so timestamps_ is presumably sorted ascending. The result is the index
     of the first and of the last element of new_timestamps_ that falls
     inside that interval, or (None, None) when there is none or when
     timestamps_ is empty.
     """
     # an empty reference interval contains nothing; without this guard
     # timestamps_[0] raised IndexError
     if len(timestamps_) == 0:
         return None, None
     nts = npfromiter(new_timestamps_, npfloat)
     inside = npwhere((nts >= timestamps_[0])
                      & (nts <= timestamps_[-1]))[0]
     if inside.size == 0:
         return None, None
     return inside[0], inside[-1]
Ejemplo n.º 6
0
def get_package_table_entry(pkg_name, db_table):
    """Return the first row of db_table whose "name" equals pkg_name.

    The row is whatever integer indexing of db_table yields (an
    astropy.table.Row for an astropy table). Returns None when no entry
    with that name exists.
    """
    names = db_table["name"]
    if pkg_name not in names:
        return None

    # ::todo implement multiple entry handling
    # several rows may match; only the first one is returned
    matches = npwhere(names == pkg_name)[0]
    return db_table[matches[0]]
Ejemplo n.º 7
0
def find_cup_indexes(specified_cup, analysis_cups):
    """
    Locates every position at which a given cup type occurs, e.g. asking for DRIF gives the indexes of the Drift samples
    :param specified_cup: cup type label to look for
    :param analysis_cups: sequence of cup type labels to search
    :return: list of matching indexes as plain ints
    """
    cup_labels = nparray(analysis_cups)
    matching = npwhere(cup_labels == specified_cup)[0]
    # numpy integer scalars are converted to built-in ints
    return list(map(int, matching))
Ejemplo n.º 8
0
    def attach_descriptors(self, partner):
        '''
        Attach descriptors to partner df
        Requires a partner df
        Does this by merge (on self.res_header) if not a holder
        If holder, attaches using bind (column-wise concat of a table filled
        with the HOLDERS default of every column in self.header)
        returns the merged/binded result; partner itself is returned
        unchanged when self.descriptors is None
        '''
        # print(...) with one argument behaves the same on Python 2 and 3
        if self.debug:
            print(self.debug_head+"Attaching descriptors to df with {} rows".format(len(partner.index)))
        if self.descriptors is None:
            return partner
        if self.id == "Holder":
            # Generate a holder descriptor table
            nrow = len(partner.index)
            desc_dict = dict((d, [HOLDERS[d]]*nrow) for d in self.header)
            # Build the table directly on partner's index. The previous
            # holder_desc.set_index(partner.index) call discarded its result
            # (set_index returns a new frame), so the holder table kept a
            # 0..n-1 index and concat(axis=1) misaligned the rows whenever
            # partner was indexed differently.
            holder_desc = pd.DataFrame(desc_dict, index=partner.index)
            if self.debug:
                print(self.debug_head+"Concatenating holder descriptors")
            return pd.concat([partner,holder_desc[self.header]],axis=1)
        else:
            newdf = partner.merge(self.descriptors[self.header],
                                 how='left',
                                 on=self.res_header)
            # Fill in any residues that were missing descriptors
            for x in self.header:
                # merge keys are never null after a left merge; skip them
                if x in self.res_header: continue
                newdf[x] = npwhere(newdf[x].isnull(),HOLDERS[x],newdf[x])
            return newdf
Ejemplo n.º 9
0
def _metadata_velox(path):  # parameterized by path rather than emd_obj so that hashing lru hashing resolves easily
    """Collect the metadata of the first data group of a Velox EMD file.

    Args:
        path: location of the file, handed to ``emdVelox.fileEMDVelox``.

    Returns:
        dict containing ``veloxFlag`` (True), ``FileName``, the
        ``PhysicalSize{X,Y}``, ``PhysicalSize{X,Y}Origin`` and
        ``PhysicalSize{X,Y}Unit`` entries, every top-level key of the
        file's JSON metadata, and ``shape`` of the first dataset.
    """
    metaData = {}
    metaData['veloxFlag'] = True

    metaData['FileName'] = path

    emd_obj = emdVelox.fileEMDVelox(path)
    dataGroup = emd_obj.list_data[0]
    dataset0 = dataGroup['Data']

    # Convert JSON metadata to dict
    mData = dataGroup['Metadata'][:, 0]
    validMetaDataIndex = npwhere(mData > 0)  # find valid metadata
    # tobytes() replaces the deprecated tostring() alias, which recent NumPy
    # releases no longer provide; the bytes produced are the same
    mData = mData[validMetaDataIndex].tobytes()
    mDataS = json.loads(mData.decode('utf-8', 'ignore'))  # load UTF-8 string as JSON and output dict
    try:
        # Store the X and Y pixel size, offset and unit
        metaData['PhysicalSizeX'] = float(mDataS['BinaryResult']['PixelSize']['width'])
        metaData['PhysicalSizeXOrigin'] = float(mDataS['BinaryResult']['Offset']['x'])
        metaData['PhysicalSizeXUnit'] = mDataS['BinaryResult']['PixelUnitX']
        metaData['PhysicalSizeY'] = float(mDataS['BinaryResult']['PixelSize']['height'])
        metaData['PhysicalSizeYOrigin'] = float(mDataS['BinaryResult']['Offset']['y'])
        metaData['PhysicalSizeYUnit'] = mDataS['BinaryResult']['PixelUnitY']
    except (KeyError, TypeError, ValueError):
        # entry missing, of unexpected structure, or not convertible to
        # float: fall back to unit pixel size with no offset and no unit
        metaData['PhysicalSizeX'] = 1
        metaData['PhysicalSizeXOrigin'] = 0
        metaData['PhysicalSizeXUnit'] = ''
        metaData['PhysicalSizeY'] = 1
        metaData['PhysicalSizeYOrigin'] = 0
        metaData['PhysicalSizeYUnit'] = ''

    # NOTE: applied after the entries above, so a top-level JSON key with
    # the same name overrides them
    metaData.update(mDataS)

    metaData['shape'] = dataset0.shape

    return metaData
Ejemplo n.º 10
0
def legendre_fit_magseries(times,
                           mags,
                           errs,
                           period,
                           legendredeg=10,
                           sigclip=30.0,
                           plotfit=False,
                           magsarefluxes=False,
                           verbose=True):
    '''
    Fit an arbitrary-order Legendre series, via least squares, to the
    magnitude/flux time series. This is a series of the form:

        p(x) = c_0*L_0(x) + c_1*L_1(x) + c_2*L_2(x) + ... + c_n*L_n(x)

    where L_i's are Legendre polynomials (also called "Legendre functions of
    the first kind") and c_i's are the coefficients being fit.

    Args:

    times, mags, errs: the time series and its errors. They are sigma-clipped
    with sigclip_magseries and points with zero error are removed before
    fitting.

    period (float): the period used to phase-fold the series.

    legendredeg (int): n in the above equation. (I.e., if you give n=5, you
    will get 6 coefficients). This number should be much less than the number
    of data points you are fitting.

    sigclip (float): number of standard deviations away from the mean of the
    magnitude time-series from which to "clip" data points.

    plotfit (str or False): if a string, it is passed to _make_fit_plot as the
    plot file and stored under 'fitplotfile'.

    magsarefluxes (bool): sets the ylabel and ylimits of plots for either
    magnitudes (False) or flux units (i.e. normalized to 1, in which case
    magsarefluxes should be set to True).

    verbose (bool): if True, progress is reported through LOGINFO.

    Returns:

    returndict:
    {
        'fittype':'legendre',
        'fitinfo':{
            'legendredeg':legendredeg,
            'fitmags':fitmags,
            'fitepoch':magseriesepoch,
            'finalparams':coeffs
        },
        'fitchisq':fitchisq,
        'fitredchisq':fitredchisq,
        'fitplotfile':None,
        'magseries':{
            'times':ptimes,
            'phase':phase,
            'mags':pmags,
            'errs':perrs,
            'magsarefluxes':magsarefluxes},
    }

    where `fitmags` is the values of the fit function interpolated onto
    magseries' `phase`, and `fitepoch` is the time of the first sample at
    which the fit reaches its light curve minimum.

    This function is mainly just a wrapper to
    numpy.polynomial.legendre.Legendre.fit.

    '''
    stimes, smags, serrs = sigclip_magseries(times,
                                             mags,
                                             errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # get rid of zero errs
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    phase, pmags, perrs, ptimes, mintime = (_get_phased_quantities(
        stimes, smags, serrs, period))

    if verbose:
        LOGINFO('fitting Legendre series with '
                'maximum Legendre polynomial order %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (legendredeg, len(pmags), period, mintime))

    # Least squares fit of Legendre polynomial series to the data. The window
    # and domain (see "Using the Convenience Classes" in the numpy
    # documentation) are handled automatically, scaling the times to a minimal
    # domain in [-1,1], in which Legendre polynomials are a complete basis.

    p = Legendre.fit(phase, pmags, legendredeg)
    coeffs = p.coef
    fitmags = p(phase)

    # Now compute the chisq and red-chisq.

    fitchisq = npsum(((fitmags - pmags) * (fitmags - pmags)) / (perrs * perrs))

    nparams = legendredeg + 1
    fitredchisq = fitchisq / (len(pmags) - nparams - 1)

    if verbose:
        LOGINFO('Legendre fit done. chisq = %.5f, reduced chisq = %.5f' %
                (fitchisq, fitredchisq))

    # figure out the time of light curve minimum (i.e. the fit epoch)
    # this is when the fit mag is maximum (i.e. the faintest)
    # or if magsarefluxes = True, then this is when fit flux is minimum
    if not magsarefluxes:
        fitmagminind = npwhere(fitmags == npmax(fitmags))
    else:
        fitmagminind = npwhere(fitmags == npmin(fitmags))
    # the extremum can be reached at more than one sample; keep only the
    # first one so the epoch is a single time rather than several
    if len(fitmagminind[0]) > 1:
        fitmagminind = (fitmagminind[0][:1],)
    magseriesepoch = ptimes[fitmagminind]

    # assemble the returndict
    returndict = {
        'fittype': 'legendre',
        'fitinfo': {
            'legendredeg': legendredeg,
            'fitmags': fitmags,
            'fitepoch': magseriesepoch,
            'finalparams': coeffs,
        },
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': {
            'times': ptimes,
            'phase': phase,
            'mags': pmags,
            'errs': perrs,
            'magsarefluxes': magsarefluxes
        }
    }

    # make the fit plot if required
    if plotfit and isinstance(plotfit, str):

        _make_fit_plot(phase,
                       pmags,
                       perrs,
                       fitmags,
                       period,
                       mintime,
                       magseriesepoch,
                       plotfit,
                       magsarefluxes=magsarefluxes)

        returndict['fitplotfile'] = plotfit

    return returndict
Ejemplo n.º 11
0
def savgol_fit_magseries(times,
                         mags,
                         errs,
                         period,
                         windowlength=None,
                         polydeg=2,
                         sigclip=30.0,
                         plotfit=False,
                         magsarefluxes=False,
                         verbose=True):
    '''
    Fit a Savitzky-Golay filter to the magnitude/flux time series.
    SG fits successive sub-sets (windows) of adjacent data points with a
    low-order polynomial via least squares. At each point (magnitude),
    it returns the value of the polynomial at that magnitude's time.
    This is made significantly cheaper than *actually* performing least squares
    for each window through linear algebra tricks that are possible when
    specifying the window size and polynomial order beforehand.
    Numerical Recipes Ch 14.8 gives an overview, Eq. 14.8.6 is what Scipy has
    implemented.

    The idea behind Savitzky-Golay is to preserve higher moments (>=2) of the
    input data series than would be done by a simple moving window average.

    Note that the filter assumes evenly spaced data, which magnitude time
    series are not. By *pretending* the data points are evenly spaced, we
    introduce an additional noise source in the function values. This is a
    relatively small noise source provided that the changes in the magnitude
    values across the full width of the N=windowlength point window is <
    sqrt(N/2) times the measurement noise on a single point.

    Args:

    times, mags, errs: the time series and its errors. They are sigma-clipped
    with sigclip_magseries and points with zero error are removed before
    filtering.

    period (float): the period used to phase-fold the series.

    windowlength (int): length of the filter window (the number of
    coefficients). Must be either positive and odd, or None. (The window is
    the number of points to the left, and to the right, of whatever point is
    having a polynomial fit to it locally). Bigger windows at fixed polynomial
    order risk lowering the amplitude of sharp features. If None, this routine
    (arbitrarily) sets the windowlength for phased LCs to be either the number
    of finite data points divided by 300, or polydeg+3, whichever is bigger
    (plus one if that value is even).

    polydeg (int): the order of the polynomial used to fit the samples. Must
    be less than windowlength. "Higher-order filters do better at preserving
    feature heights and widths, but do less smoothing on broader features."
    (NumRec).

    sigclip (float): passed to sigclip_magseries.

    plotfit (str or False): if a string, it is passed to _make_fit_plot as the
    plot file and stored under 'fitplotfile'.

    magsarefluxes (bool): sets the ylabel and ylimits of plots for either
    magnitudes (False) or flux units (i.e. normalized to 1, in which case
    magsarefluxes should be set to True).

    verbose (bool): if True, progress is reported through LOGINFO.

    Returns:

    A dict with 'fittype' ('savgol'), 'fitinfo' (windowlength, polydeg,
    fitmags, fitepoch), 'fitchisq', 'fitredchisq', 'fitplotfile' and
    'magseries'. 'fitredchisq' is always the placeholder -99. because the
    degrees of freedom of the filter are not quantified yet.

    '''
    stimes, smags, serrs = sigclip_magseries(times,
                                             mags,
                                             errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # get rid of zero errs
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    phase, pmags, perrs, ptimes, mintime = (_get_phased_quantities(
        stimes, smags, serrs, period))

    if not isinstance(windowlength, int):
        windowlength = max(polydeg + 3, int(len(phase) / 300))
        # the window must be odd
        if windowlength % 2 == 0:
            windowlength += 1

    if verbose:
        LOGINFO('applying Savitzky-Golay filter with '
                'window length %s and polynomial degree %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (windowlength, polydeg, len(pmags), period, mintime))

    # generate the function values obtained by applying the SG filter. The
    # "wrap" option is best for phase-folded LCs.
    sgf = savgol_filter(pmags, windowlength, polydeg, mode='wrap')

    # here the "fit" to the phases is the function produced by the
    # Savitzky-Golay filter. then compute the chisq and red-chisq.
    fitmags = sgf

    fitchisq = npsum(((fitmags - pmags) * (fitmags - pmags)) / (perrs * perrs))

    # TODO: quantify dof for SG filter.
    # Until then the reduced chisq is reported as a placeholder. The previous
    # estimate from int(len(pmags) / windowlength) * polydeg parameters was
    # computed and then immediately overwritten, so it has been dropped.
    fitredchisq = -99.

    if verbose:
        LOGINFO('SG filter applied. chisq = %.5f, reduced chisq = %.5f' %
                (fitchisq, fitredchisq))

    # figure out the time of light curve minimum (i.e. the fit epoch)
    # this is when the fit mag is maximum (i.e. the faintest)
    # or if magsarefluxes = True, then this is when fit flux is minimum
    if not magsarefluxes:
        fitmagminind = npwhere(fitmags == npmax(fitmags))
    else:
        fitmagminind = npwhere(fitmags == npmin(fitmags))
    # the extremum can be reached at more than one sample; keep only the
    # first one so the epoch is a single time rather than several
    if len(fitmagminind[0]) > 1:
        fitmagminind = (fitmagminind[0][:1],)
    magseriesepoch = ptimes[fitmagminind]

    # assemble the returndict
    returndict = {
        'fittype': 'savgol',
        'fitinfo': {
            'windowlength': windowlength,
            'polydeg': polydeg,
            'fitmags': fitmags,
            'fitepoch': magseriesepoch
        },
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': {
            'times': ptimes,
            'phase': phase,
            'mags': pmags,
            'errs': perrs,
            'magsarefluxes': magsarefluxes
        }
    }

    # make the fit plot if required
    if plotfit and isinstance(plotfit, str):

        _make_fit_plot(phase,
                       pmags,
                       perrs,
                       fitmags,
                       period,
                       mintime,
                       magseriesepoch,
                       plotfit,
                       magsarefluxes=magsarefluxes)

        returndict['fitplotfile'] = plotfit

    return returndict
Ejemplo n.º 12
0
def spline_fit_magseries(times,
                         mags,
                         errs,
                         period,
                         knotfraction=0.01,
                         maxknots=30,
                         sigclip=30.0,
                         plotfit=False,
                         ignoreinitfail=False,
                         magsarefluxes=False,
                         verbose=True):
    '''This fits a univariate cubic spline to the phased light curve.

    This fit may be better than the Fourier fit for sharply variable objects,
    like EBs, so can be used to distinguish them from other types of variables.

    The knot fraction is the number of internal knots to use for the spline,
    expressed as a fraction of the number of observations left after clipping.
    A value of 0.01 (or 1%) of the total number of non-nan observations appears
    to work quite well, without over-fitting. maxknots controls the maximum
    number of knots that will be allowed.

    If errs is None, a constant error of 0.005 is assumed for every point
    (the errors are needed as weights for the spline fit).

    plotfit, if a string, is passed to _make_fit_plot as the plot file and
    stored under 'fitplotfile'.

    ignoreinitfail is accepted but not used by this function.

    magsarefluxes is a boolean value for setting the ylabel and ylimits of
    plots for either magnitudes (False) or flux units (i.e. normalized to 1, in
    which case magsarefluxes should be set to True).

    If verbose is True, the fit result is reported through LOGINFO.

    Returns a dict with 'fittype' ('spline'), 'fitinfo' (nknots, fitmags,
    fitepoch), 'fitchisq', 'fitredchisq', 'fitplotfile' and 'magseries'.
    FIXME: check this equation below to see if it's right.

    reduced_chisq = fit_chisq/(len(pmags) - len(knots) - 1)

    '''

    # this is required to fit the spline correctly
    if errs is None:
        errs = npfull_like(mags, 0.005)

    # sigclip the magnitude time series
    stimes, smags, serrs = sigclip_magseries(times,
                                             mags,
                                             errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)
    # get rid of zero errs
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    # phase the mag series
    phase, pmags, perrs, ptimes, mintime = (_get_phased_quantities(
        stimes, smags, serrs, period))

    # now figure out the number of knots, capped at maxknots
    nobs = len(phase)
    nknots = int(npfloor(knotfraction * nobs))
    nknots = maxknots if nknots > maxknots else nknots
    # knots are spread evenly, kept 0.01 in phase away from both ends
    splineknots = nplinspace(phase[0] + 0.01, phase[-1] - 0.01, num=nknots)

    # generate and fit the spline, weighting each point by 1/err
    spl = LSQUnivariateSpline(phase, pmags, t=splineknots, w=1.0 / perrs)

    # calculate the spline fit to the actual phases, the chisq and red-chisq
    fitmags = spl(phase)

    fitchisq = npsum(((fitmags - pmags) * (fitmags - pmags)) / (perrs * perrs))

    fitredchisq = fitchisq / (len(pmags) - nknots - 1)

    if verbose:
        LOGINFO('spline fit done. nknots = %s,  '
                'chisq = %.5f, reduced chisq = %.5f' %
                (nknots, fitchisq, fitredchisq))

    # figure out the time of light curve minimum (i.e. the fit epoch)
    # this is when the fit mag is maximum (i.e. the faintest)
    # or if magsarefluxes = True, then this is when fit flux is minimum
    if not magsarefluxes:
        fitmagminind = npwhere(fitmags == npmax(fitmags))
    else:
        fitmagminind = npwhere(fitmags == npmin(fitmags))
    # the extremum can be reached at more than one sample; keep only the
    # first one so the epoch is a single time rather than several
    if len(fitmagminind[0]) > 1:
        fitmagminind = (fitmagminind[0][:1],)
    magseriesepoch = ptimes[fitmagminind]

    # assemble the returndict
    returndict = {
        'fittype': 'spline',
        'fitinfo': {
            'nknots': nknots,
            'fitmags': fitmags,
            'fitepoch': magseriesepoch
        },
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': {
            'times': ptimes,
            'phase': phase,
            'mags': pmags,
            'errs': perrs,
            'magsarefluxes': magsarefluxes
        },
    }

    # make the fit plot if required
    if plotfit and isinstance(plotfit, str):

        _make_fit_plot(phase,
                       pmags,
                       perrs,
                       fitmags,
                       period,
                       mintime,
                       magseriesepoch,
                       plotfit,
                       magsarefluxes=magsarefluxes)

        returndict['fitplotfile'] = plotfit

    return returndict
Ejemplo n.º 13
0
def fourier_fit_magseries(times,
                          mags,
                          errs,
                          period,
                          fourierorder=None,
                          fourierparams=None,
                          sigclip=3.0,
                          magsarefluxes=False,
                          plotfit=False,
                          ignoreinitfail=True,
                          verbose=True):
    '''This fits a Fourier series to a magnitude time series.

    Choose the order to suit the data: a high order is good for light curves
    with many thousands of observations (HAT light curves have ~10k
    observations). Lower the order accordingly if you have fewer observations
    in your light curves to avoid over-fitting.

    Set the Fourier order by using either the fourierorder kwarg OR the
    fourierparams kwarg. If fourierorder is None, then fourierparams is a
    list of the form for fourier order = N:

    [fourier_amp1, fourier_amp2, fourier_amp3,...,fourier_ampN,
     fourier_phase1, fourier_phase2, fourier_phase3,...,fourier_phaseN]

    and is used as the initial guess. If only fourierorder is given, the
    initial guess is 0.6 for the first amplitude, 0.2 for the other amplitudes
    and 0.1 for every phase.

    If both/neither are specified, the default Fourier order of 3 will be used
    (with a warning) together with that same initial guess.

    Returns a dict with 'fittype' ('fourier'), 'fitinfo' (fourierorder,
    finalparams, initialfit, leastsqfit, fitmags, fitepoch), 'fitchisq',
    'fitredchisq', 'fitplotfile' and 'magseries'. If either fitting stage
    fails, the dict is still returned, with finalparams, leastsqfit, fitmags
    and fitepoch set to None and both chisq values set to nan. Makes a plot
    for the fit to the mag series if plotfit is a string containing a filename
    to write the plot to.

    This folds the time series using the given period and at the first
    observation. Can optionally sigma-clip observations.

    if ignoreinitfail is True, ignores the initial failure to find a set of
    optimized Fourier parameters and proceeds to do a least-squares fit anyway.

    magsarefluxes is a boolean value for setting the ylabel and ylimits of
    plots for either magnitudes (False) or flux units (i.e. normalized to 1, in
    which case magsarefluxes should be set to True).

    '''

    stimes, smags, serrs = sigclip_magseries(times,
                                             mags,
                                             errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # get rid of zero errs
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    phase, pmags, perrs, ptimes, mintime = (_get_phased_quantities(
        stimes, smags, serrs, period))

    # get the fourier order either from the scalar order kwarg...
    if fourierorder and fourierorder > 0 and not fourierparams:

        fourieramps = [0.6] + [0.2] * (fourierorder - 1)
        fourierphas = [0.1] + [0.1] * (fourierorder - 1)
        fourierparams = fourieramps + fourierphas

    # or from the fully specified coeffs vector
    elif not fourierorder and fourierparams:

        fourierorder = int(len(fourierparams) / 2)

    else:
        LOGWARNING('specified both/neither Fourier order AND Fourier coeffs, '
                   'using default Fourier order of 3')
        fourierorder = 3
        fourieramps = [0.6] + [0.2] * (fourierorder - 1)
        fourierphas = [0.1] + [0.1] * (fourierorder - 1)
        fourierparams = fourieramps + fourierphas

    if verbose:
        LOGINFO('fitting Fourier series of order %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (fourierorder, len(phase), period, mintime))

    # initial minimize call to find global minimum in chi-sq
    initialfit = spminimize(_fourier_chisq,
                            fourierparams,
                            method='BFGS',
                            args=(phase, pmags, perrs))

    # make sure this initial fit succeeds before proceeding
    if initialfit.success or ignoreinitfail:

        if verbose:
            LOGINFO('initial fit done, refining...')

        leastsqparams = initialfit.x

        # best effort: any error raised by the refinement is treated as a
        # failed fit and reported below
        try:
            leastsqfit = spleastsq(_fourier_residual,
                                   leastsqparams,
                                   args=(phase, pmags))
        except Exception:
            leastsqfit = None

        # if the fit succeeded, then we can return the final parameters
        # (the last element of the leastsq result is its status flag)
        if leastsqfit and leastsqfit[-1] in (1, 2, 3, 4):

            finalparams = leastsqfit[0]

            # calculate the chisq and reduced chisq
            fitmags = _fourier_func(finalparams, phase, pmags)

            fitchisq = npsum(
                ((fitmags - pmags) * (fitmags - pmags)) / (perrs * perrs))

            fitredchisq = fitchisq / (len(pmags) - len(finalparams) - 1)

            if verbose:
                LOGINFO('final fit done. chisq = %.5f, reduced chisq = %.5f' %
                        (fitchisq, fitredchisq))

            # figure out the time of light curve minimum (i.e. the fit epoch)
            # this is when the fit mag is maximum (i.e. the faintest)
            # or if magsarefluxes = True, then this is when fit flux is minimum
            if not magsarefluxes:
                fitmagminind = npwhere(fitmags == npmax(fitmags))
            else:
                fitmagminind = npwhere(fitmags == npmin(fitmags))
            # the extremum can be reached at more than one sample; keep only
            # the first one so the epoch is a single time rather than several
            if len(fitmagminind[0]) > 1:
                fitmagminind = (fitmagminind[0][:1],)
            magseriesepoch = ptimes[fitmagminind]

            # assemble the returndict
            returndict = {
                'fittype': 'fourier',
                'fitinfo': {
                    'fourierorder': fourierorder,
                    'finalparams': finalparams,
                    'initialfit': initialfit,
                    'leastsqfit': leastsqfit,
                    'fitmags': fitmags,
                    'fitepoch': magseriesepoch
                },
                'fitchisq': fitchisq,
                'fitredchisq': fitredchisq,
                'fitplotfile': None,
                'magseries': {
                    'times': ptimes,
                    'phase': phase,
                    'mags': pmags,
                    'errs': perrs,
                    'magsarefluxes': magsarefluxes
                },
            }

            # make the fit plot if required
            if plotfit and isinstance(plotfit, str):

                _make_fit_plot(phase,
                               pmags,
                               perrs,
                               fitmags,
                               period,
                               mintime,
                               magseriesepoch,
                               plotfit,
                               magsarefluxes=magsarefluxes)

                returndict['fitplotfile'] = plotfit

            return returndict

        # the leastsq fit did not succeed
        LOGERROR(
            'fourier-fit: least-squared fit to the light curve failed')

    # if the initial fit didn't succeed, we can't proceed
    else:

        LOGERROR('initial Fourier fit did not succeed, '
                 'reason: %s, returning scipy OptimizeResult' %
                 initialfit.message)

    # both failure modes return the same result: no fit values, nan chisq
    return {
        'fittype': 'fourier',
        'fitinfo': {
            'fourierorder': fourierorder,
            'finalparams': None,
            'initialfit': initialfit,
            'leastsqfit': None,
            'fitmags': None,
            'fitepoch': None
        },
        'fitchisq': npnan,
        'fitredchisq': npnan,
        'fitplotfile': None,
        'magseries': {
            'times': ptimes,
            'phase': phase,
            'mags': pmags,
            'errs': perrs,
            'magsarefluxes': magsarefluxes
        }
    }
Ejemplo n.º 14
0
def GetProbTotChild(Hpm, Hpf, Hc, Gc, Readsc, error, recombination_rate, plog = True, Qscoresc = None, GenoConstraint=True):
    """Enumerate the distinct extensions of a child's haplotypes at one SNP, given its extended parents.

    For every admissible transmission of parental homologues, the child
    haplotypes `Hc` are extended by one position (`Hc + P`). Each distinct
    extension is reported once, together with a prior weight (which includes a
    recombination penalty) and the read support returned by `GetProbReads`.
    It is assumed that at least one of the parental genotypes is available at
    this position.

    Parameters:
        Hpm, Hpf: maternal and paternal haplotypes, already extended to the
            current position (their genotype at `GetStop()` is used).
        Hc: child haplotypes before extension.
        Gc: child genotype at the current position. '.' alleles mark a partly
            missing/null genotype, '-' a completely missing one.
        Readsc, Qscoresc: passed through to `GetProbReads`.
        error: sequencing error rate passed to `GetProbReads`. It is inflated
            by 10% (capped at 0.999) when the genotype constraint had to be
            relaxed to find a transmission.
        recombination_rate: per-homologue probability of switching parental
            origin between consecutive positions. It is increased by 0.005
            (capped at 0.499) when the genotype constraint had to be relaxed.
        plog: passed through to `GetProbReads`.
        GenoConstraint: if True, first only consider transmissions whose
            alleles reproduce the child's genotype; if none exists, a second
            attempt is made without the constraint.

    Returns:
        (Uniques, weights, probs): the distinct extended child haplotypes,
        their prior weights normalised to sum to one, and the corresponding
        values of `GetProbReads`. All three lists are empty when the extension
        has to be skipped.

    Note:
        The module-level caches `transmit_parent_m`, `transmit_parent_f`,
        `child_haploid` and `transmit_parent` are filled on first use and then
        reused as they are, i.e. with the ploidies seen on that first call.
    """
    global transmit_parent_m
    global transmit_parent_f
    global child_haploid
    global transmit_parent
    probs = []   # the probability of SR(s) conditional on (Hp, H, eps)
    weights = [] # the prior probability of Hp conditional on (H, eps, recombination_rate)
    Uniques = [] # distinct Hp's
    if GenoConstraint and set(Gc.GetGenes())=={'.'}:
        sys.stderr.write('WARNING: Child\'s genotype is missing at position {1:d}! Its phasing extension will be skipped at s={0:d}!\n'.format(Gc.GetS()+1, Gc.GetPos()))
        return Uniques, weights, probs
    biparental = False
    current_Gm = Hpm.GetGenotype(Hpm.GetStop()) # current maternal alleles (after extension)
    current_Gf = Hpf.GetGenotype(Hpf.GetStop()) # current paternal alleles
    # It is assumed that the first haploid part of the child homologues has
    # maternal descent and the second haploid part paternal descent.
    ploidy_m = len(current_Gm)
    ploidy_f = len(current_Gf)
    ploidy_c = (ploidy_m+ploidy_f)//2
    if set(Gc.GetGenes())=={'.'}: # '-' stands for complete genotype missing, '.' stands for partial missing or null allele
        Gc = Genotype(Gc.GetS(), Gc.GetPos(), *['-' for _x in range(0, ploidy_c)])
    if not transmit_parent_m:
        transmit_parent_m = list(itertools.combinations(range(0, ploidy_m), ploidy_m//2)) # Initialize transmit_parent_m
    if not transmit_parent_f:
        transmit_parent_f = list(itertools.combinations(range(0, ploidy_f), ploidy_f//2)) # Initialize transmit_parent_f
    if not child_haploid:
        child_haploid = list(itertools.combinations(range(0, ploidy_c), ploidy_c//2)) # Initialize child_haploid
    if not transmit_parent:
        transmit_parent = list(itertools.product(transmit_parent_m, transmit_parent_f)) # Initialize transmit_parent

    # First attempt honours GenoConstraint; if that yields no transmission, a
    # second attempt is made with the constraint switched off.
    transmit_parent_G = []
    attempt_num = 1
    OrigG = GenoConstraint
    while not transmit_parent_G and attempt_num<=2:
        if attempt_num>1:
            GenoConstraint = False
        if '-' not in set(current_Gm).union(set(current_Gf)): # if no parental genotype is missing, consider all of the possible IBD potentiae
            biparental = True
            if GenoConstraint: # only allow transmissions that are compatible with the current genotype of the child
                child_allele_counts = Counter(Gc.GetGenes())
                transmit_parent_G = [
                    _x for _x in transmit_parent
                    if Counter([current_Gm[_y] for _y in _x[0]]+[current_Gf[_z] for _z in _x[1]])==child_allele_counts
                ]
            else:
                transmit_parent_G = list(transmit_parent)
        elif attempt_num == 1: # a parental genotype is missing: only tried on the first attempt
            if '-' in set(current_Gm): # impute maternal alleles from the child and the father
                transmit_parent_G = [
                    (tuple('-' for _i in range(0, ploidy_m//2)), _x)
                    for _x in transmit_parent_f
                    if any(Counter([current_Gf[_z] for _z in _x])==Counter([Gc.GetGenes()[_hh] for _hh in _h]) for _h in child_haploid)
                ]
            else:   # impute paternal alleles from the mother and the child
                transmit_parent_G = [
                    (_x, tuple('-' for _i in range(0, ploidy_f//2)))
                    for _x in transmit_parent_m
                    if any(Counter([current_Gm[_y] for _y in _x])==Counter([Gc.GetGenes()[_hh] for _hh in _h]) for _h in child_haploid)
                ]
        else:
            transmit_parent_G = []
        attempt_num+=1
    GenoConstraint = OrigG
    if attempt_num>2: # the constraint had to be relaxed: be more tolerant of errors and recombinations
        error = min(0.999, error*1.1)
        recombination_rate = min(0.499, recombination_rate+0.005)
    if not transmit_parent_G:
        sys.stderr.write("WARNING: No extension of child haplotypes was compatible with the given parental and/or child genotypes assuming Mendelian inheritance! Child extension will be skipped at position {1:d}, s={0:d}!\n".format(Gc.GetS()+1, Gc.GetPos()))
        return Uniques, weights, probs

    for _t in transmit_parent_G:
        if biparental:
            Genotypes_to_pass = [[current_Gm[_y] for _y in _t[0]]+[current_Gf[_z] for _z in _t[1]]] # Genotype to pass will be the same as Gc.GetGenes() if GenoConstraint is True
        elif '-' in set(current_Gm):
            # Try to impute the missing mother's alleles from the child alleles
            # and the transmitted paternal alleles. If that is not possible,
            # this transmission contributes no extension.
            fake_genotype_m = list(Gc.GetGenes())
            try:
                for _paternal_allele in (current_Gf[_z] for _z in _t[1]):
                    fake_genotype_m.remove(_paternal_allele)
                # Same alleles as the child genotype, reordered so that the
                # alleles with maternal descent come first.
                Genotypes_to_pass = [fake_genotype_m + [current_Gf[_z] for _z in _t[1]]]
            except ValueError as e:
                sys.stderr.write("WARNING: "+str(e)+"\n")
                # Always reset, so that a value left over from a previous
                # transmission can never be reused by mistake.
                Genotypes_to_pass = []
        else:
            # Same imputation, for a missing father.
            fake_genotype_f = list(Gc.GetGenes())
            try:
                for _maternal_allele in (current_Gm[_y] for _y in _t[0]):
                    fake_genotype_f.remove(_maternal_allele)
                Genotypes_to_pass = [[current_Gm[_y] for _y in _t[0]] + fake_genotype_f]
            except ValueError as e:
                sys.stderr.write("WARNING: "+str(e)+"\n")
                Genotypes_to_pass = []
        for _Genotype_to_pass in Genotypes_to_pass:
            # A homozygous or missing parent carries no information about
            # which homologue was transmitted, hence none about recombination.
            No_recom_info_m = bool(len(set(current_Gm))==1 or '-' in _t[0])
            No_recom_info_f = bool(len(set(current_Gf))==1 or '-' in _t[1])
            P = Haplotypes(Gc.GetS(), Gc.GetS(), 0, loge(len(set(itertools.permutations(_Genotype_to_pass)))), None if No_recom_info_m else _t[0], None if No_recom_info_f else _t[1], *_Genotype_to_pass)
            Hpc = Hc + P
            # 0 if the current and the preceding allele can be assigned to the
            # same parental homologue, else 1. Ignored when the descent is unknown.
            _recombination_m = [] if (Hc.GetMO() is None or No_recom_info_m) else [0 if _origin == _new else 1 for _origin, _new in zip(Hc.GetMO(), _t[0])]
            _recombination_f = [] if (Hc.GetPO() is None or No_recom_info_f) else [0 if _origin == _new else 1 for _origin, _new in zip(Hc.GetPO(), _t[1])]
            # Penalise every recombination event in the prior weight.
            recombination_weight = array([recombination_rate if _r==1 else (1-recombination_rate) for _r in (_recombination_m+_recombination_f)]).prod()
            if Hpc not in Uniques:
                Uniques.append(Hpc)
                probs.append(GetProbReads(Readsc, Hpc, error, plog, Qscoresc))
                weights.append(_extension_prior_weight(Hpc)*recombination_weight)
            elif _recombination_m or _recombination_f:
                # The same extension was already produced by another
                # transmission: keep the parental origins of the most likely one.
                new_weight = _extension_prior_weight(Hpc)*recombination_weight
                _Hpindx = Uniques.index(Hpc)
                if weights[_Hpindx] < new_weight:
                    weights[_Hpindx] = new_weight
                    if not No_recom_info_m:
                        Uniques[_Hpindx].SetMO(Hpc.GetMO())
                    if not No_recom_info_f:
                        Uniques[_Hpindx].SetPO(Hpc.GetPO())
    wsum = float(sum(weights))
    weights = [_w/wsum for _w in weights]
    return Uniques, weights, probs


def _extension_prior_weight(Hpc):
    """Return the prior weight of the extension Hpc, before the recombination penalty.

    The weight is exp(GetLogProbH(...)) of a two-position haplotype built from
    the last two non-missing alleles of every homologue of `Hpc`. If a
    homologue has fewer than two such alleles (IndexError), the weight is 1.
    """
    _npVset = []
    for _v in Hpc.GetVS():
        _npv = array(_v)
        _npVset.append(npdel(_npv, npwhere(_npv=='-')).tolist()) # drop the missing alleles
    try:
        return exp(GetLogProbH(Haplotypes(1, 2, 1, log(len(set(itertools.permutations(tuple((_v[-2],_v[-1]) for _v in _npVset))))), None, None, *[(_v[-2],_v[-1]) for _v in _npVset])))
    except IndexError:
        return 1
Ejemplo n.º 15
0
def process_vep(vep, debug):
    '''
    Takes the full VEP dataframe and selects a single
    consequence per variant, order of preference:
    1) Select canonical ENST when possible
    2) Select canonical RefSeq when possible
    3) Select ENST with unassigned canonical
    4) Select Refseq with unassigned canonical
    5) Select ENST specifically not canonical
    6) Select Refseq specifically not canonical
    7) Select XM entries left over
    8) Select entries with uniprot assignments only from VEP
    Anything without a Uniprot assignment is filtered
    In the event of a tie, take highest transcript
    ID if AA and pos are the same, otherwise take highest pos
    returns a list of unique variants in the form of
    the typical variant input file

    Side effects: progress is printed to stdout (more of it when `debug`
    is true), the selected rows are also written to "outtmp.tab" in the
    current directory, and the program exits via sys.exit if the merged
    VEP frame is empty.

    Note: In rare cases, the same variant will hit multiple
    primary uniprot entries. In these cases, uniprot has identified
    sufficiently different isoforms to call them different proteins
    Therefore, repeats across different primary uniprot entries may occur.

    A minor (major?) drawback to the current approach:
    If there is a variant which affects 2 different uniprot entries and in one
    entry it hits the canonical transcript, while in the other it hits a
    non-canonical isoform, since the variant is filtered out before getting
    to the non-canonical isoforms, both won't be captured.
    One example is 1:g.156842168C>T which hits INSRR (one transcript) and NTRK1
    isoform 3. Only INSRR is kept since it has 1 transcript and isoform 3 is not
    canonical.

    Another drawback that might need to be addressed:
    As it stands, if it has multiple potentials in the same step,
    it selects one based on max protein position and if still multiple
    max transcript ID. The problem with this is that you can have 2 variants
    assigned different transcripts even though it would probably be easier and better
    if they both had the same one. For example, vars A and B don't hit canonical.
    Var A is position 10, Var B is position 100. There is isoform name ENST1 with
    100 residues and ENST2 with 98 residues. The missing residues are at position
    11-12
    Var A gets ENST2 since it's 10 in both but ENST2>ENST1
    Var B gets ENST1 since it's at 100 in ENST1 and 98 in ENST2
    It's probably preferable to put them both in ENST1.
    However, the current datasets (and uniprot_sprot_human.tab) don't have transcript
    length. So, the best solution (to assign a transcript to a uniprot at a time before
    selecting with var, and resolving overlaps by using longest transcript) can't be done
    without pulling in another dataset. I'm not sure how often this happens and if it
    warrants the extra time required for this better solution.
    One potential solution that would also solve another drawback is to read in the sequence
    sets to get the transcript lengths. This would allow cases (no idea how common or rare)
    where the variant hits a transcript in the fasta set but is assigned a different transcript
    that is not in the sequences set but is, for example, canonical according to uniprot. In this
    case, a transcript present in the sequence set could be preferred.
    '''
    debug_head = "DEBUG: IO: process_vep: "

    def pick_best(group):
        # One row per group: highest protein position first, ties broken by
        # the highest transcript ID.
        return group.apply(lambda x: x.sort_values(
            ["quickpos", "Transcript"], ascending=False).head(1))

    def drop_selected(frame, selected):
        # Remove from `frame` every variant that already has a selected row.
        # Nothing has been selected yet while `selected` is None.
        if selected is None:
            return frame
        return frame[~frame.Varcode.isin(selected.Varcode)]

    def append_selected(selected, new_rows):
        # Add newly selected rows to the running selection.
        if selected is None:
            return new_rows
        return pd.concat([selected, new_rows])

    def first_position(frame):
        # Integer of the first run of digits in Protein_position, used to
        # find the max position instance for repeats.
        return frame.Protein_position.astype(str).str.extract(
            r'(\d+)', expand=False).astype(int)

    if debug:
        print(debug_head + "processing raw df with {} rows".format(
            len(vep.index)))
    # canonical identifies the canonical transcript used by uniprot
    canonical = load_canonical(debug)
    # sec2prime allows filtering out any secondary uniprot ACs
    sec2prime = load_sec2prime(debug)

    # Attach the uniprot names
    if debug:
        print(debug_head + "merging uniprot names to transcripts")
    vep = vep.merge(canonical, how="left", on=["Transcript", "Protein"])

    if len(vep.index) == 0:
        sys.exit(ParseException(
            "VEP file", "Failed to extract variants from VEP file").fullmsg)

    # Filter out anything that doesn't have a uniprot name
    # Keep those without a uniprot name for use in case they have one
    # assigned by VEP
    if debug:
        print(debug_head + "extracting null uniprots into separate df")
    # .copy() so that adding the quickpos column below works on an
    # independent frame rather than on a slice of the merged one
    vep_nullunp = vep[vep.Uniprot.isnull()].copy()
    vep = vep[vep.Uniprot.notnull()]
    if debug:
        print(debug_head + "result withunp {} rows; nullunp df {} rows".format(
            len(vep.index), len(vep_nullunp.index)))

    # Filter any secondary uniprot AC's
    if debug:
        print(debug_head + "Removing secondary uniprots")
    secondary = [x[0] for x in sec2prime]
    vep = vep[~vep.Uniprot.isin(secondary)].copy()

    if debug:
        print(debug_head + "Result df {} rows".format(len(vep.index)))

    # Now select 1 consequence per variant
    # Note: There may be multiple consequences if they map to
    # different uniprots (different proteins, same gene)
    vep_final = None

    # Need to add a quickposition column that is the int of the first
    # position for finding max position instance for repeats
    if debug:
        print(debug_head + "Adding position column")
    vep["quickpos"] = first_position(vep)
    vep_nullunp["quickpos"] = first_position(vep_nullunp)

    print("selecting unique variants")
    # Loops through steps 1-6:
    # Canonical = YES: ENST then NM then XM
    # Canonical = Unassigned: ENST then NM then XM
    # Canonical = NO: ENST then NM then XM

    # Typically, when a transcript is unassigned it means it is the only
    # one and therefore no isoform is designated. However, there is an edge
    # case where it is unassigned even though there are assignments
    # So, need to deal with these after and remove them for now
    if debug:
        print(debug_head + "Separating unassigned transcripts")
    unassigned_mask = ((vep.Canonical == "Unassigned")
                       & (pd.to_numeric(vep.nIsoforms) > 0))
    vep_unassigned = vep[unassigned_mask]
    vep = vep[~unassigned_mask]
    if debug:
        print(debug_head + "Result: {} rows assigned; {} rows unassigned".format(
            len(vep.index), len(vep_unassigned.index)))
    for outer in ["YES", "Unassigned", "NO"]:
        for inner in ["ENST", "NM", "XM"]:
            if len(vep.index) == 0:
                break
            candidates = vep[(vep.Canonical == outer)
                             & (vep.Transcript.str.startswith(inner))]
            if len(candidates.index) == 0:
                continue
            print("{}: {}".format(outer, inner))
            group = candidates.groupby(["Varcode", "Uniprot"])
            # append to final set and filter from initial
            vep_final = append_selected(vep_final, pick_best(group))
            vep = drop_selected(vep, vep_final)
            if debug:
                print(debug_head + "Selected {} rows; remaining {} rows".format(
                    len(vep_final.index), len(vep.index)))
    # Deal with any of potential cases where an unassigned transcript is only hit
    # in a uniprot with multiple isoforms
    vep_unassigned = drop_selected(vep_unassigned, vep_final)
    if debug:
        print(debug_head + "processing remaining {} unassigned".format(
            len(vep_unassigned.index)))
    for inner in ["ENST", "NM", "XM"]:
        if len(vep_unassigned.index) == 0:
            break
        group = vep_unassigned[vep_unassigned.Transcript.str.startswith(inner)]\
            .groupby(["Varcode", "Uniprot"])
        vep_final = append_selected(vep_final, pick_best(group))
        vep_unassigned = drop_selected(vep_unassigned, vep_final)
        if debug:
            print(debug_head + "Selected {} rows; remaining unassigned {} rows".format(
                len(vep_final.index), len(vep_unassigned.index)))

    # Step 8: select any remaining variants that could not
    # be assigned a uniprot entry based on the read in uniprot data.
    # Only consider instances where VEP provided a uniprot entry.
    # In some cases, the VEP uniprot may not match the current uniprot
    # which is why only those in the sequence datasets provided to this
    # program were considered first and anything left over the uniprot is
    # taken from VEP
    vep_nullunp = drop_selected(vep_nullunp, vep_final)
    if debug:
        print(debug_head + "Processing {} rows without uniprot".format(
            len(vep_nullunp.index)))
    for outer in ["SWISSPROT", "TREMBL"]:
        for inner in ["ENST", "NM", "XM"]:
            if len(vep_nullunp.index) == 0:
                break
            print("{}: {}".format(outer, inner))
            group = vep_nullunp[(vep_nullunp[outer] != "-")
                                & (vep_nullunp.Transcript.str.startswith(inner))]\
                .groupby(["Varcode", outer])
            vep_final = append_selected(vep_final, pick_best(group))
            # rows taken in this step have no mapped uniprot: use VEP's entry
            vep_final["Uniprot"] = npwhere(vep_final.Uniprot.isnull(),
                                           vep_final[outer], vep_final.Uniprot)
            vep_final["Isoform"] = npwhere(vep_final.Isoform.isnull(),
                                           vep_final[outer], vep_final.Isoform)
            vep_nullunp = drop_selected(vep_nullunp, vep_final)
            if debug:
                print(debug_head + "Selected {} rows: remaining nounp {} rows".format(
                    len(vep_final.index), len(vep_nullunp.index)))
    if vep_final is None:
        # No step selected anything: report an empty selection that has the
        # same columns instead of failing on None below.
        vep_final = vep.iloc[0:0].copy()
    # Finally, replace all "Unassigned" isoforms with the uniprot entry
    vep_final["Isoform"] = npwhere(vep_final.Isoform == "Unassigned",
                                   vep_final.Uniprot, vep_final.Isoform)
    print("{} unique variants with uniprot assignment selected".format(
        vep_final.Varcode.nunique()))
    # report any variants that have multiple assignments
    if vep_final.Varcode.nunique() < vep_final.shape[0]:
        print("The following variant codes have multiple uniprot assignments and will be treated as individual variants for all pairs:")
        dups = vep_final[vep_final.duplicated(subset='Varcode', keep=False)]
        print(vep_final[vep_final.Varcode.isin(
            dups.Varcode)][['Varcode', 'Uniprot']].to_string(index=False))
    vep = pd.concat([vep, vep_nullunp, vep_unassigned])
    if len(vep.index) > 0:
        print("The following {} unique coding variants were unable to be assigned:".format(
            vep.Varcode.nunique()))
        print(",".join(set(vep.Varcode)))
    vep_final.to_csv("outtmp.tab", sep="\t", header=True, index=False)
    return vep_final.drop_duplicates()
Ejemplo n.º 16
0
def spline_fit_magseries(times, mags, errs, period,
                         knotfraction=0.01,
                         maxknots=30,
                         sigclip=30.0,
                         plotfit=False,
                         ignoreinitfail=False,
                         magsarefluxes=False,
                         verbose=True):

    '''This fits a univariate cubic spline to the phased light curve.

    This fit may be better than the Fourier fit for sharply variable objects,
    like EBs, so can be used to distinguish them from other types of variables.

    Parameters
    ----------

    times,mags,errs : np.array
        The input mag/flux time-series to fit a spline to. If `errs` is None,
        a constant error of 0.005 is assumed for every measurement.

    period : float
        The period to use for the spline fit.

    knotfraction : float
        The knot fraction is the number of internal knots to use for the
        spline. A value of 0.01 (or 1%) of the total number of non-nan
        observations appears to work quite well, without over-fitting. maxknots
        controls the maximum number of knots that will be allowed.

    maxknots : int
        The maximum number of knots that will be used even if `knotfraction`
        gives a value to use larger than `maxknots`. This helps dealing with
        over-fitting to short time-scale variations.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    magsarefluxes : bool
        If True, will treat the input values of `mags` as fluxes for purposes of
        plotting the fit and sig-clipping.

    plotfit : str or False
        If this is a string, this function will make a plot for the fit to the
        mag/flux time-series and writes the plot to the path specified here.

    ignoreinitfail : bool
        Not used by the spline fit; it is accepted so that callers can pass
        the same keyword arguments they pass to the other fit functions.

    verbose : bool
        If True, will indicate progress and warn of any problems.

    Returns
    -------

    dict
        This function returns a dict containing the model fit parameters, the
        minimized chi-sq value and the reduced chi-sq value. The form of this
        dict is mostly standardized across all functions in this module::

            {
                'fittype':'spline',
                'fitinfo':{
                    'nknots': the number of knots used for the fit
                    'fitmags': the model fit mags,
                    'fitepoch': the epoch of minimum light for the fit,
                },
                'fitchisq': the minimized value of the fit's chi-sq,
                'fitredchisq':the reduced chi-sq value,
                'fitplotfile': the output fit plot if plotfit is a str,
                'magseries':{
                    'times':input times in phase order of the model,
                    'phase':the phases of the model mags,
                    'mags':input mags/fluxes in the phase order of the model,
                    'errs':errs in the phase order of the model,
                    'magsarefluxes':input value of magsarefluxes kwarg
                }
            }

    '''

    # this is required to fit the spline correctly
    if errs is None:
        errs = npfull_like(mags, 0.005)

    # sigclip the magnitude time series
    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)
    # get rid of zero errs, since the fit is weighted by 1/err
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    # phase the mag series
    phase, pmags, perrs, ptimes, mintime = (
        get_phased_quantities(stimes, smags, serrs, period)
    )

    # now figure out the number of knots, up to maxknots
    nobs = len(phase)
    nknots = min(int(npfloor(knotfraction*nobs)), maxknots)
    splineknots = nplinspace(phase[0] + 0.01,
                             phase[-1] - 0.01,
                             num=nknots)

    # NOTE: newer scipy needs x to be strictly increasing. this means we should
    # filter out anything that doesn't have np.diff(phase) > 0.0
    # FIXME: this needs to be tested
    phase_diffs_ind = npdiff(phase) > 0.0
    incphase_ind = npconcatenate((nparray([True]), phase_diffs_ind))
    # ptimes must be filtered with the same mask: it is indexed below with a
    # position found in fitmags and is returned alongside phase/mags/errs
    phase, pmags, perrs, ptimes = (phase[incphase_ind],
                                   pmags[incphase_ind],
                                   perrs[incphase_ind],
                                   ptimes[incphase_ind])

    # generate and fit the spline
    spl = LSQUnivariateSpline(phase, pmags, t=splineknots, w=1.0/perrs)

    # calculate the spline fit to the actual phases, the chisq and red-chisq
    fitmags = spl(phase)

    residuals = fitmags - pmags
    fitchisq = npsum((residuals*residuals) / (perrs*perrs))

    fitredchisq = fitchisq/(len(pmags) - nknots - 1)

    if verbose:
        LOGINFO(
            'spline fit done. nknots = %s,  '
            'chisq = %.5f, reduced chisq = %.5f' %
            (nknots, fitchisq, fitredchisq)
        )

    # figure out the time of light curve minimum (i.e. the fit epoch)
    # this is when the fit mag is maximum (i.e. the faintest)
    # or if magsarefluxes = True, then this is when fit flux is minimum
    if not magsarefluxes:
        fitmagminind = npwhere(fitmags == npmax(fitmags))
    else:
        fitmagminind = npwhere(fitmags == npmin(fitmags))
    # if the extremum is reached more than once, use its first occurrence
    if len(fitmagminind[0]) > 1:
        fitmagminind = (fitmagminind[0][0],)
    magseriesepoch = ptimes[fitmagminind]

    # assemble the returndict
    returndict = {
        'fittype':'spline',
        'fitinfo':{
            'nknots':nknots,
            'fitmags':fitmags,
            'fitepoch':magseriesepoch
        },
        'fitchisq':fitchisq,
        'fitredchisq':fitredchisq,
        'fitplotfile':None,
        'magseries':{
            'times':ptimes,
            'phase':phase,
            'mags':pmags,
            'errs':perrs,
            'magsarefluxes':magsarefluxes
        },
    }

    # make the fit plot if required
    if plotfit and isinstance(plotfit, str):

        make_fit_plot(phase, pmags, perrs, fitmags,
                      period, mintime, magseriesepoch,
                      plotfit,
                      magsarefluxes=magsarefluxes)

        returndict['fitplotfile'] = plotfit

    return returndict
Ejemplo n.º 17
0
def legendre_fit_magseries(times, mags, errs, period,
                           legendredeg=10,
                           sigclip=30.0,
                           plotfit=False,
                           magsarefluxes=False,
                           verbose=True):

    '''Least-squares fit of a Legendre series to a phased mag/flux time series.

    The model is a series of the form::

        p(x) = c_0*L_0(x) + c_1*L_1(x) + c_2*L_2(x) + ... + c_n*L_n(x)

    where the L_i are Legendre polynomials ("Legendre functions of the first
    kind") and the c_i are the coefficients being fit. The fit itself is
    delegated to `numpy.polynomial.legendre.Legendre.fit`, evaluated on the
    phases of the light curve folded at `period`.

    Note that the fit is unweighted: `errs` only enter through the zero-error
    filter and the chi-sq values computed after the fit.

    Parameters
    ----------

    times,mags,errs : np.array
        The input mag/flux time-series to fit a Legendre series polynomial to.

    period : float
        The period used to phase-fold the time-series before fitting.

    legendredeg : int
        This is `n` in the equation above, e.g. `legendredeg=5` yields 6
        coefficients. It should be much smaller than the number of data
        points being fit.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    plotfit : str or False
        If this is a string, a plot of the fit is made and written to the path
        given here.

    magsarefluxes : bool
        If True, the values in `mags` are treated as fluxes: this is forwarded
        to the sigma-clipping and plotting helpers, and the fit epoch is taken
        at the minimum (instead of the maximum) of the model.

    verbose : bool
        If True, progress messages are logged.

    Returns
    -------

    dict
        A dict with the fit results, in the form shared by the fit functions
        of this module::

            {
                'fittype':'legendre',
                'fitinfo':{
                    'legendredeg': the Legendre polynomial degree used,
                    'fitmags': the model fit mags,
                    'fitepoch': the epoch of minimum light for the fit,
                    'finalparams': the fitted Legendre coefficients,
                },
                'fitchisq': the chi-sq value of the fit,
                'fitredchisq': the reduced chi-sq value,
                'fitplotfile': the path of the fit plot, or None if not made,
                'magseries':{
                    'times':input times in phase order of the model,
                    'phase':the phases of the model mags,
                    'mags':input mags/fluxes in the phase order of the model,
                    'errs':errs in the phase order of the model,
                    'magsarefluxes':input value of magsarefluxes kwarg
                }
            }

    '''
    clipped_times, clipped_mags, clipped_errs = sigclip_magseries(
        times, mags, errs,
        sigclip=sigclip,
        magsarefluxes=magsarefluxes
    )

    # points with an error of exactly zero are dropped, since the chi-sq below
    # divides by the squared errors
    keep = npnonzero(clipped_errs)
    clipped_times = clipped_times[keep]
    clipped_mags = clipped_mags[keep]
    clipped_errs = clipped_errs[keep]

    phase, pmags, perrs, ptimes, mintime = get_phased_quantities(
        clipped_times, clipped_mags, clipped_errs, period
    )

    if verbose:
        LOGINFO(
            'fitting Legendre series with '
            'maximum Legendre polynomial order %s to '
            'mag series with %s observations, '
            'using period %.6f, folded at %.6f'
            % (legendredeg, len(pmags), period, mintime)
        )

    # Legendre.fit handles the window and domain itself (see "Using the
    # Convenience Classes" in the numpy docs), mapping the phases onto
    # [-1, 1], where the Legendre polynomials form a complete basis.
    series = Legendre.fit(phase, pmags, legendredeg)
    model_mags = series(phase)
    coeffs = series.coef

    # chi-sq and reduced chi-sq of the model against the phased data
    resid = model_mags - pmags
    chisq = npsum((resid*resid) / (perrs*perrs))

    # a degree-n series has n + 1 coefficients
    dof = len(pmags) - (legendredeg + 1) - 1
    redchisq = chisq/dof

    if verbose:
        LOGINFO(
            'Legendre fit done. chisq = %.5f, reduced chisq = %.5f' %
            (chisq, redchisq)
        )

    # the fit epoch is the time of light curve minimum: the largest model
    # magnitude (faintest), or the smallest model flux if magsarefluxes = True
    faintest_value = npmin(model_mags) if magsarefluxes else npmax(model_mags)
    faintest_ind = npwhere(model_mags == faintest_value)
    if len(faintest_ind[0]) > 1:
        # several equally faint points: use the first one
        faintest_ind = (faintest_ind[0][0],)
    epoch = ptimes[faintest_ind]

    # make the fit plot if a path was given
    plotfile = None
    if plotfit and isinstance(plotfit, str):

        make_fit_plot(phase, pmags, perrs, model_mags,
                      period, mintime, epoch,
                      plotfit,
                      magsarefluxes=magsarefluxes)
        plotfile = plotfit

    return {
        'fittype':'legendre',
        'fitinfo':{
            'legendredeg':legendredeg,
            'fitmags':model_mags,
            'fitepoch':epoch,
            'finalparams':coeffs,
        },
        'fitchisq':chisq,
        'fitredchisq':redchisq,
        'fitplotfile':plotfile,
        'magseries':{
            'times':ptimes,
            'phase':phase,
            'mags':pmags,
            'errs':perrs,
            'magsarefluxes':magsarefluxes
        }
    }
Ejemplo n.º 18
0
def savgol_fit_magseries(times, mags, errs, period,
                         windowlength=None,
                         polydeg=2,
                         sigclip=30.0,
                         plotfit=False,
                         magsarefluxes=False,
                         verbose=True):

    '''Fit a Savitzky-Golay filter to the magnitude/flux time series.

    SG fits successive sub-sets (windows) of adjacent data points with a
    low-order polynomial via least squares. At each point (magnitude), it
    returns the value of the polynomial at that magnitude's time.  This is made
    significantly cheaper than *actually* performing least squares for each
    window through linear algebra tricks that are possible when specifying the
    window size and polynomial order beforehand.  Numerical Recipes Ch 14.8
    gives an overview, Eq. 14.8.6 is what Scipy has implemented.

    The idea behind Savitzky-Golay is to preserve higher moments (>=2) of the
    input data series than would be done by a simple moving window average.

    Note that the filter assumes evenly spaced data, which magnitude time series
    are not. By *pretending* the data points are evenly spaced, we introduce an
    additional noise source in the function values. This is a relatively small
    noise source provided that the changes in the magnitude values across the
    full width of the N=windowlength point window is < sqrt(N/2) times the
    measurement noise on a single point.

    TODO:
    - Find correct dof for reduced chi squared in savgol_fit_magseries

    Parameters
    ----------

    times,mags,errs : np.array
        The input mag/flux time-series to fit the Savitzky-Golay model to.

    period : float
        The period to use for the model fit.

    windowlength : None or int
        The length of the filter window (the number of coefficients). Must be
        either positive and odd, or None. Any integer type (including numpy
        integers) is used as given. Bigger windows at fixed polynomial order
        risk lowering the amplitude of sharp features. If this is None (or
        anything that is not an integer), this routine (arbitrarily) sets the
        `windowlength` to the bigger of `polydeg + 3` and the number of phased
        data points divided by 300, bumped up by one if that value is even.

    polydeg : int
        This is the order of the polynomial used to fit the samples.  Must be
        less than `windowlength`. "Higher-order filters do better at preserving
        feature heights and widths, but do less smoothing on broader features."
        (Numerical Recipes).

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    plotfit : str or False
        If this is a string, this function will make a plot for the fit to the
        mag/flux time-series and writes the plot to the path specified here.

    magsarefluxes : bool
        If True, will treat the input values of `mags` as fluxes for purposes of
        plotting the fit, sig-clipping, and locating the epoch of minimum
        light.

    verbose : bool
        If True, will indicate progress.

    Returns
    -------

    dict
        This function returns a dict containing the model fit parameters and
        the chi-sq value. The form of this dict is mostly standardized across
        all functions in this module::

            {
                'fittype':'savgol',
                'fitinfo':{
                    'windowlength': the window length used for the fit,
                    'polydeg':the polynomial degree used for the fit,
                    'fitmags': the model fit mags,
                    'fitepoch': the epoch of minimum light for the fit,
                },
                'fitchisq': the value of the fit's chi-sq,
                'fitredchisq': always -99.0 (placeholder, see the TODO above),
                'fitplotfile': the path of the fit plot, or None if not made,
                'magseries':{
                    'times':input times in phase order of the model,
                    'phase':the phases of the model mags,
                    'mags':input mags/fluxes in the phase order of the model,
                    'errs':errs in the phase order of the model,
                    'magsarefluxes':input value of magsarefluxes kwarg
                }
            }

    '''
    # local import: numpy integer scalars are registered as numbers.Integral,
    # so this recognizes e.g. np.int64 window lengths as well as plain ints
    from numbers import Integral

    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # get rid of zero errs
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    phase, pmags, perrs, ptimes, mintime = (
        get_phased_quantities(stimes, smags, serrs, period)
    )

    # pick a default window length unless the caller provided an integer one
    if not isinstance(windowlength, Integral):
        windowlength = max(
            polydeg + 3,
            int(len(phase)/300)
        )
        # the filter window must contain an odd number of points
        if windowlength % 2 == 0:
            windowlength += 1

    if verbose:
        LOGINFO('applying Savitzky-Golay filter with '
                'window length %s and polynomial degree %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' % (windowlength,
                                                       polydeg,
                                                       len(pmags),
                                                       period,
                                                       mintime))

    # here the "fit" to the phases is the function produced by the
    # Savitzky-Golay filter. The "wrap" option is best for phase-folded LCs.
    fitmags = savgol_filter(pmags, windowlength, polydeg, mode='wrap')

    fitchisq = npsum(
        ((fitmags - pmags)*(fitmags - pmags)) / (perrs*perrs)
    )

    # TODO: quantify dof for SG filter. Until then, the reduced chi-sq is
    # reported as a -99.0 sentinel rather than a value based on a guessed dof.
    fitredchisq = -99.

    if verbose:
        LOGINFO(
            'SG filter applied. chisq = %.5f, reduced chisq = %.5f' %
            (fitchisq, fitredchisq)
        )

    # figure out the time of light curve minimum (i.e. the fit epoch)
    # this is when the fit mag is maximum (i.e. the faintest)
    # or if magsarefluxes = True, then this is when fit flux is minimum
    if not magsarefluxes:
        fitmagminind = npwhere(fitmags == npmax(fitmags))
    else:
        fitmagminind = npwhere(fitmags == npmin(fitmags))
    if len(fitmagminind[0]) > 1:
        # several equally faint points: use the first one
        fitmagminind = (fitmagminind[0][0],)
    magseriesepoch = ptimes[fitmagminind]

    # assemble the returndict
    returndict = {
        'fittype':'savgol',
        'fitinfo':{
            'windowlength':windowlength,
            'polydeg':polydeg,
            'fitmags':fitmags,
            'fitepoch':magseriesepoch
        },
        'fitchisq':fitchisq,
        'fitredchisq':fitredchisq,
        'fitplotfile':None,
        'magseries':{
            'times':ptimes,
            'phase':phase,
            'mags':pmags,
            'errs':perrs,
            'magsarefluxes':magsarefluxes
        }
    }

    # make the fit plot if required
    if plotfit and isinstance(plotfit, str):

        make_fit_plot(phase, pmags, perrs, fitmags,
                      period, mintime, magseriesepoch,
                      plotfit,
                      magsarefluxes=magsarefluxes)

        returndict['fitplotfile'] = plotfit

    return returndict
Ejemplo n.º 19
0
def _fourier_fit_failure_dict(fourierorder, initialfit,
                              ptimes, phase, pmags, perrs, magsarefluxes):
    '''Build the result dict `fourier_fit_magseries` returns for a failed fit.

    All fit products are None and both chi-sq values are NaN; the phased input
    series and the result of the initial minimization are passed through.

    '''
    return {
        'fittype': 'fourier',
        'fitinfo': {
            'fourierorder': fourierorder,
            'finalparams': None,
            'finalparamerrs': None,
            'initialfit': initialfit,
            'fitmags': None,
            'fitperiod': None,
            'fitepoch': None,
            'actual_fitepoch': None,
        },
        'fitchisq': npnan,
        'fitredchisq': npnan,
        'fitplotfile': None,
        'magseries': {
            'times': ptimes,
            'phase': phase,
            'mags': pmags,
            'errs': perrs,
            'magsarefluxes': magsarefluxes
        }
    }


def fourier_fit_magseries(
    times,
    mags,
    errs,
    period,
    fourierorder=None,
    fourierparams=None,
    fix_period=True,
    scale_errs_redchisq_unity=True,
    sigclip=3.0,
    magsarefluxes=False,
    plotfit=False,
    ignoreinitfail=True,
    verbose=True,
    curve_fit_kwargs=None,
):
    '''This fits a Fourier series to a mag/flux time series.

    The fit is done in two stages: an initial minimization of the chi-sq of
    the phased light curve with `scipy.optimize.minimize`, followed by a
    refinement of those parameters (plus the period) against the unphased
    light curve with `scipy.optimize.curve_fit`.

    Parameters
    ----------

    times,mags,errs : np.array
        The input mag/flux time-series to fit a Fourier cosine series to.

    period : float
        The period to use for the Fourier fit.

    fourierorder : None or int
        If this is an int, will be interpreted as the Fourier order of the
        series to fit to the input mag/flux times-series. If this is None and
        `fourierparams` is specified, `fourierparams` will be used directly to
        generate the fit Fourier series. If `fourierparams` is also None, this
        function will try to fit a Fourier cosine series of order 3 to the
        mag/flux time-series.

    fourierparams : sequence of floats (list or np.array) or None
        If this is specified, it must be of the form below::

            [fourier_amp1, fourier_amp2, fourier_amp3,...,fourier_ampN,
             fourier_phase1, fourier_phase2, fourier_phase3,...,fourier_phaseN]

        to specify a Fourier cosine series of order N. If this is None (or
        empty) and `fourierorder` is specified, the Fourier order specified
        there will be used to construct the Fourier cosine series used to fit
        the input mag/flux time-series. If both are given, or neither is, a
        warning is logged and a Fourier cosine series of order 3 is fit.

    fix_period : bool
        If True, the period is held fixed while refining the fit: it is only
        allowed to vary within +/- 1.0e-7 of `period`. If False, the period is
        a free parameter bounded below by 0.

    scale_errs_redchisq_unity : bool
        If True, the standard errors on the fit parameters will be scaled to
        make the reduced chi-sq = 1.0. This sets the ``absolute_sigma`` kwarg
        for the ``scipy.optimize.curve_fit`` function to False.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    magsarefluxes : bool
        If True, will treat the input values of `mags` as fluxes for purposes of
        plotting the fit, sig-clipping, and locating the time of minimum
        light.

    plotfit : str or False
        If this is a string, this function will make a plot for the fit to the
        mag/flux time-series and writes the plot to the path specified here.

    ignoreinitfail : bool
        If this is True, ignores a failure of the initial minimization to
        converge and proceeds to do the least-squares refinement anyway.

    verbose : bool
        If True, will indicate progress. Warnings and errors are logged
        regardless of this setting.

    curve_fit_kwargs : dict or None
        If not None, this should be a dict containing extra kwargs to pass to
        the scipy.optimize.curve_fit function.

    Returns
    -------

    dict
        This function returns a dict containing the model fit parameters, the
        minimized chi-sq value and the reduced chi-sq value. The form of this
        dict is mostly standardized across all functions in this module::

            {
                'fittype':'fourier',
                'fitinfo':{
                    'fourierorder': the Fourier order used,
                    'finalparams': the final amplitude and phase params,
                    'finalparamerrs': errs for the period and each fit param,
                    'initialfit': the result of the initial minimization,
                    'fitmags': the model fit mags,
                    'fitperiod': the fit period,
                    'fitepoch': the epoch at which the LC was folded,
                    'actual_fitepoch': time of minimum light from fit model
                },
                'fitchisq': the minimized value of the fit's chi-sq,
                'fitredchisq':the reduced chi-sq value,
                'fitplotfile': the path of the fit plot, or None if not made,
                'magseries':{
                    'times':input times in phase order of the model,
                    'phase':the phases of the model mags,
                    'mags':input mags/fluxes in the phase order of the model,
                    'errs':errs in the phase order of the model,
                    'magsarefluxes':input value of magsarefluxes kwarg
                }
            }

        If either fit stage fails, a dict of the same form is returned with
        all 'fitinfo' fit products set to None and both chi-sq values set to
        NaN.

        NOTE: the returned value of 'fitepoch' in the 'fitinfo' dict is the
        epoch at which the LC is folded for the fit procedure, not the time of
        minimum light. To get the time of minimum light of the fit Fourier
        model, use the key 'actual_fitepoch' in the 'fitinfo' dict.

    '''

    stimes, smags, serrs = sigclip_magseries(times,
                                             mags,
                                             errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # get rid of zero errs
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    phase, pmags, perrs, ptimes, mintime = (get_phased_quantities(
        stimes, smags, serrs, period))

    # explicit None/length check instead of truthiness, so that numpy arrays
    # (e.g. the 'finalparams' of an earlier fit) are accepted as well as lists
    have_params = fourierparams is not None and len(fourierparams) > 0

    # get the fourier order either from the scalar order kwarg...
    if fourierorder and fourierorder > 0 and not have_params:

        fourieramps = [0.6] + [0.2] * (fourierorder - 1)
        fourierphas = [0.1] + [0.1] * (fourierorder - 1)
        fourierparams = fourieramps + fourierphas

    # or from the fully specified coeffs vector
    elif not fourierorder and have_params:

        fourierorder = int(len(fourierparams) / 2)

    else:
        LOGWARNING('specified both/neither Fourier order AND Fourier coeffs, '
                   'using default Fourier order of 3')
        fourierorder = 3
        fourieramps = [0.6] + [0.2] * (fourierorder - 1)
        fourierphas = [0.1] + [0.1] * (fourierorder - 1)
        fourierparams = fourieramps + fourierphas

    if verbose:
        LOGINFO('fitting Fourier series of order %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (fourierorder, len(phase), period, mintime))

    # initial minimize call to find global minimum in chi-sq
    initialfit = spminimize(_fourier_chisq,
                            fourierparams,
                            args=(phase, pmags, perrs))

    # if the initial fit didn't succeed (and we're not told to ignore that),
    # we can't proceed
    if not (initialfit.success or ignoreinitfail):

        LOGERROR('initial Fourier fit did not succeed, '
                 'reason: %s, returning scipy OptimizeResult' %
                 initialfit.message)

        return _fourier_fit_failure_dict(fourierorder, initialfit,
                                         ptimes, phase, pmags, perrs,
                                         magsarefluxes)

    if verbose:
        LOGINFO('initial fit done, refining...')

    leastsqparams = initialfit.x

    # the whole setup is kept inside the try block so that any failure while
    # preparing or running curve_fit is reported as a failed fit
    try:

        # the refined parameter vector is [period, amps..., phases...]
        curvefit_params = npconcatenate((nparray([period]), leastsqparams))

        # set up the bounds for the fit parameters: only the period is
        # bounded, the amplitudes and phases are free
        if fix_period:
            period_lower, period_upper = period - 1.0e-7, period + 1.0e-7
        else:
            period_lower, period_upper = 0.0, npinf

        curvefit_bounds = (
            [period_lower] + [-npinf] * fourierorder + [-npinf] * fourierorder,
            [period_upper] + [npinf] * fourierorder + [npinf] * fourierorder,
        )

        curvefit_func = partial(
            sinusoidal.fourier_curvefit_func,
            zerolevel=npmedian(smags),
            epoch=mintime,
            fixed_period=period if fix_period else None,
        )

        extra_kwargs = curve_fit_kwargs if curve_fit_kwargs is not None else {}

        finalparams, covmatrix = curve_fit(
            curvefit_func,
            stimes,
            smags,
            p0=curvefit_params,
            sigma=serrs,
            bounds=curvefit_bounds,
            absolute_sigma=(not scale_errs_redchisq_unity),
            **extra_kwargs)

    except Exception:
        LOGEXCEPTION("curve_fit returned an exception")
        finalparams, covmatrix = None, None

    # if the leastsq fit did not succeed, return the failure dict
    if finalparams is None or covmatrix is None:

        LOGERROR(
            'fourier-fit: least-squared fit to the light curve failed')

        return _fourier_fit_failure_dict(fourierorder, initialfit,
                                         ptimes, phase, pmags, perrs,
                                         magsarefluxes)

    # the fit succeeded: this is the fit period
    fperiod = finalparams[0]

    # re-phase the light curve at the fit period
    phase, pmags, perrs, ptimes, mintime = (get_phased_quantities(
        stimes, smags, serrs, fperiod))

    # calculate the chisq and reduced chisq
    fitmags = _fourier_func(finalparams[1:], phase, pmags)

    fitchisq = npsum(
        ((fitmags - pmags) * (fitmags - pmags)) / (perrs * perrs))

    # NOTE(review): finalparams already includes the period, so subtracting
    # one more when the period is *fixed* looks inverted relative to the
    # N - nparams - 1 convention of the other fit functions -- confirm before
    # changing, since this alters the reported reduced chi-sq.
    n_free_params = len(pmags) - len(finalparams)
    if fix_period:
        n_free_params -= 1

    fitredchisq = fitchisq / n_free_params
    stderrs = npsqrt(npdiag(covmatrix))

    if verbose:
        LOGINFO('final fit done. chisq = %.5f, reduced chisq = %.5f' %
                (fitchisq, fitredchisq))

    # figure out the time of light curve minimum (i.e. the fit epoch)
    # this is when the fit mag is maximum (i.e. the faintest)
    # or if magsarefluxes = True, then this is when fit flux is minimum
    if not magsarefluxes:
        fitmagminind = npwhere(fitmags == npmax(fitmags))
    else:
        fitmagminind = npwhere(fitmags == npmin(fitmags))
    if len(fitmagminind[0]) > 1:
        # several equally faint points: use the first one
        fitmagminind = (fitmagminind[0][0], )

    # assemble the returndict
    returndict = {
        'fittype': 'fourier',
        'fitinfo': {
            'fourierorder': fourierorder,
            # return coeffs only for backwards compatibility with
            # existing functions that use the returned value of
            # fourier_fit_magseries
            'finalparams': finalparams[1:],
            'finalparamerrs': stderrs,
            'initialfit': initialfit,
            'fitmags': fitmags,
            'fitperiod': finalparams[0],
            # the 'fitepoch' is just the folding epoch here
            'fitepoch': mintime,
            # the actual fit epoch is calculated as the time of minimum
            # light OF the fit model light curve
            'actual_fitepoch': ptimes[fitmagminind]
        },
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': {
            'times': ptimes,
            'phase': phase,
            'mags': pmags,
            'errs': perrs,
            'magsarefluxes': magsarefluxes
        },
    }

    # make the fit plot if required
    if plotfit and isinstance(plotfit, str):

        make_fit_plot(phase,
                      pmags,
                      perrs,
                      fitmags,
                      fperiod,
                      mintime,
                      mintime,
                      plotfit,
                      magsarefluxes=magsarefluxes)

        returndict['fitplotfile'] = plotfit

    return returndict
Ejemplo n.º 20
0
def add_pkg_to_local_db(new_row, local_db):
    """
    Adds a package entry to the local database table

    If no entry with the same ``name`` exists, the new entry is appended
    unchanged. If one does exist, the ``date_modified`` values (format
    ``YYYY-MM-DD``) of the two entries are compared:

    * new entry is as recent or more recent: the existing entry is renamed to
      ``<name>_<date_modified>`` and the new entry is added under the plain
      name.
    * new entry is older: the existing entry is left alone and the new entry
      is added under the name ``<name>_<date_modified>``.

    NOTE: when the existing entry is superseded, it is removed in place from
    the local table. If ``local_db`` was passed as a ``Table``, the caller's
    table therefore loses that row.

    Parameters
    ----------
    new_row : ``astropy.table.Row``
        The package entry data. Must have ``name`` and ``date_modified``
        columns.

    local_db : str, ``astropy.Table``
        Either the path to the local database file, which is read with
        ``get_local_packages``, or an already loaded table of local packages

    Returns
    -------
    new_local_table : ``astropy.Table``
        A new table holding the rows of the local table plus the added
        entry (or entries). Its ``meta`` is that of the local table.

    Raises
    ------
    ValueError
        If ``local_db`` is neither a str nor a ``Table``, or if ``new_row``
        is not a ``Row``.

    """

    def _single_row_table(entry):
        # build a one-row Table from a {column name: value} mapping
        return Table(names=list(entry),
                     data=[[value] for value in entry.values()])

    if isinstance(local_db, str):
        local_table = get_local_packages(local_db)
    elif isinstance(local_db, Table):
        local_table = local_db
    else:
        raise ValueError("local_db must be either Table or path to DB file")

    # ValueError (rather than TypeError) is kept for backwards compatibility
    if not isinstance(new_row, Row):
        raise ValueError("new_row must be an astropy.table.Row object")

    new_entry = {col: new_row[col] for col in new_row.colnames}

    if new_row["name"] in local_table["name"]:
        # index of the first existing entry with the same name
        ii = npwhere(local_table["name"] == new_row["name"])[0][0]
        fmt = '%Y-%m-%d'
        local_date = dt.datetime.strptime(local_table[ii]["date_modified"],
                                          fmt)
        new_date = dt.datetime.strptime(new_row["date_modified"], fmt)

        if new_date >= local_date:
            # archive the existing entry under a date-suffixed name and let
            # the new entry take over the plain name
            old_entry = {col: local_table[ii][col]
                         for col in local_table.colnames}
            old_entry["name"] = (old_entry["name"] + "_" +
                                 old_entry["date_modified"])
            tbl_to_add = vstack([_single_row_table(old_entry),
                                 _single_row_table(new_entry)])

            local_table.remove_row(ii)
        else:
            # the new entry is older: add it under a date-suffixed name
            new_entry["name"] = (new_entry["name"] + "_" +
                                 new_entry["date_modified"])
            tbl_to_add = _single_row_table(new_entry)

    else:
        tbl_to_add = _single_row_table(new_entry)

    new_local_table = vstack([local_table, tbl_to_add])
    new_local_table.meta = local_table.meta

    return new_local_table
Ejemplo n.º 21
0
def GetProbTot_Founders(H,
                        G,
                        G_offspring,
                        ReadsLST,
                        ploidy_levels,
                        error_rate,
                        plog=False,
                        QscoresLST=None,
                        usecounts=False,
                        Impute_Incompatible=True,
                        Impute_Missing=True,
                        redose=False):
    """Enumerate and weight the candidate extensions of two founder haplotypes.

    Determine the set of distinct extensions for a pair of founder base
    haplotypes H=(Hm, Hf) to position s using their genotypes at s, G=(Gm, Gf),
    calculate their prior weights and report the read-probabilities
    conditional on each extension. In case usecounts is True, also report the
    number of observed reads compatible with each homologue, so that the
    upstream functions may set a threshold on the minimum number of reads
    compatible with each homologue.

    Parameters
    ----------
    H : pair of haplotype objects
        Base haplotypes of the two parents; ``H[0]`` and ``H[1]`` are extended
        with ``+`` by every permutation of the corresponding parental
        genotype.
    G : sequence of genotype objects
        Parental genotypes at s. Only the first two entries are used as
        parents; ``G[0]`` supplies the SNP number and position used in the
        imputation calls and warnings.
    G_offspring : sequence of sequences of genotype objects
        One list of genotypes per offspring; the last element of each list is
        treated as the genotype at s.
    ReadsLST, ploidy_levels, error_rate, QscoresLST
        Forwarded unchanged to ``ImputeGenotype`` and/or
        ``GetProbReads_Founders``. ``ploidy_levels[0]`` and
        ``ploidy_levels[1]`` are also used to size the all-missing parental
        genotypes, and ``sum(ploidy_levels) // 2`` the all-missing offspring
        genotypes, when dosage estimation fails.
    plog : bool
        Forwarded to ``GetProbReads_Founders``.
    usecounts : bool
        If True, ``GetProbReads_Founders`` is asked for read counts as well
        and they are collected in ``Counts``; otherwise ``Counts`` holds
        ``None`` for every extension.
    Impute_Incompatible : bool
        If True and every candidate extension has a weight below 1e-60, all
        genotypes at s are imputed anew once and the enumeration is repeated.
        If False the extension is skipped instead.
    Impute_Missing : bool
        If True and at least one parental or offspring genotype at s is
        missing, the genotypes are imputed before enumeration.
    redose : bool
        If True, all genotypes at s are re-estimated (``FixedGenos=None``)
        regardless of missingness.

    Returns
    -------
    tuple
        ``(Uniques, weights, probs, Counts, Returned_Imputation)`` where
        ``Uniques`` is the list of distinct extended haplotype pairs,
        ``weights`` their prior weights normalised to sum to one, ``probs``
        the values returned by ``GetProbReads_Founders`` for each pair,
        ``Counts`` the matching read counts (or ``None`` entries), and
        ``Returned_Imputation`` the list of imputed genotypes at s (two
        parents followed by one per offspring) or ``None`` when no imputation
        took place. When the extension is skipped, ``Uniques`` and ``weights``
        are empty lists.

    Notes
    -----
    NOTE(review): on the skip paths only ``Uniques`` and ``weights`` are
    reset; ``probs`` and ``Counts`` keep the values of the last enumeration.
    Confirm that callers do not rely on the lists having equal length.
    """
    # NOTE(review): genotype_err is only referenced by commented-out code below.
    global genotype_err
    Returned_Imputation = None  # stays None unless genotypes are (re-)imputed
    done, attempt = False, 1  # at most two enumeration attempts are made
    # Working copy of the genotypes: indices 0 and 1 hold the parents,
    # indices 2.. hold one list of genotypes per offspring.
    Imputation = [
        Genotype(_G.GetS(), _G.GetPos(), *_G.GetGenes()) for _G in G
    ]  # Parental genotypes are needed at s to extend H
    Imputation = Imputation + [
        [Genotype(_Gc.GetS(), _Gc.GetPos(), *_Gc.GetGenes()) for _Gc in _Glst]
        for _Glst in G_offspring
    ]  # Offspring genotypes are needed from SNP 1 to s to check compatibility with H. Nevertheless, only the SNPs at s-1 and s are currently used to check compatibility.
    Impute_Missing = Impute_Missing and (
        any([_Gp.isMISSING() for _Gp in G])
        or any([_Glst[-1].isMISSING() for _Glst in G_offspring])
    )  # if no genotype is missing, Impute_Missing can be set to False
    if redose or Impute_Missing:  # if redose, reassign all of the genotypes. Otherwise, if Impute_Missing is True and missing genotypes exist at s (for some of the parents or the offspring), try to impute them!
        if not redose:
            # The return value of write() is bound only to discard it.
            garbage = sys.stderr.write(
                'WARNING: Missing offspring genotypes will be imputed at s={0:d}, position {1:d}!\n'
                .format(G[0].GetS() + 1, G[0].GetPos()))
        try:
            if not redose:
                # Impute with the current genotypes at s (both parents plus
                # the last genotype of every offspring) passed as FixedGenos.
                _Imputations = ImputeGenotype(
                    G[0].GetS(),
                    ReadsLST,
                    ploidy_levels,
                    error_rate,
                    QscoresLST,
                    FixedGenos=Imputation[0:2] +
                    [_Glst[-1] for _Glst in Imputation[2:]])
            else:
                # Re-estimate every genotype at s with nothing fixed.
                _Imputations = ImputeGenotype(G[0].GetS(),
                                              ReadsLST,
                                              ploidy_levels,
                                              error_rate,
                                              QscoresLST,
                                              FixedGenos=None)
        except BlockException as e:
            sys.stderr.write(
                'WARNING: ' + ''.join(e.args) +
                " Failed to {2:s} at SNP {0:d}, position {1:d}!\n".format(
                    G[0].GetS() + 1, G[0].GetPos(), "estimate dosages"
                    if redose else "impute missing genotypes"))
            if redose:
                # Dosage estimation failed: fall back to all-missing ('-')
                # alleles for both parents and every offspring.
                _Imputations = [
                    ('-', ) * ploidy_levels[0], ('-', ) * ploidy_levels[1]
                ] + [
                    tuple('-' for _x in range(0,
                                              sum(ploidy_levels) // 2))
                    for _Glst in G_offspring
                ]
            else:
                # Imputation failed: keep the given alleles and use all-missing
                # alleles only for the offspring whose genotype at s is missing.
                _Imputations = [_G.GetGenes() for _G in G] + [
                    tuple('-' for _x in range(0,
                                              sum(ploidy_levels) // 2))
                    if _Glst[-1].isMISSING() else _Glst[-1].GetGenes()
                    for _Glst in G_offspring
                ]
        except:
            # NOTE(review): re-raises unchanged, i.e. equivalent to having no handler.
            raise
        # Rebuild the working copy: parents from the imputed alleles, and for
        # each offspring its earlier genotypes followed by the imputed one at s.
        Imputation = [
            Genotype(G[0].GetS(), G[0].GetPos(), *_alleles)
            for _alleles in _Imputations[0:2]
        ] + [[
            Genotype(_Gc.GetS(), _Gc.GetPos(), *_Gc.GetGenes())
            for _Gc in _Glst[:-1]
        ] + [Genotype(_Glst[-1].GetS(), _Glst[-1].GetPos(), *_alleles)]
             for _Glst, _alleles in zip(G_offspring, _Imputations[2:])]
        Returned_Imputation = [
            Genotype(_Gp.GetS(), _Gp.GetPos(), *_alleles)
            for _Gp, _alleles in zip(G, _Imputations[0:2])
        ] + [
            Genotype(_Glst[-1].GetS(), _Glst[-1].GetPos(), *_alleles)
            for _Glst, _alleles in zip(G_offspring, _Imputations[2:])
        ]  # This will be passed to the caller function to show imputation has occurred
    # Enumerate the extensions; a second pass happens only after the genotypes
    # have been imputed anew because every extension was incompatible.
    while not done and attempt <= 2:
        perm = [
            makePermutation(_G) for _G in Imputation[0:2]
        ]  # (distinct) permutations of maternal and paternal genotypes
        probs = [
        ]  # the probability of Semi Reads at s conditional on (Hp, H, eps)
        weights = []  # the prior probability of Hp conditional on (H, eps)
        Uniques = []  # distinct Hp's
        Counts = []  # Number of reads compatible with each homologue
        for P1 in perm[
                0]:  # evaluate all of the possible extensions for the base haplotypes of each parent
            for P2 in perm[1]:
                Hp = (H[0] + P1, H[1] + P2)
                if Hp not in Uniques:
                    Uniques.append(Hp)
                    if usecounts:
                        # The extra True argument makes the helper return
                        # (probability, counts) instead of the probability only.
                        _prob, _counts = GetProbReads_Founders(
                            ReadsLST, Hp, error_rate, plog, QscoresLST, True)
                        probs.append(_prob)
                        Counts.append(_counts)
                    else:
                        probs.append(
                            GetProbReads_Founders(ReadsLST, Hp, error_rate,
                                                  plog, QscoresLST))
                        Counts.append(None)
                    # NOTE(review): ploidies is not used again in this function.
                    ploidies = [len(Hp[0].GetVS()), len(Hp[1].GetVS())]
                    Candid_Offspring_Extensions_Hp = []
                    # Only the last two alleles of every homologue are passed
                    # on, so gametes are built for SNPs s-1 and s alone.
                    for _megagamete in Gametogenesis(
                            Haplotypes(
                                Hp[0].GetStop() - 1, Hp[0].GetStop(), 0, 0,
                                None, None, *[
                                    tuple(_h[len(_h) - 2:len(_h)])
                                    for _h in Hp[0].GetVS()
                                ])
                    ):  # obtain and store all of the possible offspring haplotypes from the parents assuming no recombination
                        for _microgamete in Gametogenesis(
                                Haplotypes(
                                    Hp[1].GetStop() - 1, Hp[1].GetStop(), 0, 0,
                                    None, None, *[
                                        tuple(_h[len(_h) - 2:len(_h)])
                                        for _h in Hp[1].GetVS()
                                    ])):
                            Candid_Offspring_Extensions_Hp.append(
                                Haplotypes(Hp[0].GetStop() - 1,
                                           Hp[0].GetStop(), 0, 0, None, None,
                                           *(_megagamete + _microgamete)))
                    _prior = 0  # the prior weight of a candidate founder extension
                    #_prior = 1 # the prior weight of a candidate founder extension
                    total_offspring_phasings_possible = len(
                        Candid_Offspring_Extensions_Hp)
                    # For every offspring, count the candidate offspring
                    # haplotypes compatible with its genotypes and add the
                    # square of that count to the prior.
                    for _id in range(2, len(Imputation)):
                        #_min_number_of_errors = 2 # Mininum number of genotype incompatibilities for each offspring at s-1 and s, assumign a candidate parental extension Hp (min = 0 and max = 2, naturally!)
                        _number_of_compatible_phasings = 0
                        for _Hc in Candid_Offspring_Extensions_Hp:
                            #_error_Hc = Check_Genotype_Compatibility(_Hc, Imputation[_id], 0, give_number_of_incompatibles=True)
                            #if _error_Hc < _min_number_of_errors:
                            #        _min_number_of_errors = _error_Hc
                            if Check_Genotype_Compatibility(
                                    _Hc, Imputation[_id], 0):
                                _number_of_compatible_phasings += 1
                            else:
                                pass
                        _prior += (_number_of_compatible_phasings *
                                   _number_of_compatible_phasings)
                        #print(_prior)
                        #_prior*=misc.comb(2,_min_number_of_errors)*genotype_err**_min_number_of_errors*(1-genotype_err)**(2-_min_number_of_errors)
                    #weights.append(_prior/(1e-60+len(Candid_Offspring_Extensions_Hp)*(len(Imputation)-2))) # P(Hm,Hf|ReadLST)=P(ReadLST|Hm,Hf)P(Hm,Hf)=P(ReadsLST|Hm,Hf)P(Offspring Genotypes|Hm, Hf)
                    #if weights[-1]>1e-10:
                    #    weights[-1]=1
                    # Scale the summed squared counts by the squared number of
                    # candidate offspring haplotypes.
                    # NOTE(review): this divides by zero if Gametogenesis
                    # yields no candidates - confirm that cannot happen.
                    weights.append(
                        float(_prior) / (total_offspring_phasings_possible *
                                         total_offspring_phasings_possible))
                    # Multiply the weight by a per-parent factor derived from
                    # the last two called alleles of each homologue.
                    for _Hp in Hp:
                        _npVset = []
                        for _v in _Hp.GetVS():
                            _npv = array(_v)
                            # drop the '-' entries from the homologue
                            _npVset.append(
                                npdel(_npv, npwhere(_npv == '-')).tolist())
                        try:
                            weights[-1] *= (2**GetLogProbH(
                                Haplotypes(
                                    1, 2, 1,
                                    loge(
                                        len(
                                            set(
                                                itertools.permutations(
                                                    tuple(
                                                        (_v[-2], _v[-1])
                                                        for _v in _npVset))))),
                                    None, None,
                                    *tuple(
                                        (_v[-2], _v[-1]) for _v in _npVset))))
                        except IndexError:
                            # Presumably raised by _v[-2] when a homologue has
                            # fewer than two called alleles; the factor is then
                            # left out for this parent.
                            pass
                    #weights[-1]*=4**sum(1 for x in Hp[0].GetVS() if str(x[-1])==str(x[-2]) and str(x[-1])=='0')
                    #weights[-1]*=4**sum(1 for x in Hp[1].GetVS() if str(x[-1])==str(x[-2]) and str(x[-1])=='0')
                    #weights.append(_prior) # P(Hm,Hf|ReadLST)=P(ReadLST|Hm,Hf)P(Hm,Hf)=P(ReadsLST|Hm,Hf)P(Offspring Genotypes|Hm, Hf)
                    #if _prior>1e-10:
                    #        weights.append(1) # uninformative prior
                    #else:
                    #        weights.append(0) # incompatible extension
                else:
                    pass
        #all_incompatible_prob = (1+1e-10)*(genotype_err**2)**(len(Imputation)-2)
        if all(
                _x < 1e-60 for _x in weights
        ):  # if no offspring extension derived from the parental extensions is compatible with the offspring genotypes, estimate the offspring genotype at s anew. This condition is NOT expected to occur with "redose" set to True. Check if all weights are zero taking numerical uncertainty into account.
            #if all(_x<all_incompatible_prob for _x in weights):# if no offspring extension derived from the parental extensions is compatible with the offspring genotypes, estimate the offspring genotype at s anew. This condition is NOT expected to occur with "redose" set to True. Check if all weights are zero taking numerical uncertainty into account.
            if not Impute_Incompatible:
                sys.stderr.write(
                    "WARNING: Parental genotypes were incompatible with the offspring genotypes! Extension will be skipped at SNP {0:d}, position {1:d}!\n"
                    .format(Imputation[0].GetS() + 1, Imputation[0].GetPos()))
                Uniques, weights = [], []
                done = True
            else:
                attempt += 1
                if attempt <= 2:
                    sys.stderr.write(
                        "WARNING: Parental genotypes were incompatible with the offspring genotypes! All of the genotypes will be imputed anew at SNP {0:d}, position {1:d}!\n"
                        .format(Imputation[0].GetS() + 1,
                                Imputation[0].GetPos()))
                    # Current genotypes at s; used only for their SNP number
                    # and position when the new Genotype objects are built.
                    _Imputations = Imputation[0:2] + [
                        _Glst[-1] for _Glst in Imputation[2:]
                    ]
                    try:
                        _Imputations = [
                            Genotype(_G.GetS(), _G.GetPos(), *_alleles)
                            for _G, _alleles in zip(
                                _Imputations,
                                ImputeGenotype(G[0].GetS(), ReadsLST,
                                               ploidy_levels, error_rate,
                                               QscoresLST, None))
                        ]
                    except BlockException as e:
                        sys.stderr.write(
                            'WARNING: ' + ''.join(e.args) +
                            " Extension will be skipped at SNP {0:d}, position {1:d}!\n"
                            .format(Imputation[0].GetS() +
                                    1, Imputation[0].GetPos()))
                        Uniques, weights = [], []
                        done = True
                    except:
                        # NOTE(review): re-raises unchanged, i.e. equivalent to having no handler.
                        raise
                    else:
                        # Imputation succeeded: report the new genotypes and
                        # rebuild the working copy for the second attempt.
                        Returned_Imputation = [_G for _G in _Imputations]
                        Imputation = _Imputations[0:2] + [[
                            Genotype(_Gc.GetS(), _Gc.GetPos(), *_Gc.GetGenes())
                            for _Gc in _Glst[:-1]
                        ] + [_Impute] for _Glst, _Impute in zip(
                            G_offspring, _Imputations[2:])]
                #weights = [1./len(weights) for _w in weights] # Uninformative prior
                #done = True
        else:
            done = True
    if not done:
        # Reached when the second attempt was incompatible as well.
        sys.stderr.write(
            "WARNING: Parental genotypes were still incompatible with the offspring after imputation! Extension will be therefore skipped at SNP {0:d}, position {1:d}!\n"
            .format(Imputation[0].GetS() + 1, Imputation[0].GetPos()))
        Uniques, weights = [], []
    # Normalise the weights to sum to one. With an empty list the
    # comprehension has nothing to divide, so the zero sum is harmless.
    _norm = float(sum(weights))
    weights = [_w / _norm for _w in weights]
    return Uniques, weights, probs, Counts, Returned_Imputation
Ejemplo n.º 22
0
def where(mask, yes, no):
    """Call ``npwhere(mask, yes, no)`` and wrap the result with ``from_array``.

    The three arguments are forwarded unchanged and in the same order.
    """
    selected = npwhere(mask, yes, no)
    return from_array(selected)
Ejemplo n.º 23
0
def simple_flare_find(times, mags, errs,
                      smoothbinsize=97,
                      flareminsigma=4.0,
                      flaremaxcadencediff=1,
                      flaremincadencepoints=3,
                      magsarefluxes=False,
                      savgolpolyorder=2,
                      **savgolkwargs):
    '''This finds flares in time series using the method in Walkowicz+ 2011.

    The light curve is detrended with a Savitsky-Golay filter, and points
    deviating from the detrended series by more than flareminsigma robust
    standard deviations (1.483 x the median absolute residual) are selected
    as candidate flare measurements.

    The function is meant to return the number of flares found, and their
    time indices.

    NOTE(review): the grouping of candidate points into flare events is still
    a placeholder call (stuff_to_do) in the code visible here, so
    flaremaxcadencediff and flaremincadencepoints are not used yet and no
    return statement is reached in this part of the function.

    Args
    ----

    times, mags, errs are numpy arrays for the time series. If errs is None,
    errors of 0.1% of mags are assumed.

    Kwargs
    ------

    smoothbinsize: the number of consecutive light curve points to smooth over
    in the time series using a Savitsky-Golay filter. The smoothed light curve
    is then subtracted from the actual light curve to remove trends that
    potentially last smoothbinsize light curve points. The default value is
    chosen as ~6.5 hours (97 x 4 minute cadence for HATNet/HATSouth).

    flareminsigma: the minimum sigma above the median light curve level to
    designate points as belonging to possible flares

    flaremaxcadencediff: the maximum number of light curve points apart each
    possible flare event measurement is allowed to be. If this is 1, then we'll
    look for consecutive measurements.

    flaremincadencepoints: the minimum number of light curve points (each
    flaremaxcadencediff points apart) required that are at least flareminsigma
    above the median light curve level to call an event a flare.

    magsarefluxes: if True, indicates that mags is actually an array of fluxes.

    savgolpolyorder: the polynomial order of the function used by the
    Savitsky-Golay filter.

    Any remaining keyword arguments are passed directly to the savgol_filter
    function from scipy.

    '''

    # if no errs are given, assume 0.1% errors
    if errs is None:
        errs = 0.001*mags

    # get rid of nans first
    finiteind = npisfinite(times) & npisfinite(mags) & npisfinite(errs)
    ftimes = times[finiteind]
    fmags = mags[finiteind]
    ferrs = errs[finiteind]

    # now get the smoothed mag series using the filter
    # kwargs are provided to the savgol_filter function
    smoothed = savgol_filter(fmags,
                             smoothbinsize,
                             savgolpolyorder,
                             **savgolkwargs)
    subtracted = fmags - smoothed

    # calculate some stats
    # the series_median is ~zero after subtraction, so the MAD is taken
    # about zero and scaled by 1.483 to get a robust standard deviation
    series_mad = npmedian(npabs(subtracted))
    series_stdev = 1.483*series_mad

    # find extreme deviations in the brightening direction: positive residuals
    # for fluxes, negative residuals for magnitudes. The threshold uses the
    # flareminsigma parameter (the body previously referred to an undefined
    # name, minflaresigma, which raised NameError on every call).
    if magsarefluxes:
        extind = npwhere(subtracted > (flareminsigma*series_stdev))
    else:
        extind = npwhere(subtracted < (-flareminsigma*series_stdev))

    # see if there are any extrema. The size of the index array is tested
    # rather than its truth value: the truth value of an array is ambiguous
    # (ValueError) for more than one element, and a single extremum at index 0
    # would evaluate as False.
    if extind and extind[0].size > 0:

        extrema_indices = extind[0]
        flaregroups = []

        # find the deviations within the requested flaremaxcadencediff
        for ind, extrema_index in enumerate(extrema_indices):

            stuff_to_do()