def __init__(self, headername, liststations, filename, outputdir):
    """Read a UKMO csv file and write one netCDF file per station.

    The data are loaded with ``read_ukmo``; for every unique value in the
    ``ID`` column the matching rows are extracted and handed to
    ``self.write_netcdf`` together with the station's longitude, latitude
    and elevation.

    Stations are skipped when their stripped ID (or one of the three
    location fields) is missing from ``stationdata``, or when any of the
    three location fields equals the ``-99999`` sentinel.

    :param headername: passed through to ``read_ukmo``
    :param liststations: passed through to ``read_ukmo``
    :param filename: input file; stored on the instance and passed to
        ``read_ukmo``
    :param outputdir: stored on the instance as ``self.outputdir``
    """
    self.filename = filename
    self.outputdir = outputdir
    aa = read_ukmo(headername, liststations, filename)
    # unique IDs
    ids = npunique(aa.csvdata['ID'])
    for identifier in ids:
        try:
            # single lookup of the station record; a missing station or a
            # missing field both raise KeyError and skip the station
            station = aa.stationdata[identifier.strip()]
            lon = station['longitude']
            lat = station['latitude']
            elevation = station['elevation']
        except KeyError:
            continue
        if (lon == -99999) or (lat == -99999) or (elevation == -99999):
            continue
        # list of indices
        idx = npwhere(aa.csvdata['ID'] == identifier)[0]
        # extract all keys for selected station identifier
        dataout = dict(
            (k, nparray(aa.csvdata[k])[idx]) for k in aa.csvdata.keys())
        stationid = dataout['ID'][0]
        # remove variables that are passed separately (or not at all)
        for key in ('longitude', 'latitude', 'elevation', 'ID'):
            dataout.pop(key, None)
        # create netcdf file; use a distinct name so the input `filename`
        # parameter is not shadowed
        outfile = self.define_output_file(stationid)
        self.write_netcdf(outfile, dataout, lon, lat, elevation)
def GetProbTot(H, G, Reads, error_rate, plog = False, Qscores = None, usecounts=False):
    """Enumerate the distinct extensions of H by the permutations of G.

    For every distinct extension Hp = H + P the function records the value
    returned by GetProbReads for the reads and a prior weight; the weights
    are normalised to sum to one before returning.

    If usecounts is True, GetProbReads is asked for per-homologue counts as
    well and the minimum count of each extension is collected in the fourth
    returned list (which stays empty otherwise), so that callers can
    threshold on the minimum number of reads compatible with a homologue.

    Returns (Uniques, weights, probs, Mins).
    """
    Uniques = []   # distinct Hp's
    weights = []   # the prior probability of Hp conditional on (H, eps)
    probs = []     # the probability of SR(s) conditional on (Hp, H, eps)
    Mins = []
    for _P in makePermutation(G):   # distinct permutations of a genotype
        Hp = H + _P
        if Hp in Uniques:
            continue
        Uniques.append(Hp)
        if usecounts:
            _prob, _counts = GetProbReads(Reads, Hp, error_rate, plog, Qscores, True)
            probs.append(_prob)
            Mins.append(min(_counts))
        else:
            probs.append(GetProbReads(Reads, Hp, error_rate, plog, Qscores))
        # drop the '-' entries from every element of Hp.GetVS()
        _trimmed = []
        for _v in Hp.GetVS():
            _arr = array(_v)
            _trimmed.append(npdel(_arr, npwhere(_arr=='-')).tolist())
        try:
            # the weight is computed from the last two remaining entries of
            # each element; fewer than two entries raises IndexError
            _pairs = [(_v[-2],_v[-1]) for _v in _trimmed]
            _nperm = len(set(itertools.permutations(tuple(_pairs))))
            _w = exp(GetLogProbH(Haplotypes(1, 2, 1, log(_nperm), None, None, *_pairs)))
        except IndexError:
            _w = 1
        weights.append(_w)
    wsum = float(sum(weights))
    weights = [_w/wsum for _w in weights]
    return Uniques, weights, probs, Mins
def add_uniprot_descriptors(df, debug):
    '''
    Adds uniprot category flags from the preprocessed uniprot feature set.

    The uniprot ID is taken from the first row of df. The module-level
    UNP_DF cache is loaded with load_uniprot_df() on first use. Rows of
    UNP_DF for that uniprot are left-merged onto df by 'uniprot_position';
    if there are none, a table of HOLDERS values is built for every unique
    uniprot_position in df instead. Any null left in a HEADERS['unp']
    column after the merge is replaced with the matching HOLDERS value.

    df:    dataframe with 'uniprot' and 'uniprot_position' columns
    debug: if true, print progress messages
    returns the merged dataframe with duplicate rows dropped
    '''
    global UNP_DF
    current_unp = df.iloc[0].uniprot
    if UNP_DF is None:
        UNP_DF = load_uniprot_df() #TODO: move this to do at the beginning in main
    debug_head = "DEBUG: descriptors: add_uniprot_descriptors: "
    # print(...) with a single argument behaves the same on Python 2 and 3
    if debug:
        print(debug_head+"Adding uniprot category descriptors")
    unpdf = UNP_DF[UNP_DF.uniprot==current_unp].drop(['uniprot'],axis=1)
    if unpdf.empty:
        # Probably don't need to do this as nulls are filled in at the end
        # No features assigned to this uniprot, need to use holders
        if debug:
            print(debug_head+"No uniprot features found for uniprot {}, using holders".format(current_unp))
        holderdict = {'uniprot_position': list(df.uniprot_position.unique())}
        totalres = len(holderdict['uniprot_position'])
        for h in HEADERS['unp']:
            holderdict[h] = [HOLDERS[h]]*totalres
        unpdf = pd.DataFrame.from_dict(holderdict)
    elif debug:
        print(debug_head+"Uniprot features found for {} residues in {}".format(len(unpdf.index),current_unp))
    ndf = pd.merge(df,unpdf,how="left",on='uniprot_position')
    # positions without a uniprot feature come out of the left merge as null
    for h in HEADERS['unp']:
        ndf[h] = npwhere(ndf[h].isnull(),HOLDERS[h],ndf[h])
    return ndf.drop_duplicates()
def find_carryover_indexes(high_cup_name, low_cup_name, analysis_cups):
    """
    Finds the indexes for the carryover samples; handles the high and the low
    carryover cup in one call.

    Indexes are returned as plain Python ints, matching find_cup_indexes.

    :param high_cup_name: cup type name of the high carryover sample
    :param low_cup_name: cup type name of the low carryover sample
    :param analysis_cups: sequence of cup type names, one per peak
    :return: (clean_high, clean_low) - two lists with the positions in
             analysis_cups equal to high_cup_name and low_cup_name
    """
    # TODO: could get rid of this function and just use find_cup_indexes twice for both of the carryover samples...
    a_c = nparray(analysis_cups)
    clean_high = [int(x) for x in npwhere(a_c == high_cup_name)[0]]
    clean_low = [int(x) for x in npwhere(a_c == low_cup_name)[0]]
    return clean_high, clean_low
def get_start_stop_interval_idxs(timestamps_, new_timestamps_):
    """
    Returns the first and last index of new_timestamps_ whose value lies
    inside the closed interval [timestamps_[0], timestamps_[-1]].

    Returns (None, None) when no new timestamp falls inside the interval or
    when timestamps_ is empty (there is no interval to test against).
    """
    if len(timestamps_) == 0:
        return None, None
    # builtin float is the float64 dtype; this avoids depending on the
    # `numpy.float` alias, which newer NumPy releases no longer provide
    nts = npfromiter(new_timestamps_, float)
    inside = npwhere((nts >= timestamps_[0]) & (nts <= timestamps_[-1]))[0]
    if inside.size == 0:
        return None, None
    return inside[0], inside[-1]
def get_package_table_entry(pkg_name, db_table):
    """Return the row of db_table whose "name" column equals pkg_name.

    Only the first matching row is returned; None is returned when the
    name does not occur in the "name" column.
    """
    # ::todo implement multiple entry handling
    if pkg_name not in db_table["name"]:
        return None
    first_match = npwhere(db_table["name"] == pkg_name)[0][0]
    return db_table[first_match]
def find_cup_indexes(specified_cup, analysis_cups):
    """
    Finds the indexes of a cup type of interest, e.g. asking for DRIF gives
    the peak indexes of the Drift samples.

    :param specified_cup: cup type name to look for
    :param analysis_cups: sequence of cup type names, one per peak
    :return: clean_indexes - list of plain ints, the positions in
             analysis_cups that equal specified_cup
    """
    matches = npwhere(nparray(analysis_cups) == specified_cup)[0]
    return list(map(int, matches))
def attach_descriptors(self, partner):
    '''
    Attach descriptors to partner df
    Requires a partner df

    If self.descriptors is None the partner is returned unchanged.
    If self.id is "Holder", a table holding the HOLDERS value of every
    column in self.header is built on the partner's index and concatenated
    column-wise to the partner.
    Otherwise self.descriptors[self.header] is left-merged onto the partner
    on self.res_header, and nulls in the non-key descriptor columns are
    replaced with their HOLDERS value.

    returns the merged/concatenated result
    '''
    # print(...) with a single argument behaves the same on Python 2 and 3
    if self.debug:
        print(self.debug_head+"Attaching descriptors to df with {} rows".format(len(partner.index)))
    if self.descriptors is None:
        return partner
    if self.id == "Holder":
        # Generate a holder descriptor table
        nrow = len(partner.index)
        desc_dict = dict()
        for d in self.header:
            desc_dict[d] = [HOLDERS[d]]*nrow
        # Build the table directly on the partner's index. The previous
        # `holder_desc.set_index(partner.index)` discarded its result, so
        # the table kept a 0..n-1 index and the column-wise concat
        # misaligned rows whenever the partner index was anything else.
        holder_desc = pd.DataFrame(desc_dict, index=partner.index)
        if self.debug:
            print(self.debug_head+"Concatenating holder descriptors")
        return pd.concat([partner,holder_desc[self.header]],axis=1)
    else:
        newdf = partner.merge(self.descriptors[self.header],
                              how='left',
                              on=self.res_header)
        # Fill in any residues that were missing descriptors
        for x in self.header:
            if x in self.res_header:
                # merge keys are never filled with holders
                continue
            newdf[x] = npwhere(newdf[x].isnull(),HOLDERS[x],newdf[x])
        return newdf
def _metadata_velox(path):
    """Collect metadata from the first data group of a Velox EMD file.

    The file is opened with ``emdVelox.fileEMDVelox``. The first column of
    the group's 'Metadata' dataset is treated as the bytes of a JSON
    document: entries that are not > 0 are dropped, the rest is decoded as
    UTF-8 (undecodable bytes ignored) and parsed.

    Returns a dict containing 'veloxFlag', 'FileName', the pixel size /
    offset / unit entries for X and Y (with defaults 1 / 0 / '' when they
    cannot be read from the 'BinaryResult' section), every top-level key
    of the parsed JSON, and 'shape' of the 'Data' dataset.
    """
    # parameterized by path rather than emd_obj so that lru hashing resolves easily
    metaData = {}
    metaData['veloxFlag'] = True
    metaData['FileName'] = path

    emd_obj = emdVelox.fileEMDVelox(path)
    dataGroup = emd_obj.list_data[0]
    dataset0 = dataGroup['Data']

    # Convert JSON metadata to dict
    mData = dataGroup['Metadata'][:, 0]
    validMetaDataIndex = npwhere(mData > 0)  # find valid metadata
    # tobytes() is the supported spelling of the removed ndarray.tostring()
    mData = mData[validMetaDataIndex].tobytes()
    mDataS = json.loads(mData.decode('utf-8', 'ignore'))  # load UTF-8 string as JSON and output dict

    try:
        # Store the X and Y pixel size, offset and unit
        binary_result = mDataS['BinaryResult']
        metaData['PhysicalSizeX'] = float(binary_result['PixelSize']['width'])
        metaData['PhysicalSizeXOrigin'] = float(binary_result['Offset']['x'])
        metaData['PhysicalSizeXUnit'] = binary_result['PixelUnitX']
        metaData['PhysicalSizeY'] = float(binary_result['PixelSize']['height'])
        metaData['PhysicalSizeYOrigin'] = float(binary_result['Offset']['y'])
        metaData['PhysicalSizeYUnit'] = binary_result['PixelUnitY']
    except (KeyError, TypeError, ValueError):
        # missing section/key or a value that is not a number: fall back to
        # unit-less defaults for all six entries
        metaData['PhysicalSizeX'] = 1
        metaData['PhysicalSizeXOrigin'] = 0
        metaData['PhysicalSizeXUnit'] = ''
        metaData['PhysicalSizeY'] = 1
        metaData['PhysicalSizeYOrigin'] = 0
        metaData['PhysicalSizeYUnit'] = ''

    metaData.update(mDataS)
    metaData['shape'] = dataset0.shape
    return metaData
def legendre_fit_magseries(times, mags, errs, period,
                           legendredeg=10,
                           sigclip=30.0,
                           plotfit=False,
                           magsarefluxes=False,
                           verbose=True):
    '''Least-squares fit of a Legendre series to the phased mag/flux series.

    The fitted function has the form:

        p(x) = c_0*L_0(x) + c_1*L_1(x) + ... + c_n*L_n(x)

    where the L_i are Legendre polynomials and the c_i are the fitted
    coefficients. The fit itself is done by
    numpy.polynomial.legendre.Legendre.fit on the phased light curve.

    Args:

        legendredeg (int): n in the equation above (n=5 yields 6
        coefficients). Should be much smaller than the number of points.

        sigclip (float): passed to sigclip_magseries before fitting.

        plotfit (str or False): if a string, a fit plot is written to this
        file name.

        magsarefluxes (bool): False for magnitudes, True for fluxes. Decides
        whether the fit epoch is taken at the maximum (mags) or the minimum
        (fluxes) of the fitted values, and is passed on to the plot.

    Returns a dict:

        {'fittype': 'legendre',
         'fitinfo': {'legendredeg', 'fitmags', 'fitepoch', 'finalparams'},
         'fitchisq', 'fitredchisq', 'fitplotfile',
         'magseries': {'times', 'phase', 'mags', 'errs', 'magsarefluxes'}}

    where 'fitmags' is the fit evaluated at the phases of the mag series.

    '''

    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # points with zero error cannot be used in the chi-sq
    keep = npnonzero(serrs)
    phase, pmags, perrs, ptimes, mintime = _get_phased_quantities(
        stimes[keep], smags[keep], serrs[keep], period
    )

    if verbose:
        LOGINFO('fitting Legendre series with '
                'maximum Legendre polynomial order %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (legendredeg, len(pmags), period, mintime))

    # Legendre.fit takes care of mapping the phases onto the window in
    # which the Legendre polynomials are defined
    series = Legendre.fit(phase, pmags, legendredeg)
    coeffs = series.coef
    fitmags = series(phase)

    # chi-sq and reduced chi-sq; there are legendredeg + 1 coefficients
    resid = fitmags - pmags
    fitchisq = npsum((resid * resid) / (perrs * perrs))
    dof = len(pmags) - (legendredeg + 1) - 1
    fitredchisq = fitchisq / dof

    if verbose:
        LOGINFO('Legendre fit done. chisq = %.5f, reduced chisq = %.5f' %
                (fitchisq, fitredchisq))

    # the fit epoch is the time of light curve minimum: the largest fitted
    # magnitude (faintest), or the smallest fitted flux
    extremum = npmin(fitmags) if magsarefluxes else npmax(fitmags)
    magseriesepoch = ptimes[npwhere(fitmags == extremum)]

    fitinfo = {
        'legendredeg': legendredeg,
        'fitmags': fitmags,
        'fitepoch': magseriesepoch,
        'finalparams': coeffs,
    }
    magseries = {
        'times': ptimes,
        'phase': phase,
        'mags': pmags,
        'errs': perrs,
        'magsarefluxes': magsarefluxes,
    }
    returndict = {
        'fittype': 'legendre',
        'fitinfo': fitinfo,
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': magseries,
    }

    # only a string plotfit triggers plotting
    if plotfit and isinstance(plotfit, str):
        _make_fit_plot(phase, pmags, perrs, fitmags,
                       period, mintime, magseriesepoch,
                       plotfit,
                       magsarefluxes=magsarefluxes)
        returndict['fitplotfile'] = plotfit

    return returndict
def savgol_fit_magseries(times, mags, errs, period,
                         windowlength=None,
                         polydeg=2,
                         sigclip=30.0,
                         plotfit=False,
                         magsarefluxes=False,
                         verbose=True):
    '''Smooth the phased mag/flux series with a Savitzky-Golay filter.

    A Savitzky-Golay filter fits a low-order polynomial by least squares to
    successive windows of adjacent points and returns, for every point, the
    value of its local polynomial. (See Numerical Recipes Ch. 14.8.) The
    filter assumes evenly spaced samples, which a phased light curve is
    not; treating the points as evenly spaced adds some noise to the result.

    Args:

        windowlength (int or None): number of points in the filter window.
        Must be positive and odd, or None. If it is not an int, it is set to
        max(polydeg + 3, number of phased points / 300), bumped up by one if
        that value is even.

        polydeg (int): order of the local polynomial; must be smaller than
        windowlength.

        sigclip (float): passed to sigclip_magseries before filtering.

        plotfit (str or False): if a string, a fit plot is written to this
        file name.

        magsarefluxes (bool): False for magnitudes, True for fluxes. Decides
        whether the fit epoch is taken at the maximum (mags) or the minimum
        (fluxes) of the filtered values, and is passed on to the plot.

    Returns a dict with 'fittype' == 'savgol'. The 'fitredchisq' entry is
    always -99. because the degrees of freedom of the filter are not
    quantified.
    '''

    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # points with zero error cannot be used in the chi-sq
    keep = npnonzero(serrs)
    phase, pmags, perrs, ptimes, mintime = _get_phased_quantities(
        stimes[keep], smags[keep], serrs[keep], period
    )

    if not isinstance(windowlength, int):
        windowlength = max(polydeg + 3, int(len(phase) / 300))
        # the automatically chosen window must be odd
        if windowlength % 2 == 0:
            windowlength += 1

    if verbose:
        LOGINFO('applying Savitzky-Golay filter with '
                'window length %s and polynomial degree %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (windowlength, polydeg, len(pmags), period, mintime))

    # mode='wrap' treats the series as periodic, which suits a
    # phase-folded light curve; the filter output plays the role of the fit
    fitmags = savgol_filter(pmags, windowlength, polydeg, mode='wrap')

    resid = fitmags - pmags
    fitchisq = npsum((resid * resid) / (perrs * perrs))

    # TODO: quantify dof for SG filter.
    # The estimate below is computed and then overridden with -99.
    nparams = int(len(pmags) / windowlength) * polydeg
    fitredchisq = fitchisq / (len(pmags) - nparams - 1)
    fitredchisq = -99.

    if verbose:
        LOGINFO('SG filter applied. chisq = %.5f, reduced chisq = %.5f' %
                (fitchisq, fitredchisq))

    # the fit epoch is the time of light curve minimum: the largest filtered
    # magnitude (faintest), or the smallest filtered flux
    extremum = npmin(fitmags) if magsarefluxes else npmax(fitmags)
    magseriesepoch = ptimes[npwhere(fitmags == extremum)]

    fitinfo = {
        'windowlength': windowlength,
        'polydeg': polydeg,
        'fitmags': fitmags,
        'fitepoch': magseriesepoch,
    }
    magseries = {
        'times': ptimes,
        'phase': phase,
        'mags': pmags,
        'errs': perrs,
        'magsarefluxes': magsarefluxes,
    }
    returndict = {
        'fittype': 'savgol',
        'fitinfo': fitinfo,
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': magseries,
    }

    # only a string plotfit triggers plotting
    if plotfit and isinstance(plotfit, str):
        _make_fit_plot(phase, pmags, perrs, fitmags,
                       period, mintime, magseriesepoch,
                       plotfit,
                       magsarefluxes=magsarefluxes)
        returndict['fitplotfile'] = plotfit

    return returndict
def spline_fit_magseries(times, mags, errs, period,
                         knotfraction=0.01,
                         maxknots=30,
                         sigclip=30.0,
                         plotfit=False,
                         ignoreinitfail=False,
                         magsarefluxes=False,
                         verbose=True):
    '''Fit a univariate spline to the phased light curve.

    Such a fit may do better than a Fourier fit on sharply variable objects
    like EBs, so it can help to tell them apart from other variables.

    knotfraction is the fraction of the number of phased observations used
    as the number of internal knots (0.01, i.e. 1%, appears to work well
    without over-fitting); maxknots caps that number. The knots are spread
    evenly between the first phase + 0.01 and the last phase - 0.01.

    If errs is None, an error of 0.005 is assumed for every point, since
    the spline fit is weighted by 1/err.

    magsarefluxes is False for magnitudes and True for fluxes; it decides
    whether the fit epoch is taken at the maximum (mags) or the minimum
    (fluxes) of the fitted values, and is passed on to the plot.

    ignoreinitfail is accepted but not used by this function.

    Returns a dict with the fit values, the chisq and the reduced chisq.

    FIXME: check this equation below to see if it's right.

    reduced_chisq = fit_chisq/(len(pmags) - len(knots) - 1)

    '''

    # the weighted spline fit needs an error for every point
    if errs is None:
        errs = npfull_like(mags, 0.005)

    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # zero errors would give infinite weights
    keep = npnonzero(serrs)
    phase, pmags, perrs, ptimes, mintime = _get_phased_quantities(
        stimes[keep], smags[keep], serrs[keep], period
    )

    # number of internal knots, capped at maxknots
    nobs = len(phase)
    nknots = min(int(npfloor(knotfraction * nobs)), maxknots)
    splineknots = nplinspace(phase[0] + 0.01,
                             phase[-1] - 0.01,
                             num=nknots)

    spl = LSQUnivariateSpline(phase, pmags, t=splineknots, w=1.0 / perrs)

    # evaluate the spline at the observed phases
    fitmags = spl(phase)
    resid = fitmags - pmags
    fitchisq = npsum((resid * resid) / (perrs * perrs))
    fitredchisq = fitchisq / (len(pmags) - nknots - 1)

    if verbose:
        LOGINFO('spline fit done. nknots = %s, '
                'chisq = %.5f, reduced chisq = %.5f' %
                (nknots, fitchisq, fitredchisq))

    # the fit epoch is the time of light curve minimum: the largest fitted
    # magnitude (faintest), or the smallest fitted flux
    extremum = npmin(fitmags) if magsarefluxes else npmax(fitmags)
    magseriesepoch = ptimes[npwhere(fitmags == extremum)]

    fitinfo = {
        'nknots': nknots,
        'fitmags': fitmags,
        'fitepoch': magseriesepoch,
    }
    magseries = {
        'times': ptimes,
        'phase': phase,
        'mags': pmags,
        'errs': perrs,
        'magsarefluxes': magsarefluxes,
    }
    returndict = {
        'fittype': 'spline',
        'fitinfo': fitinfo,
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': magseries,
    }

    # only a string plotfit triggers plotting
    if plotfit and isinstance(plotfit, str):
        _make_fit_plot(phase, pmags, perrs, fitmags,
                       period, mintime, magseriesepoch,
                       plotfit,
                       magsarefluxes=magsarefluxes)
        returndict['fitplotfile'] = plotfit

    return returndict
def fourier_fit_magseries(times, mags, errs, period,
                          fourierorder=None,
                          fourierparams=None,
                          sigclip=3.0,
                          magsarefluxes=False,
                          plotfit=False,
                          ignoreinitfail=True,
                          verbose=True):
    '''Fit a Fourier series to a phased magnitude/flux time series.

    The order of the series is set by exactly one of two kwargs:

    - fourierorder: the order N; initial amplitudes [0.6, 0.2, ...] and
      phases [0.1, 0.1, ...] are generated for it.

    - fourierparams: a list of initial parameters for order N of the form

        [fourier_amp1, fourier_amp2, ..., fourier_ampN,
         fourier_phase1, fourier_phase2, ..., fourier_phaseN]

    If both or neither are given, a warning is logged and the default
    order of 3 is used. Lower orders help to avoid over-fitting light
    curves with few observations.

    The fit is done in two steps: a BFGS minimisation of the chi-sq,
    followed by a least-squares refinement. If ignoreinitfail is True, the
    refinement is attempted even when the first step reports failure.

    sigclip is passed to sigclip_magseries before fitting.

    magsarefluxes is False for magnitudes and True for fluxes; it decides
    whether the fit epoch is taken at the maximum (mags) or the minimum
    (fluxes) of the fitted values, and is passed on to the plot.

    A plot of the fit is written if plotfit is a string with a file name.

    Returns a dict with the final parameters, the chisq and the reduced
    chisq. If either step fails, the dict has None for 'finalparams',
    'leastsqfit', 'fitmags' and 'fitepoch', and nan for both chisq values.

    '''

    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # points with zero error cannot be used in the chi-sq
    keep = npnonzero(serrs)
    phase, pmags, perrs, ptimes, mintime = _get_phased_quantities(
        stimes[keep], smags[keep], serrs[keep], period
    )

    if fourierorder and fourierorder > 0 and not fourierparams:
        # only the scalar order was given: generate the initial guess
        fourierparams = ([0.6] + [0.2] * (fourierorder - 1) +
                         [0.1] + [0.1] * (fourierorder - 1))
    elif not fourierorder and fourierparams:
        # only the parameter vector was given: derive the order from it
        fourierorder = int(len(fourierparams) / 2)
    else:
        LOGWARNING('specified both/neither Fourier order AND Fourier coeffs, '
                   'using default Fourier order of 3')
        fourierorder = 3
        fourierparams = ([0.6] + [0.2] * (fourierorder - 1) +
                         [0.1] + [0.1] * (fourierorder - 1))

    if verbose:
        LOGINFO('fitting Fourier series of order %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (fourierorder, len(phase), period, mintime))

    # first step: minimise the chi-sq
    initialfit = spminimize(_fourier_chisq,
                            fourierparams,
                            method='BFGS',
                            args=(phase, pmags, perrs))

    magseries = {
        'times': ptimes,
        'phase': phase,
        'mags': pmags,
        'errs': perrs,
        'magsarefluxes': magsarefluxes,
    }

    def _failed_fit():
        # result returned when either fitting step does not succeed
        return {
            'fittype': 'fourier',
            'fitinfo': {
                'fourierorder': fourierorder,
                'finalparams': None,
                'initialfit': initialfit,
                'leastsqfit': None,
                'fitmags': None,
                'fitepoch': None,
            },
            'fitchisq': npnan,
            'fitredchisq': npnan,
            'fitplotfile': None,
            'magseries': magseries,
        }

    if not (initialfit.success or ignoreinitfail):
        LOGERROR('initial Fourier fit did not succeed, '
                 'reason: %s, returning scipy OptimizeResult'
                 % initialfit.message)
        return _failed_fit()

    if verbose:
        LOGINFO('initial fit done, refining...')

    # second step: least-squares refinement starting from the first result
    try:
        leastsqfit = spleastsq(_fourier_residual,
                               initialfit.x,
                               args=(phase, pmags))
    except Exception:
        leastsqfit = None

    # the last element of the leastsq result is its status flag; values
    # 1 to 4 are treated as success
    if not (leastsqfit and leastsqfit[-1] in (1, 2, 3, 4)):
        LOGERROR(
            'fourier-fit: least-squared fit to the light curve failed')
        return _failed_fit()

    finalparams = leastsqfit[0]

    fitmags = _fourier_func(finalparams, phase, pmags)
    resid = fitmags - pmags
    fitchisq = npsum((resid * resid) / (perrs * perrs))
    fitredchisq = fitchisq / (len(pmags) - len(finalparams) - 1)

    if verbose:
        LOGINFO('final fit done. chisq = %.5f, reduced chisq = %.5f' %
                (fitchisq, fitredchisq))

    # the fit epoch is the time of light curve minimum: the largest fitted
    # magnitude (faintest), or the smallest fitted flux
    extremum = npmin(fitmags) if magsarefluxes else npmax(fitmags)
    magseriesepoch = ptimes[npwhere(fitmags == extremum)]

    returndict = {
        'fittype': 'fourier',
        'fitinfo': {
            'fourierorder': fourierorder,
            'finalparams': finalparams,
            'initialfit': initialfit,
            'leastsqfit': leastsqfit,
            'fitmags': fitmags,
            'fitepoch': magseriesepoch,
        },
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': magseries,
    }

    # only a string plotfit triggers plotting
    if plotfit and isinstance(plotfit, str):
        _make_fit_plot(phase, pmags, perrs, fitmags,
                       period, mintime, magseriesepoch,
                       plotfit,
                       magsarefluxes=magsarefluxes)
        returndict['fitplotfile'] = plotfit

    return returndict
def GetProbTotChild(Hpm, Hpf, Hc, Gc, Readsc, error, recombination_rate, plog = True, Qscoresc = None, GenoConstraint=True):
    """Determine the set of distinct extensions of a child conditional on its extended parents, calculate their prior weights and
    report the read AND recombination support for each child extension. It is assumed that at least one of the parental genotypes is available at s.

    Arguments (as used in this function):
        Hpm, Hpf           -- extended maternal / paternal haplotypes; only their genotype at GetStop() is read here.
        Hc                 -- child haplotypes before extension.
        Gc                 -- child genotype at the position being extended.
        Readsc, Qscoresc   -- passed through to GetProbReads together with error and plog.
        error              -- error rate passed to GetProbReads; raised to min(0.999, error*1.1) if the genotype constraint had to be relaxed.
        recombination_rate -- used to weight each extension; raised to min(0.499, recombination_rate+0.005) if the genotype constraint had to be relaxed.
        GenoConstraint     -- if True, first try only the transmissions that reproduce the child's genotype.

    Returns (Uniques, weights, probs): the distinct extended child haplotypes, their weights normalised to sum to one, and
    the GetProbReads value of each. All three lists are empty if the extension is skipped.

    NOTE(review): the transmission tables (transmit_parent_m, transmit_parent_f, child_haploid, transmit_parent) are module-level
    globals that are filled on the first call only, from the ploidies seen in that call -- confirm that all calls share the same ploidies.
    """
    global transmit_parent_m
    global transmit_parent_f
    global child_haploid
    global transmit_parent
    probs = []   # the probability of SR(s) conditional on (Hp, H, eps)
    weights = [] # the prior probability of Hp conditional on (H, eps, recombination_rate)
    Uniques = [] # distinct Hp's
    # A child genotype consisting only of '.' cannot constrain anything: skip the extension
    if GenoConstraint and set(Gc.GetGenes())=={'.'}:
        garbage = sys.stderr.write('WARNING: Child\'s genotype is missing at position {1:d}! Its phasing extension will be skipped at s={0:d}!\n'.format(Gc.GetS()+1, Gc.GetPos()))
        return Uniques, weights, probs
    biparental = False
    current_Gm = Hpm.GetGenotype(Hpm.GetStop()) # current maternal allele (after extension)
    current_Gf = Hpf.GetGenotype(Hpf.GetStop()) # current paternal allele
    last_Gc = Hc.GetGenotype(Hc.GetStop()) # last child allele (before extension); not used below
    ploidy_m = len(current_Gm) # It is assumed that the first haploid part of the child homologues has maternal descent and the second haploid paternal
    ploidy_f = len(current_Gf)
    ploidy_c = (ploidy_m+ploidy_f)//2
    last_determined_SNPpos_m = Hpm.GetStop()-1 # the last SNP position with determined alleles in the maternal haplotypes, the default value; not used below
    last_determined_SNPpos_f = Hpf.GetStop()-1 # the last SNP position with determined alleles in the paternal haplotypes, the default value; not used below
    if set(Gc.GetGenes())=={'.'}: #'-' stands for complete genotype missing, '.' stands for partial missing or null allele
        Gc = Genotype(Gc.GetS(), Gc.GetPos(), *['-' for _x in range(0, ploidy_c)])
    # Lazily initialise the global transmission tables
    if not transmit_parent_m:
        transmit_parent_m = list(itertools.combinations(range(0, ploidy_m), ploidy_m//2)) # every choice of ploidy_m//2 maternal homologue indices
    if not transmit_parent_f:
        transmit_parent_f = list(itertools.combinations(range(0, ploidy_f), ploidy_f//2)) # every choice of ploidy_f//2 paternal homologue indices
    if not child_haploid:
        child_haploid = list(itertools.combinations(range(0, ploidy_c), ploidy_c//2)) # every choice of ploidy_c//2 child allele indices
    if not transmit_parent:
        transmit_parent = list(itertools.product(transmit_parent_m, transmit_parent_f)) # every (maternal, paternal) pair of choices
    transmit_parent_G = []
    attempt_num=1
    OrigG = GenoConstraint
    # At most two attempts: the first honours GenoConstraint, the second (only run if the first found nothing) drops it
    while not transmit_parent_G and attempt_num<=2:
        if attempt_num>1:
            GenoConstraint = False
        if '-' not in set(current_Gm).union(set(current_Gf)): # if no parental genotype is missing, consider all of the possible IBD potentiae
            biparental = True
            if GenoConstraint: # only allow transmissions that are compatible with the current genotype of the child
                transmit_parent_G = [_x for _x in transmit_parent if Counter([current_Gm[_y] for _y in _x[0]]+[current_Gf[_z] for _z in _x[1]])==Counter(Gc.GetGenes())]
            else:
                transmit_parent_G = [_x for _x in transmit_parent]
        elif attempt_num == 1: # Can only occur if GenoConstraint has been originally True
            if '-' in set(current_Gm): # impute maternal alleles from the child and the father
                transmit_parent_G = [(tuple('-' for _i in range(0, ploidy_m//2)), _x) for _x in transmit_parent_f if any(Counter([current_Gf[_z] for _z in _x])==Counter([Gc.GetGenes()[_hh] for _hh in _h]) for _h in child_haploid)]
            else: # impute paternal alleles from the mother and the child
                transmit_parent_G = [(_x, tuple('-' for _i in range(0, ploidy_f//2))) for _x in transmit_parent_m if any(Counter([current_Gm[_y] for _y in _x])==Counter([Gc.GetGenes()[_hh] for _hh in _h]) for _h in child_haploid)]
        else:
            transmit_parent_G = []
        attempt_num+=1
    GenoConstraint = OrigG
    # attempt_num > 2 means the second, unconstrained attempt was needed: be more permissive from here on
    if attempt_num>2:
        error = min(0.999, error*1.1)
        recombination_rate = min(0.499, recombination_rate+0.005)
    if not transmit_parent_G:
        garbage = sys.stderr.write("WARNING: No extension of child haplotypes was compatible with the given parental and/or child genotypes assuming Mendelian inheritance! Child extension will be skipped at position {1:d}, s={0:d}!".format(Gc.GetS()+1, Gc.GetPos()))
        return Uniques, weights, probs
    for _t in transmit_parent_G:
        if biparental:
            Genotypes_to_pass = [[current_Gm[_y] for _y in _t[0]]+[current_Gf[_z] for _z in _t[1]]] # Genotype to pass will be the same as Gc.GetGenes() if GenoConstraint is True
        elif '-' in set(current_Gm): # Try to impute the missing parent's alleles from the child alleles and the alleles of the other parent
            fake_genotype_m = [_code for _code in Gc.GetGenes()] # If it is not possible, skip the extension of child at that position
            try:
                # what is left of the child's alleles after removing the paternal ones is taken as the maternal contribution
                for _paternal_allele in (current_Gf[_z] for _z in _t[1]):
                    fake_genotype_m.remove(_paternal_allele)
                Genotypes_to_pass = [fake_genotype_m + [current_Gf[_z] for _z in _t[1]]] # Genotype to pass will be the same as Gc.GetGenes() if GenoConstraint is True. Only the order is changed so that the alleles with maternal descent come first.
            except ValueError as e:
                garbage = sys.stderr.write("WARNING: "+e.args[0]+"\n")
                # NOTE(review): for any other ValueError message Genotypes_to_pass keeps its value from the previous iteration -- confirm this is intended
                if 'list.remove(x): x not in list' in e.args[0]:
                    Genotypes_to_pass = []
        else:
            fake_genotype_f = [_code for _code in Gc.GetGenes()]
            try:
                # what is left of the child's alleles after removing the maternal ones is taken as the paternal contribution
                for _maternal_allele in (current_Gm[_y] for _y in _t[0]):
                    fake_genotype_f.remove(_maternal_allele)
                Genotypes_to_pass = [[current_Gm[_y] for _y in _t[0]] + fake_genotype_f] # Genotype to pass will be the same as Gc.GetGenes() if GenoConstraint is True
            except ValueError as e:
                garbage = sys.stderr.write("WARNING: "+e.args[0]+"\n")
                # NOTE(review): same caveat as in the maternal branch above
                if 'list.remove(x): x not in list' in e.args[0]:
                    Genotypes_to_pass = []
        for _Genotype_to_pass in Genotypes_to_pass: # Genotype_to_pass could have length >1 in case genoconstraint is False and a parental genotype is missing
            # A parent carries no recombination information if all of its alleles are equal or if its transmission was imputed ('-')
            No_recom_info_m = bool(len(set(current_Gm))==1 or '-' in _t[0])
            No_recom_info_f = bool(len(set(current_Gf))==1 or '-' in _t[1])
            P = Haplotypes(Gc.GetS(), Gc.GetS(), 0, loge(len(set(itertools.permutations(_Genotype_to_pass)))), None if No_recom_info_m else _t[0], None if No_recom_info_f else _t[1], *_Genotype_to_pass)
            Hpc = Hc + P
            _recombination_m = [] if (Hc.GetMO() is None or No_recom_info_m) else [0 if _origin == _new else 1 for _origin, _new in zip(Hc.GetMO(), _t[0])] # 0 if both the current allele and the preceding allele can be assigned to the same maternal descent, else 1. If the maternal descent is unknown, e.g. if the mother is homozygous, ignore maternal recombination.
            _recombination_f = [] if (Hc.GetPO() is None or No_recom_info_f) else [0 if _origin == _new else 1 for _origin, _new in zip(Hc.GetPO(), _t[1])] # 0 if both the current allele and the preceding allele can be assigned to the same paternal descent, else 1. If the paternal descent is unknown, e.g. if the father is homozygous, ignore paternal recombination.
            # Product of recombination_rate for every recombination and (1-recombination_rate) for every non-recombination; an empty list gives 1
            recombination_weight = array([recombination_rate if _r==1 else (1-recombination_rate) for _r in (_recombination_m+_recombination_f)]).prod()
            if Hpc not in Uniques:
                # first time this extension is seen: record its read probability and weight
                Uniques.append(Hpc)
                probs.append(GetProbReads(Readsc, Hpc, error, plog, Qscoresc))
                _npVset = []
                for _v in Hpc.GetVS():
                    _npv = array(_v)
                    _npVset.append(npdel(_npv, npwhere(_npv=='-')).tolist()) # drop the '-' entries
                try:
                    # weight computed from the last two remaining entries of each element; fewer than two raises IndexError
                    weights.append(exp(GetLogProbH(Haplotypes(1, 2, 1, log(len(set(itertools.permutations(tuple((_v[-2],_v[-1]) for _v in _npVset))))), None, None, *[(_v[-2],_v[-1]) for _v in _npVset]))))
                except IndexError:
                    weights.append(1)
                weights[-1]*=recombination_weight
            elif _recombination_m or _recombination_f:
                # extension already known, reached through another transmission: keep the larger weight
                _npVset = []
                for _v in Hpc.GetVS():
                    _npv = array(_v)
                    _npVset.append(npdel(_npv, npwhere(_npv=='-')).tolist())
                try:
                    new_weight = exp(GetLogProbH(Haplotypes(1, 2, 1, log(len(set(itertools.permutations(tuple((_v[-2],_v[-1]) for _v in _npVset))))), None, None, *[(_v[-2],_v[-1]) for _v in _npVset])))*recombination_weight
                except IndexError:
                    new_weight = recombination_weight
                _Hpindx = Uniques.index(Hpc)
                if weights[_Hpindx] < new_weight:
                    weights[_Hpindx] = new_weight
                    # the stored extension takes over the parental origins of the better-weighted transmission
                    if not No_recom_info_m:
                        Uniques[_Hpindx].SetMO(Hpc.GetMO())
                    if not No_recom_info_f:
                        Uniques[_Hpindx].SetPO(Hpc.GetPO())
            else:
                pass
    # normalise the weights so that they sum to one
    wsum = float(sum(weights))
    weights = [_w/wsum for _w in weights]
    return Uniques, weights, probs
def process_vep(vep, debug):
    '''
    Select a single consequence per variant from the full VEP dataframe.

    Order of preference:
        1) Select canonical ENST when possible
        2) Select canonical RefSeq when possible
        3) Select ENST with unassigned canonical
        4) Select Refseq with unassigned canonical
        5) Select ENST specifically not canonical
        6) Select Refseq specifically not canonical
        7) Select XM entries left over
        8) Select entries with uniprot assignments only from VEP
    Anything without a Uniprot assignment is filtered.

    In the event of a tie, take highest transcript ID if AA and pos
    are the same, otherwise take highest pos.

    Parameters
    ----------
    vep : pandas.DataFrame
        The raw VEP table. The code below reads the columns Transcript,
        Protein, Protein_position, Varcode, SWISSPROT and TREMBL from it
        (Uniprot, Canonical, nIsoforms and Isoform are expected to come
        from the merge with the `load_canonical` table -- TODO confirm).
    debug : bool
        If True, print progress messages prefixed with
        "DEBUG: IO: process_vep: "; also forwarded to `load_canonical`
        and `load_sec2prime`.

    Returns
    -------
    pandas.DataFrame
        The selected rows with exact duplicates dropped: unique variants
        in the form of the typical variant input file.

    Side effects
    ------------
    Prints a selection report to stdout, writes the selected rows to
    "outtmp.tab" in the current working directory, and calls `sys.exit`
    with the ParseException message if the merged table has no rows.

    Note:
    In rare cases, the same variant will hit multiple primary
    uniprot entries. In these cases, uniprot has identified sufficiently
    different isoforms to call them different proteins. Therefore,
    repeats across different primary uniprot entries may occur.

    A minor (major?) drawback to the current approach:
    If there is a variant which affects 2 different uniprot entries and
    in one entry it hits the canonical transcript, while in the other it
    hits a non-canonical isoform, since the variant is filtered out
    before getting to the non-canonical isoforms, both won't be captured.
    One example is 1:g.156842168C>T which hits INSRR (one transcript) and
    NTRK1 isoform 3. Only INSRR is kept since it has 1 transcript and
    isoform 3 is not canonical.

    Another drawback that might need to be addressed:
    As it stands, if it has multiple potentials in the same step, it
    selects one based on max protein position and if still multiple max
    transcript ID. The problem with this is that you can have 2 variants
    assigned different transcripts even though it would probably be
    easier and better if they both had the same one.
    For example, vars A and B don't hit canonical. Var A is position 10,
    Var B is position 100. There is isoform name ENST1 with 100 residues
    and ENST2 with 98 residues. The missing residues are at position
    11-12.
        Var A gets ENST2 since it's 10 in both but ENST2>ENST1
        Var B gets ENST1 since it's at 100 in ENST1 and 98 in ENST2
    It's probably preferable to put them both in ENST1.
    However, the current datasets (and uniprot_sprot_human.tab) don't
    have transcript length. So, the best solution (to assign a transcript
    to a uniprot at a time before selecting with var, and resolving
    overlaps by using longest transcript) can't be done without pulling
    in another dataset. I'm not sure how often this happens and if it
    warrants the extra time required for this better solution.

    One potential solution that would also solve another drawback is to
    read in the sequence sets to get the transcript lengths. This would
    allow cases (no idea how common or rare) where the variant hits a
    transcript in the fasta set but is assigned a different transcript
    that is not in the sequences set but is, for example, canonical
    according to uniprot. In this case, a transcript present in the
    sequence set could be preferred.
    '''
    debug_head = "DEBUG: IO: process_vep: "

    def dbg(message):
        # Single-argument print(...) is used throughout because it behaves
        # identically as a Python 2 statement and a Python 3 function call.
        if debug:
            print(debug_head + message)

    def top_consequence(rows):
        # Tie-break inside one group: highest protein position first, then
        # highest transcript ID; keep only the winning row.
        return rows.sort_values(["quickpos", "Transcript"],
                                ascending=False).head(1)

    def first_position(frame):
        # Integer value of the first run of digits in Protein_position,
        # used to find the max-position instance among repeats.
        return frame.Protein_position.astype(str).str.extract(
            r'(\d+)', expand=False).astype(int)

    dbg("processing raw df with {} rows".format(len(vep.index)))

    # canonical identifies the canonical transcript used by uniprot
    canonical = load_canonical(debug)
    # sec2prime allows filtering out any secondary uniprot ACs
    sec2prime = load_sec2prime(debug)

    # Attach the uniprot names
    dbg("merging uniprot names to transcripts")
    vep = vep.merge(canonical, how="left", on=["Transcript", "Protein"])
    if len(vep.index) == 0:
        sys.exit(ParseException(
            "VEP file", "Failed to extract variants from VEP file").fullmsg)

    # Filter out anything that doesn't have a uniprot name.
    # Keep those without a uniprot name for use in case they have one
    # assigned by VEP. The explicit copies make the later "quickpos"
    # column assignments operate on independent frames instead of on
    # slices of the merged table.
    dbg("extracting null uniprots into separate df")
    vep_nullunp = vep[vep.Uniprot.isnull()].copy()
    vep = vep[vep.Uniprot.notnull()]
    dbg("result withunp {} rows; nullunp df {} rows".format(
        len(vep.index), len(vep_nullunp.index)))

    # Filter any secondary uniprot AC's
    dbg("Removing secondary uniprots")
    secondary = [x[0] for x in sec2prime]
    vep = vep[~vep.Uniprot.isin(secondary)].copy()
    dbg("Result df {} rows".format(len(vep.index)))

    # Now select 1 consequence per variant
    # Note: There may be multiple consequences if they map to
    # different uniprots (different proteins, same gene)

    # Need to add a quickposition column that is the int of the first
    # position for finding max position instance for repeats
    dbg("Adding position column")
    vep["quickpos"] = first_position(vep)
    vep_nullunp["quickpos"] = first_position(vep_nullunp)

    print("selecting unique variants")
    # Loops through steps 1-6:
    #   Canonical = YES: ENST then NM then XM
    #   Canonical = Unassigned: ENST then NM then XM
    #   Canonical = NO: ENST then NM then XM

    # Typically, when a transcript is unassigned it means it is the only
    # one and therefore no isoform is designated. However, there is an edge
    # case where it is unassigned even though there are assignments.
    # So, need to deal with these after and remove them for now.
    dbg("Separating unassigned transcripts")
    hidden_unassigned = ((vep.Canonical == "Unassigned")
                         & (pd.to_numeric(vep.nIsoforms) > 0))
    vep_unassigned = vep[hidden_unassigned]
    vep = vep[~hidden_unassigned]
    dbg("Result: {} rows assigned; {} rows unassigned".format(
        len(vep.index), len(vep_unassigned.index)))

    vep_final = None
    for outer in ["YES", "Unassigned", "NO"]:
        for inner in ["ENST", "NM", "XM"]:
            if len(vep.index) == 0:
                break
            step_rows = vep[(vep.Canonical == outer)
                            & (vep.Transcript.str.startswith(inner))]
            if len(step_rows.index) == 0:
                continue
            print("{}: {}".format(outer, inner))
            selected = step_rows.groupby(
                ["Varcode", "Uniprot"]).apply(top_consequence)
            # append to final set and filter from initial
            if vep_final is None:
                vep_final = selected
            else:
                vep_final = pd.concat([vep_final, selected])
            vep = vep[~vep.Varcode.isin(vep_final.Varcode)]
            dbg("Selected {} rows; remaining {} rows".format(
                len(vep_final.index), len(vep.index)))

    # If steps 1-6 selected nothing (e.g. every variant lacks a canonical
    # uniprot), continue with an empty selection that has the same columns
    # so the remaining steps can still run instead of failing on None.
    if vep_final is None:
        vep_final = vep.iloc[:0].copy()

    # Deal with any of potential cases where an unassigned transcript is
    # only hit in a uniprot with multiple isoforms
    vep_unassigned = vep_unassigned[
        ~vep_unassigned.Varcode.isin(vep_final.Varcode)]
    dbg("processing remaining {} unassigned".format(
        len(vep_unassigned.index)))
    for inner in ["ENST", "NM", "XM"]:
        if len(vep_unassigned.index) == 0:
            break
        group = vep_unassigned[
            vep_unassigned.Transcript.str.startswith(inner)
        ].groupby(["Varcode", "Uniprot"])
        current_vars = group.apply(top_consequence)
        vep_final = pd.concat([vep_final, current_vars])
        vep_unassigned = vep_unassigned[
            ~vep_unassigned.Varcode.isin(vep_final.Varcode)]
        dbg("Selected {} rows; remaining unassigned {} rows".format(
            len(vep_final.index), len(vep_unassigned.index)))

    # Step 8: select any remaining variants that could not
    # be assigned a uniprot entry based on the read in uniprot data.
    # Only consider instances where VEP provided a uniprot entry.
    # In some cases, the VEP uniprot may not match the current uniprot
    # which is why only those in the sequence datasets provided to this
    # program were considered first and anything left over the uniprot is
    # taken from VEP
    vep_nullunp = vep_nullunp[~vep_nullunp.Varcode.isin(vep_final.Varcode)]
    dbg("Processing {} rows without uniprot".format(len(vep_nullunp.index)))
    for outer in ["SWISSPROT", "TREMBL"]:
        for inner in ["ENST", "NM", "XM"]:
            if len(vep_nullunp.index) == 0:
                break
            print("{}: {}".format(outer, inner))
            group = vep_nullunp[
                (vep_nullunp[outer] != "-")
                & (vep_nullunp.Transcript.str.startswith(inner))
            ].groupby(["Varcode", outer])
            current_vars = group.apply(top_consequence)
            vep_final = pd.concat([vep_final, current_vars])
            # Rows taken in this step have no Uniprot/Isoform of their own;
            # fill them from the VEP-provided accession column.
            vep_final["Uniprot"] = npwhere(vep_final.Uniprot.isnull(),
                                           vep_final[outer],
                                           vep_final.Uniprot)
            vep_final["Isoform"] = npwhere(vep_final.Isoform.isnull(),
                                           vep_final[outer],
                                           vep_final.Isoform)
            vep_nullunp = vep_nullunp[
                ~vep_nullunp.Varcode.isin(vep_final.Varcode)]
            dbg("Selected {} rows: remaining nounp {} rows".format(
                len(vep_final.index), len(vep_nullunp.index)))

    # Finally, replace all "Unassigned" isoforms with the uniprot entry
    vep_final["Isoform"] = npwhere(vep_final.Isoform == "Unassigned",
                                   vep_final.Uniprot,
                                   vep_final.Isoform)

    print("{} unique variants with uniprot assignment selected".format(
        vep_final.Varcode.nunique()))

    # report any variants that have multiple assignments
    if vep_final.Varcode.nunique() < vep_final.shape[0]:
        print("The following variant codes have multiple uniprot "
              "assignments and will be treated as individual variants "
              "for all pairs:")
        dups = vep_final[vep_final.duplicated(subset='Varcode', keep=False)]
        print(vep_final[vep_final.Varcode.isin(dups.Varcode)][
            ['Varcode', 'Uniprot']].to_string(index=False))

    # Whatever is left in any of the three pools was never selected.
    vep = pd.concat([vep, vep_nullunp, vep_unassigned])
    if len(vep.index) > 0:
        print("The following {} unique coding variants were unable to be "
              "assigned:".format(vep.Varcode.nunique()))
        print(",".join(set(vep.Varcode)))

    # NOTE(review): this writes to a fixed file name in the current working
    # directory on every call; looks like a debugging aid -- confirm whether
    # anything downstream reads it before removing.
    vep_final.to_csv("outtmp.tab", sep="\t", header=True, index=False)
    return vep_final.drop_duplicates()
def spline_fit_magseries(times, mags, errs, period,
                         knotfraction=0.01,
                         maxknots=30,
                         sigclip=30.0,
                         plotfit=False,
                         ignoreinitfail=False,
                         magsarefluxes=False,
                         verbose=True):
    '''This fits a univariate cubic spline to the phased light curve.

    This fit may be better than the Fourier fit for sharply variable objects,
    like EBs, so can be used to distinguish them from other types of variables.

    Parameters
    ----------

    times,mags,errs : np.array
        The input mag/flux time-series to fit a spline to. If `errs` is None,
        a constant error of 0.005 is assumed for every point, since the
        spline fit is weighted by 1/err.

    period : float
        The period to use for the spline fit.

    knotfraction : float
        The knot fraction is the number of internal knots to use for the
        spline. A value of 0.01 (or 1%) of the total number of non-nan
        observations appears to work quite well, without over-fitting. maxknots
        controls the maximum number of knots that will be allowed.

    maxknots : int
        The maximum number of knots that will be used even if `knotfraction`
        gives a value to use larger than `maxknots`. This helps dealing with
        over-fitting to short time-scale variations.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    magsarefluxes : bool
        If True, will treat the input values of `mags` as fluxes for purposes of
        plotting the fit and sig-clipping.

    plotfit : str or False
        If this is a string, this function will make a plot for the fit to the
        mag/flux time-series and writes the plot to the path specified here.

    ignoreinitfail : bool
        Not used by this function; accepted so the signature stays parallel
        with callers that pass it.

    verbose : bool
        If True, will indicate progress and warn of any problems.

    Returns
    -------

    dict
        This function returns a dict containing the model fit parameters, the
        minimized chi-sq value and the reduced chi-sq value. The form of this
        dict is mostly standardized across all functions in this module::

            {
                'fittype':'spline',
                'fitinfo':{
                    'nknots': the number of knots used for the fit
                    'fitmags': the model fit mags,
                    'fitepoch': the epoch of minimum light for the fit,
                },
                'fitchisq': the minimized value of the fit's chi-sq,
                'fitredchisq':the reduced chi-sq value,
                'fitplotfile': the output fit plot if fitplot is not None,
                'magseries':{
                    'times':input times in phase order of the model,
                    'phase':the phases of the model mags,
                    'mags':input mags/fluxes in the phase order of the model,
                    'errs':errs in the phase order of the model,
                    'magsarefluxes':input value of magsarefluxes kwarg
                }
            }

        All four 'magseries' arrays have the same length: points dropped to
        make the phases strictly increasing are dropped from each of them.

    '''

    # this is required to fit the spline correctly
    if errs is None:
        errs = npfull_like(mags, 0.005)

    # sigclip the magnitude time series
    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # get rid of zero errs (they would give infinite weights below)
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    # phase the mag series
    phase, pmags, perrs, ptimes, mintime = (
        get_phased_quantities(stimes, smags, serrs, period)
    )

    # now figure out the number of knots, capped at maxknots
    nobs = len(phase)
    nknots = min(int(npfloor(knotfraction*nobs)), maxknots)
    splineknots = nplinspace(phase[0] + 0.01,
                             phase[-1] - 0.01,
                             num=nknots)

    # NOTE: newer scipy needs x to be strictly increasing. this means we should
    # filter out anything that doesn't have np.diff(phase) > 0.0
    # FIXME: this needs to be tested
    phase_diffs_ind = npdiff(phase) > 0.0
    incphase_ind = npconcatenate((nparray([True]), phase_diffs_ind))
    # ptimes must be filtered with the same mask: the epoch lookup below
    # indexes ptimes with a position found in the filtered fit array, and
    # the returned 'times' has to stay aligned with 'phase'/'mags'/'errs'.
    phase, pmags, perrs, ptimes = (phase[incphase_ind],
                                   pmags[incphase_ind],
                                   perrs[incphase_ind],
                                   ptimes[incphase_ind])

    # generate and fit the spline
    spl = LSQUnivariateSpline(phase, pmags, t=splineknots, w=1.0/perrs)

    # calculate the spline fit to the actual phases, the chisq and red-chisq
    fitmags = spl(phase)

    fitchisq = npsum(
        ((fitmags - pmags)*(fitmags - pmags)) / (perrs*perrs)
    )

    fitredchisq = fitchisq/(len(pmags) - nknots - 1)

    if verbose:
        LOGINFO(
            'spline fit done. nknots = %s,  '
            'chisq = %.5f, reduced chisq = %.5f' %
            (nknots, fitchisq, fitredchisq)
        )

    # figure out the time of light curve minimum (i.e. the fit epoch)
    # this is when the fit mag is maximum (i.e. the faintest)
    # or if magsarefluxes = True, then this is when fit flux is minimum
    if not magsarefluxes:
        fitmagminind = npwhere(fitmags == npmax(fitmags))
    else:
        fitmagminind = npwhere(fitmags == npmin(fitmags))
    # on ties, use the first (lowest-phase) extremum
    if len(fitmagminind[0]) > 1:
        fitmagminind = (fitmagminind[0][0],)
    magseriesepoch = ptimes[fitmagminind]

    # assemble the returndict
    returndict = {
        'fittype':'spline',
        'fitinfo':{
            'nknots':nknots,
            'fitmags':fitmags,
            'fitepoch':magseriesepoch
        },
        'fitchisq':fitchisq,
        'fitredchisq':fitredchisq,
        'fitplotfile':None,
        'magseries':{
            'times':ptimes,
            'phase':phase,
            'mags':pmags,
            'errs':perrs,
            'magsarefluxes':magsarefluxes
        },
    }

    # make the fit plot if required
    if plotfit and isinstance(plotfit, str):

        make_fit_plot(phase, pmags, perrs, fitmags,
                      period, mintime, magseriesepoch,
                      plotfit,
                      magsarefluxes=magsarefluxes)

        returndict['fitplotfile'] = plotfit

    return returndict
def legendre_fit_magseries(times, mags, errs, period,
                           legendredeg=10,
                           sigclip=30.0,
                           plotfit=False,
                           magsarefluxes=False,
                           verbose=True):
    '''Least-squares fit of a Legendre series to a phased mag/flux series.

    The model has the form::

        p(x) = c_0*L_0(x) + c_1*L_1(x) + c_2*L_2(x) + ... + c_n*L_n(x)

    where the L_i are Legendre polynomials ("Legendre functions of the first
    kind") and the c_i are the fitted coefficients. The fitting itself is
    delegated to `numpy.polynomial.legendre.Legendre.fit`.

    Parameters
    ----------

    times,mags,errs : np.array
        The input mag/flux time-series to fit a Legendre series polynomial to.
        Points with zero error are discarded. The errors enter only the
        chi-sq calculation; the series fit is called without weights.

    period : float
        The period used to phase-fold the series before fitting.

    legendredeg : int
        The degree `n` in the equation above, e.g. `legendredeg=5` yields 6
        coefficients. Should be much smaller than the number of data points
        being fit.

    sigclip : float or int or sequence of two floats/ints or None
        A single float or int requests a symmetric sigma-clip with that
        sigma-multiplier.

        A list of two ints/floats requests an 'asymmetric' sigma-clip: the
        first element is the sigma to use for fainter flux/mag values, the
        second the sigma to use for brighter flux/mag values. For example,
        `sigclip=[10., 3.]` clips dimmings beyond 10-sigma and brightenings
        beyond 3-sigma. "Dimming" and "brightening" are meant *physically*
        (not in terms of the magnitude system), so `magsarefluxes` must be
        set correctly.

        If None, no sigma-clipping is performed and the time-series (with
        non-finite elems removed) is passed through.

    magsarefluxes : bool
        If True, treat `mags` as fluxes for sigma-clipping, for locating the
        epoch of minimum light, and for plotting.

    plotfit : str or False
        If a non-empty string, a plot of the fit is written to this path.

    verbose : bool
        If True, log progress messages.

    Returns
    -------

    dict
        The fit results, in the layout shared by the fit functions in this
        module::

            {
                'fittype':'legendre',
                'fitinfo':{
                    'legendredeg': the Legendre polynomial degree used,
                    'fitmags': the model fit mags,
                    'fitepoch': the epoch of minimum light for the fit,
                    'finalparams': the fitted series coefficients,
                },
                'fitchisq': the minimized value of the fit's chi-sq,
                'fitredchisq':the reduced chi-sq value,
                'fitplotfile': the output fit plot path, or None,
                'magseries':{
                    'times':input times in phase order of the model,
                    'phase':the phases of the model mags,
                    'mags':input mags/fluxes in the phase order of the model,
                    'errs':errs in the phase order of the model,
                    'magsarefluxes':input value of magsarefluxes kwarg
                }
            }

    '''

    # clip outliers first, then drop measurements that carry a zero error
    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)
    keep = npnonzero(serrs)
    stimes = stimes[keep]
    smags = smags[keep]
    serrs = serrs[keep]

    # fold on the requested period
    phase, pmags, perrs, ptimes, mintime = get_phased_quantities(
        stimes, smags, serrs, period
    )

    if verbose:
        LOGINFO('fitting Legendre series with '
                'maximum Legendre polynomial order %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (legendredeg, len(pmags), period, mintime))

    # The convenience class takes care of window and domain itself (see
    # "Using the Convenience Classes" in the numpy documentation): the
    # abscissae are mapped onto a minimal domain in [-1,1], where the
    # Legendre polynomials form a complete basis.
    series = Legendre.fit(phase, pmags, legendredeg)
    coeffs = series.coef
    fitmags = series(phase)

    # chi-sq against the measurement errors; the reduced value uses one
    # fitted coefficient per degree plus the constant term
    residuals = fitmags - pmags
    fitchisq = npsum((residuals*residuals) / (perrs*perrs))
    ncoeffs = legendredeg + 1
    fitredchisq = fitchisq/(len(pmags) - ncoeffs - 1)

    if verbose:
        LOGINFO(
            'Legendre fit done. chisq = %.5f, reduced chisq = %.5f' %
            (fitchisq, fitredchisq)
        )

    # The fit epoch is the time of minimum light: the largest model value
    # for magnitudes (faintest), the smallest model value for fluxes.
    faintest = npmin(fitmags) if magsarefluxes else npmax(fitmags)
    epoch_ind = npwhere(fitmags == faintest)
    if len(epoch_ind[0]) > 1:
        # several equal extrema: keep the first one only
        epoch_ind = (epoch_ind[0][0],)
    magseriesepoch = ptimes[epoch_ind]

    # put the result together piece by piece
    fitinfo = {
        'legendredeg':legendredeg,
        'fitmags':fitmags,
        'fitepoch':magseriesepoch,
        'finalparams':coeffs,
    }
    magseries = {
        'times':ptimes,
        'phase':phase,
        'mags':pmags,
        'errs':perrs,
        'magsarefluxes':magsarefluxes
    }
    returndict = {
        'fittype':'legendre',
        'fitinfo':fitinfo,
        'fitchisq':fitchisq,
        'fitredchisq':fitredchisq,
        'fitplotfile':None,
        'magseries':magseries
    }

    # only a non-empty string path triggers plotting
    if isinstance(plotfit, str) and plotfit:
        make_fit_plot(phase, pmags, perrs, fitmags,
                      period, mintime, magseriesepoch,
                      plotfit,
                      magsarefluxes=magsarefluxes)
        returndict['fitplotfile'] = plotfit

    return returndict
def savgol_fit_magseries(times, mags, errs, period,
                         windowlength=None,
                         polydeg=2,
                         sigclip=30.0,
                         plotfit=False,
                         magsarefluxes=False,
                         verbose=True):
    '''Fit a Savitzky-Golay filter to the magnitude/flux time series.

    SG fits successive sub-sets (windows) of adjacent data points with a
    low-order polynomial via least squares. At each point (magnitude), it
    returns the value of the polynomial at that magnitude's time. This is made
    significantly cheaper than *actually* performing least squares for each
    window through linear algebra tricks that are possible when specifying the
    window size and polynomial order beforehand. Numerical Recipes Ch 14.8
    gives an overview, Eq. 14.8.6 is what Scipy has implemented.

    The idea behind Savitzky-Golay is to preserve higher moments (>=2) of the
    input data series than would be done by a simple moving window average.

    Note that the filter assumes evenly spaced data, which magnitude time series
    are not. By *pretending* the data points are evenly spaced, we introduce an
    additional noise source in the function values. This is a relatively small
    noise source provided that the changes in the magnitude values across the
    full width of the N=windowlength point window is < sqrt(N/2) times the
    measurement noise on a single point.

    TODO:
    - Find correct dof for reduced chi squared in savgol_fit_magseries.
      Until then the reduced chi-sq is reported as the placeholder -99.0.

    Parameters
    ----------

    times,mags,errs : np.array
        The input mag/flux time-series to fit the Savitsky-Golay model to.

    period : float
        The period to use for the model fit.

    windowlength : None or int
        The length of the filter window (the number of coefficients). Must be
        either positive and odd, or None. (The window is the number of points
        to the left, and to the right, of whatever point is having a polynomial
        fit to it locally). Bigger windows at fixed polynomial order risk
        lowering the amplitude of sharp features. If this is not an int, this
        routine (arbitrarily) sets the `windowlength` for phased LCs to be
        either the number of phased data points divided by 300, or polydeg+3,
        whichever is bigger. An even value (supplied or derived) is increased
        by one to make it odd.

    polydeg : int
        This is the order of the polynomial used to fit the samples. Must be
        less than `windowlength`. "Higher-order filters do better at preserving
        feature heights and widths, but do less smoothing on broader features."
        (Numerical Recipes).

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed using
        the number provided as the sigma-multiplier to cut out from the input
        time-series.

        If a list of two ints/floats is provided, the function will perform an
        'asymmetric' sigma-clip. The first element in this list is the sigma
        value to use for fainter flux/mag values; the second element in this
        list is the sigma value to use for brighter flux/mag values. For
        example, `sigclip=[10., 3.]`, will sigclip out greater than 10-sigma
        dimmings and greater than 3-sigma brightenings. Here the meaning of
        "dimming" and "brightening" is set by *physics* (not the magnitude
        system), which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through to
        the output.

    magsarefluxes : bool
        If True, will treat the input values of `mags` as fluxes for purposes of
        plotting the fit and sig-clipping.

    plotfit : str or False
        If this is a string, this function will make a plot for the fit to the
        mag/flux time-series and writes the plot to the path specified here.

    verbose : bool
        If True, will indicate progress and warn of any problems.

    Returns
    -------

    dict
        This function returns a dict containing the model fit parameters and
        the chi-sq value. The form of this dict is mostly standardized across
        all functions in this module::

            {
                'fittype':'savgol',
                'fitinfo':{
                    'windowlength': the window length used for the fit,
                    'polydeg':the polynomial degree used for the fit,
                    'fitmags': the model fit mags,
                    'fitepoch': the epoch of minimum light for the fit,
                },
                'fitchisq': the minimized value of the fit's chi-sq,
                'fitredchisq': always -99.0 (placeholder, see TODO above),
                'fitplotfile': the output fit plot if fitplot is not None,
                'magseries':{
                    'times':input times in phase order of the model,
                    'phase':the phases of the model mags,
                    'mags':input mags/fluxes in the phase order of the model,
                    'errs':errs in the phase order of the model,
                    'magsarefluxes':input value of magsarefluxes kwarg
                }
            }

    '''
    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # get rid of zero errs
    nzind = npnonzero(serrs)
    stimes, smags, serrs = stimes[nzind], smags[nzind], serrs[nzind]

    phase, pmags, perrs, ptimes, mintime = (
        get_phased_quantities(stimes, smags, serrs, period)
    )

    if not isinstance(windowlength, int):
        windowlength = max(
            polydeg + 3,
            int(len(phase)/300)
        )
    # the filter needs an odd window
    if windowlength % 2 == 0:
        windowlength += 1

    if verbose:
        LOGINFO('applying Savitzky-Golay filter with '
                'window length %s and polynomial degree %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (windowlength, polydeg, len(pmags), period, mintime))

    # generate the function values obtained by applying the SG filter. The
    # "wrap" option is best for phase-folded LCs.
    sgf = savgol_filter(pmags, windowlength, polydeg, mode='wrap')

    # here the "fit" to the phases is the function produced by the
    # Savitzky-Golay filter. then compute the chisq.
    fitmags = sgf

    fitchisq = npsum(
        ((fitmags - pmags)*(fitmags - pmags)) / (perrs*perrs)
    )

    # TODO: quantify dof for SG filter. Until that is done the reduced
    # chi-sq is reported as a sentinel instead of a number computed from a
    # guessed parameter count.
    fitredchisq = -99.

    if verbose:
        LOGINFO(
            'SG filter applied. chisq = %.5f, reduced chisq = %.5f' %
            (fitchisq, fitredchisq)
        )

    # figure out the time of light curve minimum (i.e. the fit epoch)
    # this is when the fit mag is maximum (i.e. the faintest)
    # or if magsarefluxes = True, then this is when fit flux is minimum
    if not magsarefluxes:
        fitmagminind = npwhere(fitmags == npmax(fitmags))
    else:
        fitmagminind = npwhere(fitmags == npmin(fitmags))
    # on ties, use the first (lowest-phase) extremum
    if len(fitmagminind[0]) > 1:
        fitmagminind = (fitmagminind[0][0],)
    magseriesepoch = ptimes[fitmagminind]

    # assemble the returndict
    returndict = {
        'fittype':'savgol',
        'fitinfo':{
            'windowlength':windowlength,
            'polydeg':polydeg,
            'fitmags':fitmags,
            'fitepoch':magseriesepoch
        },
        'fitchisq':fitchisq,
        'fitredchisq':fitredchisq,
        'fitplotfile':None,
        'magseries':{
            'times':ptimes,
            'phase':phase,
            'mags':pmags,
            'errs':perrs,
            'magsarefluxes':magsarefluxes
        }
    }

    # make the fit plot if required
    if plotfit and isinstance(plotfit, str):

        make_fit_plot(phase, pmags, perrs, fitmags,
                      period, mintime, magseriesepoch,
                      plotfit,
                      magsarefluxes=magsarefluxes)

        returndict['fitplotfile'] = plotfit

    return returndict
def fourier_fit_magseries(
        times, mags, errs, period,
        fourierorder=None,
        fourierparams=None,
        fix_period=True,
        scale_errs_redchisq_unity=True,
        sigclip=3.0,
        magsarefluxes=False,
        plotfit=False,
        ignoreinitfail=True,
        verbose=True,
        curve_fit_kwargs=None,
):
    '''Fit a Fourier cosine series to a mag/flux time series.

    The series is sigma-clipped, stripped of zero-error points and
    phase-folded at `period`. A first optimization pass minimizes the chi-sq
    of the Fourier model on the phased series; its result seeds a bounded
    ``curve_fit`` on the (unphased) clipped series, with the period as the
    leading fit parameter.

    Parameters
    ----------

    times,mags,errs : np.array
        The input mag/flux time-series to fit a Fourier cosine series to.

    period : float
        The period to use for the Fourier fit.

    fourierorder : None or int
        If this is an int, it is used as the Fourier order of the series to
        fit. If this is None and `fourierparams` is given, the order is taken
        as half the length of `fourierparams`. If both or neither are given,
        a warning is logged and a series of order 3 with default initial
        coefficients is used.

    fourierparams : list of floats or None
        Initial coefficients, in the form::

            [fourier_amp1, fourier_amp2, fourier_amp3,...,fourier_ampN,
             fourier_phase1, fourier_phase2, fourier_phase3,...,fourier_phaseN]

        for a Fourier cosine series of order N. If this is None and
        `fourierorder` is given, default initial coefficients of that order
        are generated instead.

    fix_period : bool
        If True, the period is held fixed when fitting: its bounds are pinned
        to within 1.0e-7 of `period` and `period` is handed to the fit
        function as ``fixed_period``.

    scale_errs_redchisq_unity : bool
        If True, ``absolute_sigma=False`` is passed to
        ``scipy.optimize.curve_fit`` so the parameter standard errors are
        scaled to make the reduced chi-sq = 1.0.

    sigclip : float or int or sequence of two floats/ints or None
        If a single float or int, a symmetric sigma-clip will be performed
        using the number provided as the sigma-multiplier to cut out from the
        input time-series.

        If a list of two ints/floats is provided, the function will perform
        an 'asymmetric' sigma-clip. The first element in this list is the
        sigma value to use for fainter flux/mag values; the second element is
        the sigma value to use for brighter flux/mag values. For example,
        `sigclip=[10., 3.]` will sigclip out greater than 10-sigma dimmings
        and greater than 3-sigma brightenings. Here the meaning of "dimming"
        and "brightening" is set by *physics* (not the magnitude system),
        which is why the `magsarefluxes` kwarg must be correctly set.

        If `sigclip` is None, no sigma-clipping will be performed, and the
        time-series (with non-finite elems removed) will be passed through
        to the output.

    magsarefluxes : bool
        If True, will treat the input values of `mags` as fluxes for purposes
        of plotting the fit and sig-clipping.

    plotfit : str or False
        If this is a string, a plot of the fit is written to this path.

    ignoreinitfail : bool
        If True, a failed initial optimization is ignored and the
        least-squares refinement is attempted anyway.

    verbose : bool
        If True, will indicate progress.

    curve_fit_kwargs : dict or None
        If not None, extra kwargs to pass to ``scipy.optimize.curve_fit``.

    Returns
    -------

    dict
        A dict containing the model fit parameters, the minimized chi-sq
        value and the reduced chi-sq value::

            {
                'fittype':'fourier',
                'fitinfo':{
                    'fourierorder': the Fourier order used,
                    'finalparams': the final Fourier coeffs (period excluded),
                    'finalparamerrs': sqrt of the covariance diagonal,
                    'initialfit': the result of the initial optimization,
                    'fitmags': the model fit mags,
                    'fitperiod': the period returned by the fit,
                    'fitepoch': the fold epoch from the phasing step,
                    'actual_fitepoch': time of minimum light of the fit model
                },
                'fitchisq': the minimized value of the fit's chi-sq,
                'fitredchisq':the reduced chi-sq value,
                'fitplotfile': the output fit plot if plotfit is a str,
                'magseries':{
                    'times':input times in phase order of the model,
                    'phase':the phases of the model mags,
                    'mags':input mags/fluxes in the phase order of the model,
                    'errs':errs in the phase order of the model,
                    'magsarefluxes':input value of magsarefluxes kwarg
                }
            }

        If either fitting stage fails, the same layout is returned with the
        'fitinfo' results set to None and both chi-sq values set to NaN.

    '''

    def _default_coeffs(order):
        # default initial guess: amplitudes first, then phases
        amplitudes = [0.6] + [0.2] * (order - 1)
        phases = [0.1] + [0.1] * (order - 1)
        return amplitudes + phases

    def _failed_fit_dict():
        # standard "no result" return value shared by both failure paths; it
        # reports the phased series as it stands when the failure is detected
        return {
            'fittype': 'fourier',
            'fitinfo': {
                'fourierorder': fourierorder,
                'finalparams': None,
                'finalparamerrs': None,
                'initialfit': initialfit,
                'fitmags': None,
                'fitperiod': None,
                'fitepoch': None,
                'actual_fitepoch': None,
            },
            'fitchisq': npnan,
            'fitredchisq': npnan,
            'fitplotfile': None,
            'magseries': {
                'times': ptimes,
                'phase': phase,
                'mags': pmags,
                'errs': perrs,
                'magsarefluxes': magsarefluxes
            }
        }

    stimes, smags, serrs = sigclip_magseries(times, mags, errs,
                                             sigclip=sigclip,
                                             magsarefluxes=magsarefluxes)

    # drop points with zero errors before they are used as chi-sq weights
    keep = npnonzero(serrs)
    stimes = stimes[keep]
    smags = smags[keep]
    serrs = serrs[keep]

    phase, pmags, perrs, ptimes, mintime = get_phased_quantities(
        stimes, smags, serrs, period
    )

    # work out the Fourier order and the initial coefficients
    if fourierorder and fourierorder > 0 and not fourierparams:
        fourierparams = _default_coeffs(fourierorder)
    elif not fourierorder and fourierparams:
        fourierorder = len(fourierparams) // 2
    else:
        LOGWARNING('specified both/neither Fourier order AND Fourier coeffs, '
                   'using default Fourier order of 3')
        fourierorder = 3
        fourierparams = _default_coeffs(fourierorder)

    if verbose:
        LOGINFO('fitting Fourier series of order %s to '
                'mag series with %s observations, '
                'using period %.6f, folded at %.6f' %
                (fourierorder, len(phase), period, mintime))

    # first pass: minimize the chi-sq of the Fourier model on the phased LC
    initialfit = spminimize(_fourier_chisq,
                            fourierparams,
                            args=(phase, pmags, perrs))

    # bail out early if the first pass failed and we were told to care
    if not (initialfit.success or ignoreinitfail):
        LOGERROR('initial Fourier fit did not succeed, '
                 'reason: %s, returning scipy OptimizeResult' %
                 initialfit.message)
        return _failed_fit_dict()

    if verbose:
        LOGINFO('initial fit done, refining...')

    leastsqparams = initialfit.x

    try:

        # the period leads the parameter vector handed to curve_fit
        curvefit_params = npconcatenate((nparray([period]), leastsqparams))

        # only the period is bounded; the Fourier coefficients are free
        if fix_period:
            period_lower, period_upper = period - 1.0e-7, period + 1.0e-7
        else:
            period_lower, period_upper = 0.0, npinf

        curvefit_bounds = (
            [period_lower]
            + [-npinf] * fourierorder
            + [-npinf] * fourierorder,
            [period_upper]
            + [npinf] * fourierorder
            + [npinf] * fourierorder,
        )

        curvefit_func = partial(
            sinusoidal.fourier_curvefit_func,
            zerolevel=npmedian(smags),
            epoch=mintime,
            fixed_period=period if fix_period else None,
        )

        extra_kwargs = curve_fit_kwargs if curve_fit_kwargs is not None else {}

        finalparams, covmatrix = curve_fit(
            curvefit_func,
            stimes, smags,
            p0=curvefit_params,
            sigma=serrs,
            bounds=curvefit_bounds,
            absolute_sigma=(not scale_errs_redchisq_unity),
            **extra_kwargs
        )

    except Exception:
        LOGEXCEPTION("curve_fit returned an exception")
        finalparams, covmatrix = None, None

    # no usable least-squares result
    if finalparams is None or covmatrix is None:
        LOGERROR(
            'fourier-fit: least-squared fit to the light curve failed')
        return _failed_fit_dict()

    # re-fold the series at the period returned by the fit
    fperiod = finalparams[0]
    phase, pmags, perrs, ptimes, mintime = get_phased_quantities(
        stimes, smags, serrs, fperiod
    )

    # chi-sq of the final model against the re-folded series
    fitmags = _fourier_func(finalparams[1:], phase, pmags)
    residuals = fitmags - pmags
    fitchisq = npsum((residuals * residuals) / (perrs * perrs))

    # NOTE(review): this is the divisor for the reduced chi-sq; one more is
    # subtracted when the period is held fixed -- confirm this is the
    # intended degrees-of-freedom count.
    n_free_params = len(pmags) - len(finalparams) - (1 if fix_period else 0)
    fitredchisq = fitchisq / n_free_params

    stderrs = npsqrt(npdiag(covmatrix))

    if verbose:
        LOGINFO('final fit done. chisq = %.5f, reduced chisq = %.5f' %
                (fitchisq, fitredchisq))

    # the time of light curve minimum is where the model is faintest: the
    # maximum for mags, the minimum for fluxes. ties resolve to the first hit.
    faintest = npmin(fitmags) if magsarefluxes else npmax(fitmags)
    fitmagminind = npwhere(fitmags == faintest)
    if len(fitmagminind[0]) > 1:
        fitmagminind = (fitmagminind[0][0], )

    returndict = {
        'fittype': 'fourier',
        'fitinfo': {
            'fourierorder': fourierorder,
            # coeffs only (no period), for backwards compatibility with
            # existing users of this function's return value
            'finalparams': finalparams[1:],
            'finalparamerrs': stderrs,
            'initialfit': initialfit,
            'fitmags': fitmags,
            'fitperiod': finalparams[0],
            # the fold epoch from the phasing step
            'fitepoch': mintime,
            # the time of minimum light OF the fit model light curve
            'actual_fitepoch': ptimes[fitmagminind]
        },
        'fitchisq': fitchisq,
        'fitredchisq': fitredchisq,
        'fitplotfile': None,
        'magseries': {
            'times': ptimes,
            'phase': phase,
            'mags': pmags,
            'errs': perrs,
            'magsarefluxes': magsarefluxes
        },
    }

    # make the fit plot if a path was given
    if plotfit and isinstance(plotfit, str):
        make_fit_plot(phase, pmags, perrs, fitmags,
                      fperiod, mintime, mintime,
                      plotfit,
                      magsarefluxes=magsarefluxes)
        returndict['fitplotfile'] = plotfit

    return returndict
def add_pkg_to_local_db(new_row, local_db):
    """
    Adds a package entry to the local database table

    If no entry with the same ``"name"`` exists, the new row is appended
    unchanged. If one exists, the ``"date_modified"`` values (format
    ``%Y-%m-%d``) are compared:

    - new entry is as recent or more recent: the existing entry is renamed to
      ``<name>_<date_modified>`` and the new entry is added under its own name
    - new entry is older: the new entry is added under the name
      ``<name>_<date_modified>`` and the existing entry is left alone

    Parameters
    ----------
    new_row : ``astropy.table.Row``
        The package entry data

    local_db : str, ``astropy.Table``
        Either the path to the local DB file, which is read with
        ``get_local_packages``, or an already loaded table

    Returns
    -------
    new_local_table : ``astropy.Table``
        The stacked table including the new entry. Its ``meta`` is taken from
        the local table.

    Raises
    ------
    ValueError
        If ``local_db`` is neither a str nor a Table, or if ``new_row`` is not
        a Row

    Notes
    -----
    When an existing entry is superseded, it is removed from the local table
    in place, so a Table passed as ``local_db`` is modified.

    """

    def _single_row_table(values):
        # build a one-row Table from a {column name: value} mapping
        names = list(values)
        return Table(names=names, data=[[values[col]] for col in names])

    if isinstance(local_db, str):
        local_table = get_local_packages(local_db)
    elif isinstance(local_db, Table):
        local_table = local_db
    else:
        raise ValueError("local_db must be either Table or path to DB file")

    # isinstance rather than an exact type comparison, so Row subclasses
    # are accepted too
    if not isinstance(new_row, Row):
        raise ValueError("new_row must be an astropy.table.Row object")

    new_entry = {col: new_row[col] for col in new_row.colnames}

    if new_row["name"] in local_table["name"]:
        ii = npwhere(local_table["name"] == new_row["name"])[0][0]

        fmt = '%Y-%m-%d'
        local_date = dt.datetime.strptime(local_table[ii]["date_modified"],
                                          fmt)
        new_date = dt.datetime.strptime(new_row["date_modified"], fmt)

        if new_date >= local_date:
            # archive the existing entry under a date-stamped name and let
            # the new entry take over the plain name
            old_entry = {col: local_table[ii][col]
                         for col in local_table.colnames}
            old_entry["name"] = (old_entry["name"] + "_"
                                 + old_entry["date_modified"])
            tbl_to_add = vstack([_single_row_table(old_entry),
                                 _single_row_table(new_entry)])
            local_table.remove_row(ii)
        else:
            # the new entry is older: store it under a date-stamped name
            new_entry["name"] = (new_entry["name"] + "_"
                                 + new_entry["date_modified"])
            tbl_to_add = _single_row_table(new_entry)
    else:
        tbl_to_add = _single_row_table(new_entry)

    new_local_table = vstack([local_table, tbl_to_add])
    new_local_table.meta = local_table.meta

    return new_local_table
def GetProbTot_Founders(H,
                        G,
                        G_offspring,
                        ReadsLST,
                        ploidy_levels,
                        error_rate,
                        plog=False,
                        QscoresLST=None,
                        usecounts=False,
                        Impute_Incompatible=True,
                        Impute_Missing=True,
                        redose=False):
    """Determine the set of distinct extensions for a pair of founder base
    haplotypes H=(Hm, Hf) to position s using their genotypes at s G=(Gm, Gf),
    calculate their prior weights and report the read-probabilities
    conditional on each extension. In case usecounts is True, also report the
    number of observed reads compatible with each homologue, so that the
    upstream functions may set a threshold on the minimum number of reads
    compatible with each homologue.

    Outline of the visible logic:

    1. Copy the parental genotypes (G) and the per-offspring genotype lists
       (G_offspring) into `Imputation`.
    2. If `redose` is set, or `Impute_Missing` is set and some parental
       genotype or some offspring's last genotype is missing, call
       ImputeGenotype and rebuild `Imputation` from its result. If that call
       raises BlockException, a warning is written to stderr and placeholder
       '-' alleles (or the existing alleles) are used instead.
    3. Up to two attempts: enumerate every pair of parental permutations,
       keep the distinct extended haplotype pairs, compute their read
       probabilities and a prior weight from the offspring genotypes.
    4. If every weight is below 1e-60, either skip the extension
       (Impute_Incompatible False) or re-impute all genotypes once and retry.
    5. Normalize the weights to sum to one.

    Returns the tuple (Uniques, weights, probs, Counts, Returned_Imputation).
    Returned_Imputation stays None unless an imputation has taken place.
    When the extension is skipped, Uniques and weights are emptied but probs
    and Counts are returned as last computed.

    NOTE(review): parameter types are project classes not visible here;
    G is used as a two-element sequence of genotype objects and G_offspring
    as a sequence of genotype lists (one per offspring) -- confirm against
    the callers.
    """
    # declared global but only referenced from commented-out code below
    global genotype_err
    Returned_Imputation = None
    done, attempt = False, 1
    Imputation = [
        Genotype(_G.GetS(), _G.GetPos(), *_G.GetGenes()) for _G in G
    ]  # Parental genotypes are needed at s to extend H
    Imputation = Imputation + [
        [Genotype(_Gc.GetS(), _Gc.GetPos(), *_Gc.GetGenes()) for _Gc in _Glst]
        for _Glst in G_offspring
    ]  # Offspring genotypes are needed from SNP 1 to s to check compatibility with H. Nevertheless, only the SNPs at s-1 and s are currently used to check compatibility.
    Impute_Missing = Impute_Missing and (
        any([_Gp.isMISSING() for _Gp in G])
        or any([_Glst[-1].isMISSING() for _Glst in G_offspring])
    )  # if no genotype is missing, Impute_Missing can be set to False
    if redose or Impute_Missing:  # if redose, reassign all of the genotypes. Otherwise, if Impute_Missing is True and missing genotypes exist at s (for some of the parents or the offspring), try to impute them!
        if not redose:
            # the return value of write() is captured but never used
            garbage = sys.stderr.write(
                'WARNING: Missing offspring genotypes will be imputed at s={0:d}, position {1:d}!\n'
                .format(G[0].GetS() + 1, G[0].GetPos()))
        try:
            if not redose:
                # keep the currently known genotypes fixed while imputing
                _Imputations = ImputeGenotype(
                    G[0].GetS(),
                    ReadsLST,
                    ploidy_levels,
                    error_rate,
                    QscoresLST,
                    FixedGenos=Imputation[0:2] +
                    [_Glst[-1] for _Glst in Imputation[2:]])
            else:
                # redose: estimate every genotype anew, nothing is fixed
                _Imputations = ImputeGenotype(G[0].GetS(),
                                              ReadsLST,
                                              ploidy_levels,
                                              error_rate,
                                              QscoresLST,
                                              FixedGenos=None)
        except BlockException as e:
            sys.stderr.write(
                'WARNING: ' + ''.join(e.args) +
                " Failed to {2:s} at SNP {0:d}, position {1:d}!\n".format(
                    G[0].GetS() + 1, G[0].GetPos(),
                    "estimate dosages" if redose else
                    "impute missing genotypes"))
            if redose:
                # fall back to all-missing ('-') alleles for every individual
                _Imputations = [
                    ('-', ) * ploidy_levels[0], ('-', ) * ploidy_levels[1]
                ] + [
                    tuple('-' for _x in range(0,
                                              sum(ploidy_levels) // 2))
                    for _Glst in G_offspring
                ]
            else:
                # keep the parental alleles; offspring keep theirs unless
                # missing, in which case they get all-missing alleles
                _Imputations = [_G.GetGenes() for _G in G] + [
                    tuple('-' for _x in range(0,
                                              sum(ploidy_levels) // 2))
                    if _Glst[-1].isMISSING() else _Glst[-1].GetGenes()
                    for _Glst in G_offspring
                ]
        except:
            # any other exception is propagated unchanged
            raise
        # rebuild the working genotypes: parents from the first two imputed
        # allele tuples, each offspring keeps its earlier genotypes and gets
        # its last genotype replaced by the imputed alleles
        Imputation = [
            Genotype(G[0].GetS(), G[0].GetPos(), *_alleles)
            for _alleles in _Imputations[0:2]
        ] + [[
            Genotype(_Gc.GetS(), _Gc.GetPos(), *_Gc.GetGenes())
            for _Gc in _Glst[:-1]
        ] + [Genotype(_Glst[-1].GetS(), _Glst[-1].GetPos(), *_alleles)]
             for _Glst, _alleles in zip(G_offspring, _Imputations[2:])]
        Returned_Imputation = [
            Genotype(_Gp.GetS(), _Gp.GetPos(), *_alleles)
            for _Gp, _alleles in zip(G, _Imputations[0:2])
        ] + [
            Genotype(_Glst[-1].GetS(), _Glst[-1].GetPos(), *_alleles)
            for _Glst, _alleles in zip(G_offspring, _Imputations[2:])
        ]  # This will be passed to the caller function to show imputation has occurred
    while not done and attempt <= 2:
        perm = [
            makePermutation(_G) for _G in Imputation[0:2]
        ]  # (distinct) permutations of maternal and paternal genotypes
        probs = [
        ]  # the probability of Semi Reads at s conditional on (Hp, H, eps)
        weights = []  # the prior probability of Hp conditional on (H, eps)
        Uniques = []  # distinct Hp's
        Counts = []  # Number of reads compatible with each homologue
        for P1 in perm[
                0]:  # evaluate all of the possible extensions for the base haplotypes of each parent
            for P2 in perm[1]:
                Hp = (H[0] + P1, H[1] + P2)
                if Hp not in Uniques:
                    Uniques.append(Hp)
                    if usecounts:
                        _prob, _counts = GetProbReads_Founders(
                            ReadsLST, Hp, error_rate, plog, QscoresLST, True)
                        probs.append(_prob)
                        Counts.append(_counts)
                    else:
                        probs.append(
                            GetProbReads_Founders(ReadsLST, Hp, error_rate,
                                                  plog, QscoresLST))
                        Counts.append(None)
                    # computed but not used further in this function
                    ploidies = [len(Hp[0].GetVS()), len(Hp[1].GetVS())]
                    Candid_Offspring_Extensions_Hp = []
                    # each parent's gametes are generated from a two-SNP
                    # haplotype built from the last two alleles of every
                    # homologue of that parent
                    for _megagamete in Gametogenesis(
                            Haplotypes(
                                Hp[0].GetStop() - 1,
                                Hp[0].GetStop(), 0, 0, None, None, *[
                                    tuple(_h[len(_h) - 2:len(_h)])
                                    for _h in Hp[0].GetVS()
                                ])
                    ):  # obtain and store all of the possible offspring haplotypes from the parents assuming no recombination
                        for _microgamete in Gametogenesis(
                                Haplotypes(
                                    Hp[1].GetStop() - 1,
                                    Hp[1].GetStop(), 0, 0, None, None, *[
                                        tuple(_h[len(_h) - 2:len(_h)])
                                        for _h in Hp[1].GetVS()
                                    ])):
                            Candid_Offspring_Extensions_Hp.append(
                                Haplotypes(Hp[0].GetStop() - 1,
                                           Hp[0].GetStop(), 0, 0, None, None,
                                           *(_megagamete + _microgamete)))
                    _prior = 0  # the prior weight of a candidate founder extension
                    total_offspring_phasings_possible = len(
                        Candid_Offspring_Extensions_Hp)
                    # entries 2.. of Imputation are the offspring
                    for _id in range(2, len(Imputation)):
                        _number_of_compatible_phasings = 0
                        for _Hc in Candid_Offspring_Extensions_Hp:
                            if Check_Genotype_Compatibility(
                                    _Hc, Imputation[_id], 0):
                                _number_of_compatible_phasings += 1
                            else:
                                pass
                        # each offspring contributes the square of its count
                        # of compatible candidate phasings
                        _prior += (_number_of_compatible_phasings *
                                   _number_of_compatible_phasings)
                    # P(Hm,Hf|ReadLST)=P(ReadLST|Hm,Hf)P(Hm,Hf)=P(ReadsLST|Hm,Hf)P(Offspring Genotypes|Hm, Hf)
                    weights.append(
                        float(_prior) / (total_offspring_phasings_possible *
                                         total_offspring_phasings_possible))
                    for _Hp in Hp:
                        # strip the '-' entries from every homologue
                        _npVset = []
                        for _v in _Hp.GetVS():
                            _npv = array(_v)
                            _npVset.append(
                                npdel(_npv, npwhere(_npv == '-')).tolist())
                        try:
                            # scale the weight by 2 raised to GetLogProbH of a
                            # two-SNP haplotype made of the last two alleles
                            # of each homologue
                            # NOTE(review): presumably GetLogProbH returns a
                            # base-2 log -- confirm
                            weights[-1] *= (2**GetLogProbH(
                                Haplotypes(
                                    1, 2, 1,
                                    loge(
                                        len(
                                            set(
                                                itertools.permutations(
                                                    tuple(
                                                        (_v[-2], _v[-1])
                                                        for _v in _npVset))))),
                                    None, None, *tuple(
                                        (_v[-2], _v[-1]) for _v in _npVset))))
                        except IndexError:
                            # a homologue with fewer than two remaining
                            # alleles: leave the weight unscaled
                            pass
                else:
                    pass
        if all(
                _x < 1e-60 for _x in weights
        ):  # if no offspring extension derived from the parental extensions is compatible with the offspring genotypes, estimate the offspring genotype at s anew. This condition is NOT expected to occur with "redose" set to True. Check if all weights are zero taking numerical uncertainty into account.
            if not Impute_Incompatible:
                sys.stderr.write(
                    "WARNING: Parental genotypes were incompatible with the offspring genotypes! Extension will be skipped at SNP {0:d}, position {1:d}!\n"
                    .format(Imputation[0].GetS() + 1, Imputation[0].GetPos()))
                Uniques, weights = [], []
                done = True
            else:
                attempt += 1
                if attempt <= 2:
                    sys.stderr.write(
                        "WARNING: Parental genotypes were incompatible with the offspring genotypes! All of the genotypes will be imputed anew at SNP {0:d}, position {1:d}!\n"
                        .format(Imputation[0].GetS() + 1,
                                Imputation[0].GetPos()))
                    # flatten to one genotype per individual: both parents
                    # plus each offspring's last genotype
                    _Imputations = Imputation[0:2] + [
                        _Glst[-1] for _Glst in Imputation[2:]
                    ]
                    try:
                        _Imputations = [
                            Genotype(_G.GetS(), _G.GetPos(), *_alleles)
                            for _G, _alleles in zip(
                                _Imputations,
                                ImputeGenotype(G[0].GetS(), ReadsLST,
                                               ploidy_levels, error_rate,
                                               QscoresLST, None))
                        ]
                    except BlockException as e:
                        sys.stderr.write(
                            'WARNING: ' + ''.join(e.args) +
                            " Extension will be skipped at SNP {0:d}, position {1:d}!\n"
                            .format(Imputation[0].GetS() + 1,
                                    Imputation[0].GetPos()))
                        Uniques, weights = [], []
                        done = True
                    except:
                        # any other exception is propagated unchanged
                        raise
                    else:
                        # imputation succeeded: report it to the caller and
                        # retry the extension with the new genotypes
                        Returned_Imputation = [_G for _G in _Imputations]
                        Imputation = _Imputations[0:2] + [[
                            Genotype(_Gc.GetS(), _Gc.GetPos(),
                                     *_Gc.GetGenes()) for _Gc in _Glst[:-1]
                        ] + [_Impute] for _Glst, _Impute in zip(
                            G_offspring, _Imputations[2:])]
        else:
            done = True
    if not done:
        # the attempts ran out without a compatible extension
        sys.stderr.write(
            "WARNING: Parental genotypes were still incompatible with the offspring after imputation! Extension will be therefore skipped at SNP {0:d}, position {1:d}!\n"
            .format(Imputation[0].GetS() + 1, Imputation[0].GetPos()))
        Uniques, weights = [], []
    # normalize; with an empty weights list no division is performed
    _norm = float(sum(weights))
    weights = [_w / _norm for _w in weights]
    return Uniques, weights, probs, Counts, Returned_Imputation
def where(mask, yes, no):
    """Return ``from_array`` applied to ``npwhere(mask, yes, no)``.

    NOTE(review): ``npwhere`` is presumably ``numpy.where``, i.e. an
    elementwise choice of ``yes`` where ``mask`` holds and ``no`` elsewhere;
    confirm against the file's imports.
    """
    selected = npwhere(mask, yes, no)
    return from_array(selected)
def simple_flare_find(times, mags, errs,
                      smoothbinsize=97,
                      flareminsigma=4.0,
                      flaremaxcadencediff=1,
                      flaremincadencepoints=3,
                      magsarefluxes=False,
                      savgolpolyorder=2,
                      **savgolkwargs):
    '''This finds flares in time series using the method in Walkowicz+ 2011.

    Returns number of flares found, and their time indices.

    NOTE(review): the body visible here ends in a placeholder loop over the
    candidate points and has no explicit return statement; the line above
    describes the intended result.

    Args
    ----

    times, mags, errs are numpy arrays for the time series. If errs is None,
    0.1% errors (0.001*mags) are assumed.

    Kwargs
    ------

    smoothbinsize: the number of consecutive light curve points to smooth over
    in the time series using a Savitsky-Golay filter. The smoothed light curve
    is then subtracted from the actual light curve to remove trends that
    potentially last smoothbinsize light curve points. The default value is
    chosen as ~6.5 hours (97 x 4 minute cadence for HATNet/HATSouth).

    flareminsigma: the minimum sigma above the median light curve level to
    designate points as belonging to possible flares

    flaremaxcadencediff: the maximum number of light curve points apart each
    possible flare event measurement is allowed to be. If this is 1, then we'll
    look for consecutive measurements.

    flaremincadencepoints: the minimum number of light curve points (each
    flaremaxcadencediff points apart) required that are at least flareminsigma
    above the median light curve level to call an event a flare.

    magsarefluxes: if True, indicates that mags is actually an array of fluxes.

    savgolpolyorder: the polynomial order of the function used by the
    Savitsky-Golay filter.

    Any remaining keyword arguments are passed directly to the savgol_filter
    function from scipy.

    '''

    # if no errs are given, assume 0.1% errors
    if errs is None:
        errs = 0.001*mags

    # get rid of nans first
    finiteind = npisfinite(times) & npisfinite(mags) & npisfinite(errs)
    ftimes = times[finiteind]
    fmags = mags[finiteind]
    ferrs = errs[finiteind]

    # now get the smoothed mag series using the filter
    # kwargs are provided to the savgol_filter function
    smoothed = savgol_filter(fmags,
                             smoothbinsize,
                             savgolpolyorder,
                             **savgolkwargs)
    subtracted = fmags - smoothed

    # calculate some stats
    # the series_median is ~zero after subtraction
    series_mad = npmedian(npabs(subtracted))
    series_stdev = 1.483*series_mad

    # find extreme brightenings: positive deviations for fluxes, negative
    # deviations for mags. the threshold uses the flareminsigma kwarg.
    if magsarefluxes:
        extind = npwhere(subtracted > (flareminsigma*series_stdev))
    else:
        extind = npwhere(subtracted < (-flareminsigma*series_stdev))

    # see if there are any extrema. test the number of hits explicitly: the
    # truth value of an index array is ambiguous for more than one element
    # and is False for a single hit at index 0.
    extrema_indices = extind[0]

    if extrema_indices.size > 0:

        flaregroups = []

        # find the deviations within the requested flaremaxcadencediff
        for ind, extrema_index in enumerate(extrema_indices):
            stuff_to_do()