def logistic_extrap(country, field, until, start, cutoff, k_frac):
    """Extrapolate the logistic fit of a field until it reaches k_frac * k.

    The production series of ``field`` is read via ``get_production``, NaNs
    are dropped and the series is sliced from ``cutoff`` onwards.  The
    logistic parameters ``(r, k, p)`` are read from the cache file
    ``<DPATH>/<country>_logistic_production_<until[:4]>.pkl`` when it exists,
    otherwise they are fitted with ``fit_logistic``.

    Args:
        country: Country identifier passed to ``get_production``; also used
            in the cache file name.
        field: Column of the production data to extrapolate; also the key
            into the cached parameter mapping.
        until: String whose first four characters are used in the cache file
            name; also passed to ``get_production``.
        start: Value convertible by ``Timestamp``; passed as the origin to
            ``fit_logistic`` and ``compute_logistic``.
        cutoff: Lower bound of the slice applied to the production series.
        k_frac: Fraction of ``k`` at which the extrapolation stops.

    Returns:
        A 2-tuple.  If the last observed value already exceeds
        ``k_frac * k``, the observed ``(index, values)`` are returned
        unchanged.  Otherwise ``(dates, values)`` holding only the
        extrapolated points (those after the observed data) whose value is
        still below ``k_frac * k``.
    """
    prod = get_production(country, until)[field].dropna()[cutoff:]
    x, y = prod.index, prod.values
    nx = len(x)  # number of observed points; everything past it is extrapolated

    fname = os.path.join(DPATH, '%s_logistic_production_%s.pkl'
                         % (country, until[:4]))
    if os.path.exists(fname):
        # NOTE(review): `load` is presumably pickle's loader -- only safe as
        # long as the cache file is produced by this project; confirm.
        # The context manager closes the handle even if loading fails.
        with open(fname) as f:
            rkp_dict = load(f)
        r, k, p = rkp_dict[field]
    else:
        r, k, p = fit_logistic(x, y, Timestamp(start))

    threshold = k_frac * k
    if y[-1] > threshold:
        # The observed data is already past the target; nothing to extrapolate.
        return (x, y)

    # NOTE(review): this loop only terminates if the computed curve actually
    # reaches `threshold`; presumably that requires k_frac < 1 -- confirm
    # against compute_logistic.
    while y[-1] < threshold:
        x = list(x)
        last_date = x[-1]
        # 120 month-start periods beginning at last_date; drop the first one
        # because it is already the last element of x.
        future = date_range(last_date, periods=120, freq='MS')[1:]
        x.extend(future)
        y = compute_logistic(x, (r, k, p), Timestamp(start))

    # Keep only the extrapolated part that is still below the threshold.
    _y = y[nx:]
    _y = _y[_y < threshold]
    _x = x[nx:]
    _x = _x[:len(_y)]
    return (_x, _y)
def rate_of_discoveries(country, until='2013-02-01', fit_style='logistic',
                        confid=None, show_plot=False, style='urr'):
    """Fit a logistic curve to the number of fields started up to time t.

    For every category returned by ``classify_fields_according_to_urr`` the
    function counts, at each month start between ``START[country]`` and
    ``until``, how many fields of that category have their first non-missing
    production value on or before that date.  Each count series is then
    described by logistic parameters ``(r, k, p)``, read from the cache file
    ``<DPATH>/<country>_<until[:4]>_rate.pkl`` when it exists and fitted with
    ``fit_logistic`` otherwise.

    Args:
        country: Country identifier; used to look up ``START`` and the
            production data, and in the cache file name.
        until: Date-like string; end of the monthly date range, passed to
            ``get_production`` and the classifier, and its first four
            characters are used in the cache file name.
        fit_style: Currently unused; kept for interface compatibility.
        confid: Forwarded to ``fit_logistic``.
        show_plot: Forwarded to ``fit_logistic``.
        style: Forwarded to ``classify_fields_according_to_urr``.

    Returns:
        A list with one ``(r, k, p)`` tuple per category, in category order.
    """
    production = get_production(country, until)
    # First date with a non-missing production value, per field.
    start_of_prod = Series(dict(
        (field, production[field].dropna().index[0])
        for field in production.columns))

    categories = classify_fields_according_to_urr(country, until, style)
    activity = date_range(START[country], until, freq='MS')

    n_vs_ts = []
    for category in categories:
        starts = array(sorted(start_of_prod[category]))
        # Cumulative number of fields whose start date is <= each month.
        counts = [(starts <= date).sum() for date in activity]
        n_vs_ts.append((activity, counts))

    fname = os.path.join(DPATH, '%s_%s_rate.pkl' % (country, until[:4]))
    # Check for the cache exactly once so the loop below cannot see a
    # different answer than the one that decided whether it was loaded.
    use_cache = os.path.exists(fname)
    if use_cache:
        # NOTE(review): `load` is presumably pickle's loader -- only safe as
        # long as the cache file is produced by this project; confirm.
        with open(fname) as f:
            rkp_dict = load(f)

    fit_params = []
    for i, (x, y) in enumerate(n_vs_ts):
        if use_cache:
            r, k, p = rkp_dict[i]
        else:
            r, k, p = fit_logistic(x, y, START[country], confid, show_plot)
        fit_params.append((r, k, p))
    return fit_params