def find_instruments(self, y, X, exog_Z, candidates):
    s = self.map_column_to_sheet(y)
    df = s.df
    if np.isscalar(X):
        k = 1
    else:
        k = len(X)
    if np.isscalar(candidates):
        return QueryResult(None, "Multiple instrument candidates required.")
    n = len(candidates)
    utterance = (
        "This is the most inclusive group of jointly valid instruments among those"
    )
    utterance = (
        utterance
        + " suggested in a regression of "
        + str(y)
        + " on "
        + str(X)
        + " with "
        + str(exog_Z)
    )
    utterance = utterance + " as a known exogenous instrument:\n\n"

    # base case: test all the candidates jointly
    Z_with_exog = np.append(candidates, exog_Z)
    j_pval = self.homoskedasticJStatistic(y, X, Z_with_exog).get_denotation().pval
    if j_pval > 0.05:
        return QueryResult(candidates, utterance + str(candidates))

    # otherwise, drop one candidate at a time (then two, and so on) and retest,
    # stopping as soon as some subset passes the overidentification test
    turn = 1
    a = np.arange(n)
    while turn <= n - k:
        combs = combinations(a, n - turn)
        Z = np.empty(n - turn, dtype=object)
        for group in list(combs):
            for i in range(len(group)):
                Z[i] = candidates[group[i]]
            Z_with_exog = np.append(Z, exog_Z)
            j_pval = (
                self.homoskedasticJStatistic(y, X, Z_with_exog)
                .get_denotation()
                .pval
            )
            if j_pval > 0.05:
                return QueryResult(Z, utterance + str(Z))
        turn += 1
    return QueryResult(None, "No valid groups of instruments found.")
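# Usage sketch (illustrative only; the instance and column names below are
# hypothetical, not from this module): assuming `qa` is an instance of this class
# with a wage dataset loaded, the search returns the largest subset of candidate
# instruments that jointly passes the overidentification test at the 5% level.
#
#   result = qa.find_instruments("wage", "educ", "fatheduc", ["motheduc", "nearc2", "nearc4"])
#   result.get_denotation()  # the surviving instrument subset, or None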
def granger_p_value(self, results, dep, ind):
    r = results.test_causality(dep, ind, kind="f")
    utterance = (
        "The p-value of the Granger causality test is " + str(r.p_value) + ".\n"
    )
    return QueryResult(r.p_value, utterance)
def largestCorr(self, col):
    s = self.map_column_to_sheet(col)
    m = s.corr_matrix
    col_corrs = m[[col]]

    # track the strongest correlation, excluding the column's perfect correlation with itself
    champ = -1
    max_corr = -2
    for i in range(0, len(col_corrs)):
        c = abs(col_corrs.iloc[i, 0])
        if c > max_corr and c < 1:
            max_corr = c
            champ = i
    if champ == -1:
        utterance = "ERROR: none found"
        return QueryResult(None, utterance)

    best_var = s.findVariableName(m, champ)
    utterance = (
        "The variable most correlated with " + str(col) + " is " + str(best_var) + "."
    )
    return QueryResult(best_var, utterance)
def homoskedasticJStatistic(
    self, y, X, Z, exog_regressors=-1, clean_data="greedy", covType="unadjusted"
):
    s = self.map_column_to_sheet(y)
    df = s.df
    arg_y = y
    arg_X = X
    arg_Z = Z

    # check overidentification: the J test needs more instruments than endogenous regressors
    if type(X) is str:
        num_endogenous_regressors = 1
    else:
        num_endogenous_regressors = len(X)
    if type(Z) is str:
        num_instruments = 1
    else:
        num_instruments = len(Z)
    if num_instruments <= num_endogenous_regressors:
        utterance = (
            "Underidentification Error: We need more instruments than "
            "endogenous regressors for this test."
        )
        return QueryResult(None, utterance)

    # prepare data
    v = np.copy(X)
    v = np.append(v, y)
    v = np.append(v, Z)
    if exog_regressors != -1:
        v = np.append(v, exog_regressors)
    dfClean = s.cleanData(v, clean_data)
    X = dfClean[X]
    length = len(X)
    Z = dfClean[Z]
    y = dfClean[y]
    if exog_regressors != -1:
        exog_regressors = sm.add_constant(dfClean[exog_regressors])
    else:
        exog_regressors = np.full((length, 1), 1)

    mod = IVGMM(y, exog_regressors, X, Z)
    res = mod.fit()
    j_stat = res.j_stat
    utterance = (
        "\nThe homoskedastic Wald j-statistic test output in a regression of "
        + str(arg_y)
        + " on endogenous covariates "
        + str(arg_X)
    )
    utterance = (
        utterance + ", using " + str(arg_Z) + " as instruments is the following:\n\n"
    )
    utterance = utterance + str(j_stat) + "\n\n"
    return QueryResult(j_stat, utterance)
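# Usage sketch (hypothetical instance and column names): the J statistic is defined
# only for overidentified models, so a single endogenous regressor needs at least
# two instruments here.
#
#   j = qa.homoskedasticJStatistic("wage", "educ", ["motheduc", "nearc4"])
#   j.get_denotation().pval  # p > 0.05 fails to reject joint instrument validity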
def execute_func(self, func_name, args, denotation_only=True):
    try:
        result = self.get_names_to_lambdas()[func_name](*args)
    except Exception:
        utter = (
            "Oops, I had an error in my maths. Are you sure all your arguments "
            "are amenable to the type of function you're trying to run?"
        )
        result = QueryResult(utter, utter)
    if denotation_only:
        return result.get_denotation()
    return result
def help(self, functions=None):
    if functions is None or functions == "generic":
        utterance = (
            "For help about a specific topic, just ask about it. "
            "(e.g. 'How do I run a regression?')"
        )
        return QueryResult(utterance, utterance)
    q = _create_helpstring(functions)
    return QueryResult(q, q)
def largestCorrList(self, col, num_return=3):
    s = self.map_column_to_sheet(col)
    df = s.df
    m = s.corr_matrix
    v = abs(m[[col]])
    n = len(v)
    vAbs = np.zeros(n)
    for i in range(0, n):
        vAbs[i] = v.iloc[i, 0]

    # we want the (num_return + 1)'th largest because we don't want the column to count itself
    p = _kthLargest(vAbs, num_return + 1)
    returnVector = []
    for i in range(0, n):
        c = vAbs[i]
        if c >= p and c < 1:
            returnVector.append(s.findVariableName(m, i))

    utterance = (
        "The "
        + str(num_return)
        + " variables most correlated with "
        + str(col)
        + " are "
    )
    for i in range(0, len(returnVector) - 1):
        utterance = utterance + returnVector[i] + ", "
    utterance = utterance + "and " + returnVector[len(returnVector) - 1] + "."
    return QueryResult(returnVector, utterance)
def multiReg(self, y, X, clean_data="greedy"):
    s = self.map_column_to_sheet(y)
    y_arg = y
    X_arg = X

    # prepare data
    v = np.copy(X)
    v = np.append(v, y)
    dfClean = s.cleanData(v, clean_data)
    X = dfClean[X]
    y = dfClean[y]
    X = sm.add_constant(X)

    results = sm.OLS(y, X).fit()
    utterance = (
        "Here are the results of a multivariate linear regression of "
        + str(y_arg)
        + " on "
        + str(X_arg)
        + ".\n\n"
    )
    utterance = utterance + str(results.summary())
    return QueryResult(results.summary(), utterance)
def poisson_regression(self, endog, exog, clean_data="greedy"):
    s = self.map_column_to_sheet(endog)
    arg_endog = endog
    arg_exog = exog

    # prepare data
    v = np.copy(exog)
    v = np.append(v, endog)
    dfClean = s.cleanData(v, clean_data)
    exog = sm.add_constant(dfClean[exog])
    endog = dfClean[endog]

    poisson = Poisson(endog, exog)
    fit = poisson.fit()
    utterance = (
        "Here are the results of a Poisson regression with endogenous variable "
    )
    utterance = (
        utterance + str(arg_endog) + " and exogenous variables " + str(arg_exog) + ".\n"
    )
    utterance = utterance + str(fit.summary())
    return QueryResult(fit.summary(), utterance)
def markov_switching_regime_regression(
    self, endog, k, exog_vars=-1, clean_data="greedy"
):
    s = self.map_column_to_sheet(endog)
    arg_endog = endog

    # prepare data
    v = np.copy(endog)
    if exog_vars != -1:
        v = np.append(v, exog_vars)
        dfClean = s.cleanData(v, clean_data)
        endog = dfClean[endog]
        exog_vars = dfClean[exog_vars]
    else:
        endog = s.df[endog]
        exog_vars = None

    mr = MarkovRegression(endog, k, exog=exog_vars)
    fit = mr.fit()
    utterance = (
        "Here are the results of a dynamic Markov regression with endogenous variable "
        + str(arg_endog)
    )
    utterance = utterance + " and " + str(k) + " regimes.\n"
    utterance = utterance + str(fit.summary())
    return QueryResult(fit.summary(), utterance)
def reg(self, y, x, clean_data="greedy"):
    s = self.map_column_to_sheet(y)
    y_arg = y
    x_arg = x

    # prepare data
    v = np.array(x)
    v = np.append(v, y)
    dfClean = s.cleanData(v, clean_data)
    X = dfClean[x]
    y = dfClean[y]
    X = sm.add_constant(X)

    results = sm.OLS(y, X).fit()
    utterance = (
        "Here are the results of a linear regression of "
        + str(y_arg)
        + " on "
        + str(x_arg)
        + ".\n\n"
    )
    utterance = utterance + str(results.summary())
    return QueryResult(results.summary(), utterance)
def find_optimal_lag_length(
    self, cols, time, min_lag=1, max_lag=8, criterion="aic"
):
    # a single column means a univariate AR model; a list of columns means a VAR model
    try:
        s = self.map_column_to_sheet(cols)
        multi = False
    except Exception:
        s = self.map_column_to_sheet(cols[0])
        multi = True
    df = s.df

    if multi:
        try:
            args_vector = np.append(cols, time)
            data = df[args_vector]
            data = data.set_index(time)
        except Exception:
            data = df[cols]
        model = VAR(data)
    else:
        try:
            args_vector = np.array([cols, time])
            data = df[args_vector]
            data = data.set_index(time)
        except Exception:
            data = df[cols]
        model = s_ar.AR(data)

    # fit the model at each candidate lag length and record the information criterion
    info_loss = np.zeros(max_lag - min_lag + 1)
    if criterion == "aic":
        for i in range(max_lag - min_lag + 1):
            fit = model.fit(i + min_lag)
            info_loss[i] = fit.aic
    elif criterion == "bic":
        for i in range(max_lag - min_lag + 1):
            fit = model.fit(i + min_lag)
            info_loss[i] = fit.bic
    else:
        utterance = "ERROR: Criterion argument not supported."
        return QueryResult(None, utterance)

    x = np.argsort(info_loss)
    optimal = x[0] + min_lag
    utterance = (
        "The optimal lag length according to the " + str(criterion) + " criterion is "
    )
    utterance = utterance + str(optimal) + "."
    return QueryResult(optimal, utterance)
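# Usage sketch (hypothetical instance and column names): select the AR lag length
# for a single series by BIC, scanning lags 1 through 12.
#
#   best = qa.find_optimal_lag_length("gdp_growth", "quarter", max_lag=12, criterion="bic")
#   best.get_denotation()  # the optimal lag length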
def showCol(self, col):
    colvals = np.array(list(self.get_column(col)))
    if len(colvals) > 10:
        # show the first five and last five values
        colvals = (
            f"[{colvals[0]}, {colvals[1]}, {colvals[2]}, {colvals[3]}, {colvals[4]}, "
            f"..., {colvals[-5]}, {colvals[-4]}, {colvals[-3]}, {colvals[-2]}, {colvals[-1]}]"
        )
    utterance = (
        f"These are some of the values in column {col.upper()}: \n{str(colvals)} "
    )
    return QueryResult(utterance, utterance)
def kmeans(self, k=5):
    numeric_cols = self.get_numeric_columns()
    kmeans = KMeans(n_clusters=k, random_state=0).fit(np.array(numeric_cols[0]))
    utterance = (
        f"K-Means successfully converged after {kmeans.n_iter_} iterations. "
        f"Only numeric columns were used."
    )
    utterance += f"\nHere are the {k} cluster centers.\n"
    for clust in kmeans.cluster_centers_:
        utterance += f"\t\t{[round(val, 2) for val in clust]}\n"
    return QueryResult(utterance, utterance)
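# Usage sketch (hypothetical instance name): cluster all numeric columns into four
# groups; the centers are reported in the column order of get_numeric_columns().
#
#   qa.kmeans(k=4)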
def fixedEffects(
    self,
    y,
    x,
    id,
    year,
    entity_Effects=False,
    time_Effects=False,
    cov_Type="clustered",
    cluster_Entity=True,
    clean_data="greedy",
):
    if type(x) != str:
        utterance = (
            "ERROR: Multiple independent regressor approach not yet implemented."
        )
        return QueryResult(None, utterance)
    s = self.map_column_to_sheet(y)

    # prepare data
    v = np.copy(x)
    v = np.append(v, y)
    df = s.cleanData(v, clean_data)

    # set up the panel and fit once
    df = df.set_index([id, year])
    mod = PanelOLS(
        df[y], df[x], entity_effects=entity_Effects, time_effects=time_Effects
    )
    fit = mod.fit(cov_type=cov_Type, cluster_entity=cluster_Entity)
    utterance = (
        "Here are the results of a fixed effects regression of "
        + str(y)
        + " on "
        + str(x)
    )
    utterance = (
        utterance
        + ", using "
        + str(year)
        + " as the time dimension and "
        + str(id)
        + " as the id dimension.\n\n"
    )
    utterance = utterance + str(fit)
    return QueryResult(fit, utterance)
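# Usage sketch (hypothetical instance and column names): a panel regression with
# entity fixed effects and entity-clustered standard errors, indexed by firm and year.
#
#   fe = qa.fixedEffects("profit", "rnd_spend", "firm_id", "year", entity_Effects=True)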
def test_weak_instruments(
    self, x, Z, clean_the_data="greedy", covType="unadjusted"
):
    if type(x) != str:
        utterance = (
            "Multiple endogenous regressors not yet implemented for this test."
        )
        return QueryResult(None, utterance)
    s = self.map_column_to_sheet(x)
    x_arg = x
    Z_arg = Z

    # prepare data; use OLS because we just need first-stage results
    v = np.copy(x)
    v = np.append(v, Z)
    dfClean = s.cleanData(v, clean_the_data)
    x = dfClean[x]
    Z = dfClean[Z]

    results = sm.OLS(x, Z).fit()

    # rule of thumb: we want F > 10
    f = results.fvalue
    utterance = (
        "The F-value in a regression with endogenous covariate "
        + str(x_arg)
        + " and instruments "
        + str(Z_arg)
    )
    utterance = (
        utterance
        + " is "
        + str(f)
        + ".\nTypically, we want F > 10 to have reliably strong estimates in the first stage."
    )
    return QueryResult(f, utterance)
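# Usage sketch (hypothetical instance and column names): a first-stage strength
# check before running IV; by the usual rule of thumb, F < 10 signals weak instruments.
#
#   f = qa.test_weak_instruments("educ", ["motheduc", "nearc4"])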
def findCorr(self, yColName, xColName):
    s = self.map_column_to_sheet(yColName)
    corr = s.corr_matrix[yColName][s.findColumnIndexGivenName(xColName)]
    utterance = (
        "The correlation between "
        + str(yColName)
        + " and "
        + str(xColName)
        + " is "
        + str(corr)
        + "."
    )
    return QueryResult(corr, utterance)
def graph(self, col_names):
    if not isinstance(col_names, list):
        col_names = [col_names]
    col_data = [self.get_column(col) for col in col_names]
    for idx, col in enumerate(col_data):
        plt.plot(np.arange(len(col)), col, label=col_names[idx].upper())
    plt.xlabel("x - axis")
    plt.ylabel("y - axis")
    plt.legend()
    plt.show()
    return QueryResult("I made a graph for you!", "I made a graph for you!")
def print_PCA_wrapper(self, pca):
    utterance = "factors:\n" + str(pca.factors) + "\n"
    utterance += "coefficients:\n" + str(pca.coeff) + "\n"
    utterance += "eigenvalues:\n" + str(pca.eigenvals) + "\n"
    utterance += "eigenvectors (ordered):\n" + str(pca.eigenvecs) + "\n"
    utterance += "transformed data:\n" + str(pca.transformed_data) + "\n"
    return QueryResult(pca.coeff, utterance)
def print_a_bunch_of_AR_shit(self, results):
    utterance = "Here are the results of the univariate time series.\n\n"
    utterance = utterance + "Model Parameters:\n" + str(results.params) + "\n\n\n"
    utterance = (
        utterance
        + "Parameter Confidence Intervals:\n"
        + str(results.conf_int())
        + "\n\n\n"
    )
    utterance = (
        utterance
        + "Normalized Covariance Matrix Across Parameters:\n"
        + str(results.normalized_cov_params)
        + "\n\n\n"
    )
    # the denotation trains on the parameters only
    return QueryResult(results.params, utterance)
def AR_with_moving_average(self, var, p, ma, the_dates, clean_data="greedy"):
    s = self.map_column_to_sheet(var)

    # prepare data
    dfClean = s.cleanData(var, clean_data)
    time_series = dfClean[var]

    # order=(p, ma): AR order p with a moving-average component of order ma
    arma = ARMA(time_series, order=(p, ma), dates=the_dates)
    fit = arma.fit()
    utterance = (
        "Here are the results of an ARMA regression of "
        + str(var)
        + " in "
        + str(the_dates)
        + ".\n"
    )
    return QueryResult(fit.summary(), utterance + str(fit.summary()))
def classify(self, col_y, crossval=True):
    # first, ensure that col_y is treated as a categorical variable
    y = self.get_column(col_y)
    y = y.copy()
    y = y.astype("category")

    # get the training data
    s = self.map_column_to_sheet(col_y)
    X = s.get_numeric_columns()[0]

    # set up our SVM
    svc = SVC(kernel="linear")
    svc_cv_results = cross_validate(svc, X, y, cv=5)
    svc_acc = round(svc_cv_results["test_score"].mean() * 100, 2)

    # set up Naive Bayes
    gnb = GaussianNB()
    gnb_cv_results = cross_validate(gnb, X, y, cv=5)
    gnb_acc = round(gnb_cv_results["test_score"].mean() * 100, 2)

    # set up Random Forest
    rdf = RandomForestClassifier(random_state=0)
    rdf_cv_results = cross_validate(rdf, X, y, cv=5)
    rdf_acc = round(rdf_cv_results["test_score"].mean() * 100, 2)

    def _name_best_method():
        if svc_acc > gnb_acc:
            if svc_acc > rdf_acc:
                return "Support Vector Machines"
            return "Random Forest"
        if gnb_acc > rdf_acc:
            return "Gaussian Naive Bayes"
        return "Random Forest"

    utterance = (
        "Below are the best performing classification algorithms. "
        "All accuracies are the mean result using 5-fold cross validation.\n"
    )
    utterance += f"\tSupport Vector Machines: {svc_acc}\n"
    utterance += f"\tGaussian Naive Bayes: {gnb_acc}\n"
    utterance += f"\tRandom Forest: {rdf_acc}\n"
    utterance += (
        f"We recommend using {_name_best_method()}. "
        "You can search for optimized hyperparameters using Athena as well."
    )
    return QueryResult(utterance, utterance)
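# Usage sketch (hypothetical instance and column name): benchmark the three
# classifiers on a categorical target; every numeric column in the target's sheet
# is used as a feature.
#
#   qa.classify("species")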
def augmented_dicky_fuller_test(self, var, max_lag=-1):
    s = self.map_column_to_sheet(var)
    df = s.df
    time_series = df[var]
    if max_lag == -1:
        vector = s_st.adfuller(time_series)
    else:
        vector = s_st.adfuller(time_series, maxlag=max_lag)
    utterance = (
        "Here is the p-value of an augmented Dickey-Fuller (stationarity) test with variable "
    )
    utterance = utterance + str(var) + ". "
    utterance = (
        utterance
        + "The null hypothesis is that the process has a unit root. The lower the p-value, "
    )
    utterance = utterance + "the stronger the case for stationarity.\n"
    utterance = utterance + str(vector[1])
    return QueryResult(vector[1], utterance)
def overallLargestCorrs(self, num_return=5, index=0):
    s = self.sheets[index]
    m = s.corr_matrix
    n = len(m)

    # flatten the upper triangle of the correlation matrix into a vector of absolute values
    v = np.zeros(n * n)
    for i in range(0, n):
        for j in range(i, n):
            element = abs(m.iloc[i, j])
            v[_dfToVectorIndex(n, i, j)] = element

    # find the cutoff, excluding the perfect self-correlations on the diagonal
    p = _kthLargest(v, num_return + 1)
    r = np.zeros(num_return)
    j = 0
    for i in range(0, len(v)):
        vi = v[i]
        if vi >= p and vi < 1:
            r[j] = i
            j += 1

    returnVector = []
    for i in range(0, len(r)):
        ri = r[i]
        t = _vectortoDFIndex(n, ri)
        returnVector.append(s.findVariableName(m, t))

    utterance = (
        "Here are the "
        + str(num_return)
        + " most correlative pairwise relationships in the dataset:\n"
    )
    utterance = utterance + str(returnVector)
    return QueryResult(returnVector, utterance)
def findMedian(self, col):
    s = self.map_column_to_sheet(col)
    df = s.df
    median = np.median(df[[col]])
    utterance = "The median of " + str(col) + " is " + str(median) + "."
    return QueryResult(median, utterance)
def greeting(self):
    utterance = "Great to meet you! Have fun using the app."
    return QueryResult(69, utterance)
def graphVs(self, col_x, col_y):
    plt.plot(self.get_column(col_x), self.get_column(col_y))
    plt.xlabel(col_x)
    plt.ylabel(col_y)
    plt.show()
    return QueryResult("I made a graph for you!", "I made a graph for you!")
def listCols(self):
    col_names = [col.upper() for col in self.get_column_names()]
    utterance = f"The column names in this dataset: \n{col_names} "
    return QueryResult(utterance, utterance)
def granger_causality_test(self, results, dep, ind):
    r = results.test_causality(dep, ind, kind="f")
    utterance = "Here is a summary of the results of the Granger causality test.\n"
    return QueryResult(r.summary(), utterance + str(r.summary()))
def analyze_lags(self, cols, time, preferred_criterion="aic", min_lag=1, max_lag=8):
    # a single column means a univariate AR model; a list of columns means a VAR model
    try:
        s = self.map_column_to_sheet(cols)
        multi = False
    except Exception:
        s = self.map_column_to_sheet(cols[0])
        multi = True
    df = s.df

    if multi:
        try:
            args_vector = np.append(cols, time)
            data = df[args_vector]
            data = data.set_index(time)
        except Exception:
            data = df[cols]
        model = VAR(data)
    else:
        try:
            args_vector = np.array([cols, time])
            data = df[args_vector]
            data = data.set_index(time)
        except Exception:
            data = df[cols]
        model = s_ar.AR(data)

    # record both information criteria at each candidate lag length
    aic = np.zeros(max_lag - min_lag + 1)
    bic = np.zeros(max_lag - min_lag + 1)
    for i in range(max_lag - min_lag + 1):
        fit = model.fit(i + min_lag)
        aic[i] = fit.aic
        bic[i] = fit.bic

    utterance = ""
    for i in range(max_lag - min_lag + 1):
        utterance = (
            utterance + "AIC (" + str(i + min_lag) + " lags): " + str(aic[i]) + "\n"
        )
    utterance = utterance + "\n\n"
    for i in range(max_lag - min_lag + 1):
        utterance = (
            utterance + "BIC (" + str(i + min_lag) + " lags): " + str(bic[i]) + "\n"
        )
    utterance = utterance + "\n\n"

    # compare each lag length against the AIC-best model
    x = np.argsort(aic)
    champ = aic[x[0]]
    utterance = (
        utterance
        + "Using AIC, here are the estimated proportional probabilities, using the best as a reference:\n"
    )
    for i in range(max_lag - min_lag + 1):
        utterance = (
            utterance
            + str(i + min_lag)
            + " lags: "
            + str(find_prob_given_AIC(champ, aic[i]))
            + "\n"
        )

    optimal = self.find_optimal_lag_length(
        cols, time, min_lag=min_lag, max_lag=max_lag, criterion=preferred_criterion
    ).get_denotation()
    return QueryResult(optimal, utterance)
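# Usage sketch (hypothetical instance and column names): report AIC and BIC across
# lag lengths along with relative likelihoods against the best model (assuming
# find_prob_given_AIC implements the standard exp((AIC_min - AIC_i) / 2) formula).
#
#   report = qa.analyze_lags(["cpi", "unemployment"], "quarter", preferred_criterion="bic")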