    def find_instruments(self, y, X, exog_Z, candidates):

        s = self.map_column_to_sheet(y)
        df = s.df

        if np.isscalar(X):
            k = 1
        else:
            k = len(X)

        if np.isscalar(candidates):
            utterance = "Multiple instrument candidates are required for this search."
            return QueryResult(None, utterance)
        n = len(candidates)

        utterance = (
            "This is the most inclusive group of jointly valid instruments among those"
        )
        utterance = (
            utterance
            + " suggested in a regression of "
            + str(y)
            + " on "
            + str(X)
            + " with "
            + str(exog_Z)
        )
        utterance = utterance + " as a known exogenous instrument:\n\n"

        # base case: test the full candidate set; p > 0.05 means we cannot
        # reject joint validity at the 5% level
        Z_with_exog = np.append(candidates, exog_Z)
        j_pval = self.homoskedasticJStatistic(y, X, Z_with_exog).get_denotation().pval
        if j_pval > 0.05:
            return QueryResult(candidates, utterance + str(candidates))

        # drop one candidate at a time (then two, and so on), re-testing each
        # remaining subset until a jointly valid group passes the J-test
        turn = 1
        a = np.arange(n)
        while turn <= n - k:

            combs = combinations(a, n - turn)

            Z = np.empty(n - turn, dtype=object)

            for group in combs:
                for i in range(len(group)):
                    Z[i] = candidates[group[i]]

                Z_with_exog = np.append(Z, exog_Z)
                j_pval = (
                    self.homoskedasticJStatistic(y, X, Z_with_exog)
                    .get_denotation()
                    .pval
                )

                if j_pval > 0.05:
                    return QueryResult(Z, utterance + str(Z))

            turn += 1

        return QueryResult(None, "No valid groups of instruments found.")
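    # Usage sketch for find_instruments (hypothetical column names and a
    # hypothetical instance name `engine`; assumes the relevant sheet is loaded
    # and itertools.combinations is imported at module level):
    #   qr = engine.find_instruments("wage", "educ", "age", ["iq", "exper", "tenure"])
    #   qr.get_denotation()  # the largest subset of candidates passing the J-test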
    def granger_p_value(self, results, dep, ind):
        # `results` is a fitted VAR results object; test_causality returns a
        # results object whose p-value lives in the `pvalue` attribute
        r = results.test_causality(dep, ind, kind="f")
        utterance = (
            "The p-value of the Granger causality test is " + str(r.pvalue) + ".\n"
        )
        return QueryResult(r.pvalue, utterance)
    def largestCorr(self, col):

        s = self.map_column_to_sheet(col)
        m = s.corr_matrix
        col_corrs = m[[col]]
        champ = -1
        max_corr = -2

        # track the strongest correlation, excluding the (perfect) self-correlation
        for i in range(len(col_corrs)):
            c = abs(col_corrs.iloc[i, 0])
            if c > max_corr and c < 1:
                max_corr = c
                champ = i

        if champ == -1:
            utterance = "ERROR: no correlated variable found."
            return QueryResult(None, utterance)

        best_var = s.findVariableName(m, champ)
        utterance = (
            "The variable most correlated with "
            + str(col)
            + " is "
            + str(best_var)
            + "."
        )
        return QueryResult(best_var, utterance)
    def homoskedasticJStatistic(
        self, y, X, Z, exog_regressors=-1, clean_data="greedy", covType="unadjusted"
    ):

        s = self.map_column_to_sheet(y)
        df = s.df

        arg_y = y
        arg_X = X
        arg_Z = Z

        # Check overidentification
        if type(X) is str:
            num_endogenous_regressors = 1
        else:
            num_endogenous_regressors = len(X)
        if type(Z) is str:
            num_instruments = 1
        else:
            num_instruments = len(Z)

        if num_instruments <= num_endogenous_regressors:
            utterance = "Underidentification Error: We need more instruments than endogenous regressors for this test."
            return QueryResult(None, utterance)

        # prepare data
        v = np.copy(X)
        v = np.append(v, y)
        v = np.append(v, Z)
        has_exog = not (np.isscalar(exog_regressors) and exog_regressors == -1)
        if has_exog:
            v = np.append(v, exog_regressors)
        dfClean = s.cleanData(v, clean_data)
        X = dfClean[X]
        length = len(X)
        Z = dfClean[Z]
        y = dfClean[y]
        if has_exog:
            exog_regressors = sm.add_constant(dfClean[exog_regressors])
        else:
            # constant-only exogenous block
            exog_regressors = np.full((length, 1), 1)
        mod = IVGMM(y, exog_regressors, X, Z)
        res = mod.fit()
        j_stat = res.j_stat

        utterance = (
            "\nThe homoskedastic J-statistic (overidentification) test output in a regression of "
            + str(arg_y)
            + " on endogenous covariates "
            + str(arg_X)
        )
        utterance = (
            utterance
            + ", using "
            + str(arg_Z)
            + " as instruments is the following:\n\n"
        )
        utterance = utterance + str(j_stat) + "\n\n"

        return QueryResult(j_stat, utterance)
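    # Interpretation note: the J-statistic tests the overidentifying restrictions,
    # so a large p-value is consistent with joint instrument validity. Usage
    # sketch (hypothetical column names; `engine` is a hypothetical instance):
    #   qr = engine.homoskedasticJStatistic("wage", "educ", ["iq", "exper"])
    #   qr.get_denotation().pval  # the p-value consumed by find_instruments above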
    def execute_func(self, func_name, args, denotation_only=True):
        try:
            result = self.get_names_to_lambdas()[func_name](*args)
        except Exception:
            utter = "Oops, I had an error in my maths. Are you sure all your arguments are amenable to the type of function you're trying to run?"
            result = QueryResult(utter, utter)
        if denotation_only:
            return result.get_denotation()
        return result
    def help(self, functions=None):
        if functions is None or functions == "generic":
            utterance = ""
            utterance += "For help about a specific topic, just ask about it. (e.g. 'How do I run a regression?')"
            return QueryResult(utterance, utterance)

        q = _create_helpstring(functions)

        utterance = q
        denotation = q
        return QueryResult(denotation, utterance)
    def largestCorrList(self, col, num_return=3):

        s = self.map_column_to_sheet(col)
        df = s.df
        m = s.corr_matrix
        v = abs(m[[col]])
        n = len(v)
        vAbs = np.zeros(n)

        for i in range(0, n):
            vAbs[i] = v.iloc[i, 0]

        # we want the (num_return + 1)'th largest because we don't want the column to count itself
        p = _kthLargest(vAbs, num_return + 1)

        returnVector = []

        for i in range(0, n):
            c = vAbs[i]
            if c >= p and c < 1:
                returnVector.append(s.findVariableName(m, i))

        utterance = (
            "The "
            + str(num_return)
            + " variables most correlated with "
            + str(col)
            + " are "
        )
        if not returnVector:
            return QueryResult(None, "ERROR: no correlated variables found.")

        for i in range(len(returnVector) - 1):
            utterance = utterance + returnVector[i] + ", "

        utterance = utterance + "and " + returnVector[-1] + "."

        return QueryResult(returnVector, utterance)
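    # Usage sketch (hypothetical column name; `engine` is a hypothetical instance):
    #   engine.largestCorrList("price", num_return=3).get_denotation()
    #   # -> e.g. the 3 column names most correlated with "price"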
    def multiReg(self, y, X, clean_data="greedy"):

        s = self.map_column_to_sheet(y)
        y_arg = y
        X_arg = X

        # prepare data
        v = np.copy(X)
        v = np.append(v, y)
        dfClean = s.cleanData(v, clean_data)
        X = dfClean[X]
        y = dfClean[y]
        X = sm.add_constant(X)

        results = sm.OLS(y, X).fit()

        utterance = (
            "Here are the results of a multivariate linear regression of "
            + str(y_arg)
            + " on "
            + str(X_arg)
            + ".\n\n"
        )
        utterance = utterance + str(results.summary())

        return QueryResult(results.summary(), utterance)
    def poisson_regression(self, endog, exog, clean_data="greedy"):

        s = self.map_column_to_sheet(endog)

        arg_endog = endog
        arg_exog = exog

        # prepare data
        v = np.copy(exog)
        v = np.append(v, endog)
        dfClean = s.cleanData(v, clean_data)
        exog = sm.add_constant(dfClean[exog])
        endog = dfClean[endog]

        poisson = Poisson(endog, exog)
        fit = poisson.fit()

        utterance = (
            "Here are the results of a Poisson regression with endogenous variables "
        )
        utterance = (
            utterance
            + str(arg_endog)
            + " and exogenous variables "
            + str(arg_exog)
            + ".\n"
        )
        utterance = utterance + str(fit.summary())

        return QueryResult(fit.summary(), utterance)
    def markov_switching_regime_regression(
        self, endog, k, exog_vars=-1, clean_data="greedy"
    ):

        s = self.map_column_to_sheet(endog)

        arg_endog = endog

        # prepare data
        v = np.copy(endog)
        has_exog = not (np.isscalar(exog_vars) and exog_vars == -1)
        if has_exog:
            v = np.append(v, exog_vars)
            dfClean = s.cleanData(v, clean_data)
            endog = dfClean[endog]
            exog_vars = dfClean[exog_vars]
        else:
            endog = s.df[endog]
            exog_vars = None

        mr = MarkovRegression(endog, k, exog=exog_vars)
        fit = mr.fit()
        utterance = (
            "Here are the results of a Markov switching regression with endogenous variable "
            + str(arg_endog)
        )
        utterance = utterance + " and " + str(k) + " regimes.\n"
        utterance = utterance + str(fit.summary())

        return QueryResult(fit.summary(), utterance)
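    # Usage sketch (hypothetical column name; k is the number of regimes, >= 2):
    #   engine.markov_switching_regime_regression("gdp_growth", 2)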
    def reg(self, y, x, clean_data="greedy"):

        s = self.map_column_to_sheet(y)
        y_arg = y
        x_arg = x

        # prepare data
        v = np.array(x)
        v = np.append(v, y)
        dfClean = s.cleanData(v, clean_data)
        X = dfClean[x]
        y = dfClean[y]
        X = sm.add_constant(X)

        results = sm.OLS(y, X).fit()
        utterance = (
            "Here are the results of a linear regression of "
            + str(y_arg)
            + " on "
            + str(x_arg)
            + ".\n\n"
        )
        utterance = utterance + str(results.summary())

        return QueryResult(results.summary(), utterance)
    def find_optimal_lag_length(
        self, cols, time, min_lag=1, max_lag=8, criterion="aic"
    ):

        # a scalar column name means a univariate AR model; a list means a VAR
        try:
            s = self.map_column_to_sheet(cols)
            multi = False
        except Exception:
            s = self.map_column_to_sheet(cols[0])
            multi = True

        df = s.df

        if multi:
            try:
                args_vector = np.append(cols, time)
                data = df[args_vector]
                data = data.set_index(time)
            except Exception:
                data = df[cols]

            model = VAR(data)

        else:
            try:
                args_vector = np.array([cols, time])
                data = df[args_vector]
                data = data.set_index(time)
            except Exception:
                data = df[cols]

            model = s_ar.AR(data)

        info_loss = np.zeros(max_lag - min_lag + 1)

        if criterion == "aic":
            for i in range(max_lag - min_lag + 1):
                fit = model.fit(i + min_lag)
                info_loss[i] = fit.aic

        elif criterion == "bic":
            for i in range(max_lag - min_lag + 1):
                fit = model.fit(i + min_lag)
                info_loss[i] = fit.bic

        else:
            utterance = "ERROR: Criterion argument not supported. Use 'aic' or 'bic'."
            return QueryResult(None, utterance)

        optimal = int(np.argmin(info_loss)) + min_lag

        utterance = (
            "The optimal lag length according to the "
            + str(criterion)
            + " criterion is "
        )
        utterance = utterance + str(optimal) + "."

        return QueryResult(optimal, utterance)
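    # Usage sketch (hypothetical columns; a list for `cols` selects a VAR, a
    # single name selects a univariate AR):
    #   engine.find_optimal_lag_length(["gdp", "cpi"], "year", criterion="bic")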
    def showCol(self, col):
        colvals = np.array(list(self.get_column(col)))
        if len(colvals) > 10:
            # show the first five and last five values
            head = ", ".join(str(v) for v in colvals[:5])
            tail = ", ".join(str(v) for v in colvals[-5:])
            colvals = f"[{head}, ..., {tail}]"
        utterance = (
            f"These are some of the values in column {col.upper()}: \n{str(colvals)} "
        )
        return QueryResult(utterance, utterance)
    def kmeans(self, k=5):
        numeric_cols = self.get_numeric_columns()
        kmeans = KMeans(n_clusters=k, random_state=0).fit(np.array(numeric_cols[0]))
        utterance = f"K-Means successfully converged after {kmeans.n_iter_} iterations. Only used numeric columns."
        utterance += f"\nHere are the cluster's {k} centers.\n"
        # utterance += f"\nHere are the cluster's {k} centers. The values are presented in the column order:\n"
        # utterance += f"\t\t{numeric_cols[1]}\n\n"
        for clust in kmeans.cluster_centers_:
            utterance += f"\t\t{[(round(val , 2)) for val in clust]}\n"

        return QueryResult(utterance, utterance)
    def fixedEffects(
        self,
        y,
        x,
        id,
        year,
        entity_Effects=False,
        time_Effects=False,
        cov_Type="clustered",
        cluster_Entity=True,
        clean_data="greedy",
    ):

        if not isinstance(x, str):
            utterance = (
                "ERROR: Multiple independent regressor approach not yet implemented."
            )
            return QueryResult(None, utterance)

        s = self.map_column_to_sheet(y)

        # prepare data
        v = np.copy(x)
        v = np.append(v, y)
        df = s.cleanData(v, clean_data)

        # set up panel and return fit
        df = df.set_index([id, year])

        mod = PanelOLS(
            df[y], df[x], entity_effects=entity_Effects, time_effects=time_Effects
        )
        utterance = (
            "Here are the results of a fixed effects regression of "
            + str(y)
            + " on "
            + str(x)
        )
        utterance = (
            utterance
            + ", using "
            + str(year)
            + " as the time dimension and "
            + str(id)
            + " as the id dimension.\n\n"
        )
        fit = mod.fit(cov_type=cov_Type, cluster_entity=cluster_Entity)
        utterance = utterance + str(fit)

        return QueryResult(fit, utterance)
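    # Usage sketch (hypothetical panel columns; `firm` and `year` index the panel):
    #   engine.fixedEffects("output", "capital", "firm", "year", entity_Effects=True)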
    def test_weak_instruments(
        self, x, Z, clean_the_data="greedy", covType="unadjusted"
    ):

        if not isinstance(x, str):
            utterance = (
                "Multiple endogenous regressors not yet implemented for this test."
            )
            return QueryResult(None, utterance)

        s = self.map_column_to_sheet(x)

        x_arg = x
        Z_arg = Z

        # prepare data. use OLS because we just need first stage results
        v = np.copy(x)
        v = np.append(v, Z)
        dfClean = s.cleanData(v, clean_the_data)
        x = dfClean[x]
        Z = dfClean[Z]

        results = sm.OLS(x, Z).fit()

        # want F > 10
        f = results.fvalue
        utterance = (
            "The F-value in a regression with endogenous covariate "
            + str(x_arg)
            + " and instruments "
            + str(Z_arg)
        )
        utterance = (
            utterance
            + " is "
            + str(f)
            + ".\nTypically, we want F > 10 for reliably strong first-stage estimates."
        )

        return QueryResult(f, utterance)
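    # Usage sketch (hypothetical columns): the denotation is the first-stage F,
    # and F > 10 is the usual rule of thumb for strong instruments:
    #   engine.test_weak_instruments("educ", ["iq", "exper"]).get_denotation()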
    def findCorr(self, yColName, xColName):
        s = self.map_column_to_sheet(yColName)
        corr = s.corr_matrix[yColName][s.findColumnIndexGivenName(xColName)]
        utterance = (
            "The correlation between "
            + str(yColName)
            + " and "
            + str(xColName)
            + " is "
            + str(corr)
            + "."
        )
        return QueryResult(corr, utterance)
    def graph(self, col_names):
        if not isinstance(col_names, list):
            col_names = [col_names]

        col_data = [self.get_column(col) for col in col_names]
        for idx, col in enumerate(col_data):
            plt.plot(np.arange(len(col)), col, label=col_names[idx].upper())

        plt.xlabel("x - axis")
        plt.ylabel("y - axis")
        plt.legend()
        plt.show()
        return QueryResult("I made a graph for you!", "I made a graph for you!")
    def print_PCA_wrapper(self, pca):

        utterance = "factors:\n"
        utterance = utterance + str(pca.factors) + "\n"

        utterance = utterance + "coefficients:\n"
        utterance = utterance + str(pca.coeff) + "\n"

        utterance = utterance + "eigenvalues:\n"
        utterance = utterance + str(pca.eigenvals) + "\n"

        utterance = utterance + "eigenvectors (ordered):\n"
        utterance = utterance + str(pca.eigenvecs) + "\n"

        utterance = utterance + "transformed data:\n"
        utterance = utterance + str(pca.transformed_data) + "\n"

        return QueryResult(pca.coeff, utterance)
    def print_a_bunch_of_AR_shit(self, results):

        utterance = "Here are the results of the univariate time series:.\n\n"
        utterance = utterance + "Model Parameters:\n" + str(results.params) + "\n\n\n"
        utterance = (
            utterance
            + "Parameter Confidence Intervals:\n"
            + str(results.conf_int())
            + "\n\n\n"
        )
        utterance = (
            utterance
            + "Normalized Covariance Matrix Across Parameters:\n"
            + str(results.normalized_cov_params)
            + "\n\n\n"
        )

        # train just on parameters
        return QueryResult(results.params, utterance)
    def AR_with_moving_average(self, var, p, ma, the_dates, clean_data="greedy"):

        s = self.map_column_to_sheet(var)

        # prepare data
        dfClean = s.cleanData(var, clean_data)
        time_series = dfClean[var]

        # order=(p, ma) sets the AR and MA orders; np.array(p, ma) would
        # misread `ma` as a dtype argument
        arma = ARMA(time_series, order=(p, ma), dates=the_dates)
        fit = arma.fit()

        utterance = (
            "Here are the results of an ARMA regression of "
            + str(var)
            + " in "
            + str(the_dates)
            + ".\n"
        )
        return QueryResult(fit.summary(), utterance + str(fit.summary()))
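    # Usage sketch (hypothetical column; p and ma are the AR and MA orders):
    #   engine.AR_with_moving_average("sales", 2, 1, the_dates=None)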
    def classify(self, col_y, crossval=True):

        # First, assure that col is a categorical variable.
        y = self.get_column(col_y)
        y = y.copy()
        y = y.astype("category")

        # Get the training data
        s = self.map_column_to_sheet(col_y)
        X = s.get_numeric_columns()[0]

        # Setup our SVM
        svc = SVC(kernel="linear")
        svc_cv_results = cross_validate(svc, X, y, cv=5)
        svc_acc = round(svc_cv_results["test_score"].mean() * 100, 2)

        # Setup Naive Bayes
        gnb = GaussianNB()
        gnb_cv_results = cross_validate(gnb, X, y, cv=5)
        gnb_acc = round(gnb_cv_results["test_score"].mean() * 100, 2)

        # Set up Random Forest
        rdf = RandomForestClassifier(random_state=0)
        rdf_cv_results = cross_validate(rdf, X, y, cv=5)
        rdf_acc = round(rdf_cv_results["test_score"].mean() * 100, 2)

        def _name_best_method():
            if svc_acc > gnb_acc:
                if svc_acc > rdf_acc:
                    return "Support Vector Machines"
                return "Random Forst"
            if gnb_acc > rdf_acc:
                return "Gaussian Naive Bayes"
            return "Random Forest"

        utterance = f"Below are the best performing classification algorithms. All accuracies are the mean result using 5-fold cross validation.\n"
        utterance += f"\tSupport Vector Machines: {svc_acc}\n"
        utterance += f"\tGaussian Naive Bayes: {gnb_acc}\n"
        utterance += f"\tRandom Forest: {rdf_acc}\n"
        utterance += f"We recomend using {_name_best_method()}. You can search for optimized hyperparameters using Athena as well."

        return QueryResult(utterance, utterance)
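    # Usage sketch (hypothetical categorical column): compares SVM, naive Bayes,
    # and random forest accuracies via 5-fold cross validation:
    #   engine.classify("species")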
    def augmented_dicky_fuller_test(self, var, max_lag=-1):

        s = self.map_column_to_sheet(var)
        df = s.df
        time_series = df[var]

        if max_lag == -1:
            vector = s_st.adfuller(time_series)
        else:
            vector = s_st.adfuller(time_series, maxlag=max_lag)

        utterance = "Here is the p-value of an augmented Dicky-Fuller (stationarity) test with variable "
        utterance = utterance + str(var) + "."
        utterance = (
            utterance
            + "The null hypothesis is that the process has a unit root. The lower the p-value, "
        )
        utterance = utterance + "the stronger the case for stationarity.\n"
        utterance = utterance + str(vector[1])

        return QueryResult(vector[1], utterance)
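    # Usage sketch (hypothetical column): the denotation is the ADF p-value,
    # where small values favor stationarity:
    #   engine.augmented_dicky_fuller_test("gdp", max_lag=4).get_denotation()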
    def overallLargestCorrs(self, num_return=5, index=0):

        s = self.sheets[index]
        m = s.corr_matrix
        n = len(m)

        v = np.zeros(n * n)

        for i in range(0, n):
            for j in range(i, n):
                element = abs(m.iloc[i, j])
                v[_dfToVectorIndex(n, i, j)] = element

        p = _kthLargest(v, num_return + 1)
        r = np.zeros(num_return, dtype=int)
        j = 0

        # collect indices of the strongest off-diagonal correlations
        for i in range(len(v)):
            vi = v[i]
            if vi >= p and vi < 1 and j < num_return:
                r[j] = i
                j += 1

        returnVector = []

        for i in range(0, len(r)):
            ri = r[i]
            t = _vectortoDFIndex(n, ri)
            returnVector.append(s.findVariableName(m, t))

        utterance = (
            "Here are the "
            + str(num_return)
            + " most correlative pairwise relationships in the dataset:\n"
        )
        utterance = utterance + str(returnVector)

        return QueryResult(returnVector, utterance)
    def findMedian(self, col):
        s = self.map_column_to_sheet(col)
        df = s.df
        median = np.median(df[[col]])
        utterance = "The median of " + str(col) + " is " + str(median) + "."
        return QueryResult(median, utterance)
    def greeting(self):
        utterance = "Great to meet you! Have fun using the app."
        return QueryResult(utterance, utterance)
    def graphVs(self, col_x, col_y):
        plt.plot(self.get_column(col_x), self.get_column(col_y))
        plt.xlabel(col_x)
        plt.ylabel(col_y)
        plt.show()
        return QueryResult("I made a graph for you!", "I made a graph for you!")
    def listCols(self):
        col_names = [col.upper() for col in self.get_column_names()]
        utterance = f"The column names in this dataset: \n{col_names} "
        return QueryResult(utterance, utterance)
    def granger_causality_test(self, results, dep, ind):
        r = results.test_causality(dep, ind, kind="f")
        utterance = "Here is a summary of the results of the Granger causality test.\n"
        return QueryResult(r.summary(), utterance + str(r.summary()))
    def analyze_lags(self, cols, time, preferred_criterion="aic", min_lag=1, max_lag=8):

        # a scalar column name means a univariate AR model; a list means a VAR
        try:
            s = self.map_column_to_sheet(cols)
            multi = False
        except Exception:
            s = self.map_column_to_sheet(cols[0])
            multi = True

        df = s.df

        if multi:
            try:
                args_vector = np.append(cols, time)
                data = df[args_vector]
                data = data.set_index(time)
            except Exception:
                data = df[cols]

            model = VAR(data)

        else:
            try:
                args_vector = np.array([cols, time])
                data = df[args_vector]
                data = data.set_index(time)
            except Exception:
                data = df[cols]

            model = s_ar.AR(data)

        aic = np.zeros(max_lag - min_lag + 1)
        bic = np.zeros(max_lag - min_lag + 1)

        for i in range(max_lag - min_lag + 1):
            fit = model.fit(i + min_lag)
            aic[i] = fit.aic
            bic[i] = fit.bic

        utterance = ""
        for i in range(max_lag - min_lag + 1):
            utterance = (
                utterance + "AIC (" + str(i + min_lag) + " lags): " + str(aic[i]) + "\n"
            )

        utterance = utterance + "\n\n"

        for i in range(max_lag - min_lag + 1):
            utterance = (
                utterance + "BIC (" + str(i + min_lag) + " lags): " + str(bic[i]) + "\n"
            )

        utterance = utterance + "\n\n"

        champ = np.min(aic)
        utterance = (
            utterance
            + "Using AIC, here are the estimated proportional probabilities, using the best as a reference:"
        )
        utterance = utterance + "\n"
        for i in range(max_lag - min_lag + 1):
            utterance = (
                utterance
                + str(i + min_lag)
                + " lags: "
                + str(find_prob_given_AIC(champ, aic[i]))
                + "\n"
            )

        optimal = self.find_optimal_lag_length(
            cols, time, min_lag=min_lag, max_lag=max_lag, criterion=preferred_criterion
        ).get_denotation()

        return QueryResult(optimal, utterance)
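    # Usage sketch (hypothetical columns): prints AIC/BIC by lag length and the
    # AIC-based relative likelihoods, and returns the optimal lag as denotation:
    #   engine.analyze_lags(["gdp", "cpi"], "year", preferred_criterion="bic")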