Ejemplo n.º 1
0
def build_matrix(data_id, data, cols):
    """
    Build the Pearson correlation matrix for ``cols`` along with a code
    snippet reproducing the calculation.

    Two strategies are used:

    * if any of the selected values are null, :meth:`pandas.DataFrame.corr`
      is used (it excludes NA/null values pairwise)
    * otherwise :func:`numpy.corrcoef` is used because it is considerably
      faster on large datasets

    In both cases only the columns listed in ``cols`` are correlated, so the
    result always matches what the exported snippet (``df[corr_cols]``)
    computes.

    :param data_id: identifier forwarded to ``build_code_export``
    :param data: dataframe holding (at least) the columns in ``cols``
    :type data: :class:`pandas.DataFrame`
    :param cols: names of the columns to correlate
    :type cols: list
    :return: tuple of (correlation matrix as a dataframe, code snippet). The
             snippet still contains the ``{corr_cols}`` and
             ``{str_encodings}`` placeholders; they are not filled in here.
    :rtype: tuple
    """
    # Opening lines shared by both flavors of the exported snippet.
    snippet_header = (
        "corr_cols = [\n"
        "\t'{corr_cols}'\n"
        "]\n"
        "corr_data = df[corr_cols]\n"
        "{str_encodings}"
    )
    selected = data[cols]

    if selected.isnull().values.any():
        # Restrict to ``cols`` before correlating: running ``corr`` on the
        # whole frame would pull in any extra columns (and fails on
        # non-numeric ones under pandas>=2.0), unlike the numpy branch below.
        matrix = selected.corr(method="pearson")
        code = build_code_export(data_id)
        code.append(
            snippet_header + "corr_data = corr_data.corr(method='pearson')"
        )
    else:
        # using pandas.corr proved to be quite slow on large datasets so numpy is used instead:
        # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
        coefficients = np.corrcoef(selected.values, rowvar=False)
        matrix = pd.DataFrame(coefficients, columns=cols, index=cols)
        code = build_code_export(
            data_id, imports="import numpy as np\nimport pandas as pd\n\n"
        )
        code.append(
            snippet_header
            + "corr_data = np.corrcoef(corr_data.values, rowvar=False)\n"
            + "corr_data = pd.DataFrame(corr_data, columns=[corr_cols], index=[corr_cols])"
        )

    return matrix, "\n".join(code)
Ejemplo n.º 2
0
 def build(self):
     """
     Run the configured analysis and assemble the response payload.

     The payload holds the final chart code (base export code followed by
     the code returned by the analysis), the query, the dtypes stored for
     this ``data_id``, the dtype, the analysis type (under ``chart_type``)
     and every key of the data returned by the analysis.

     :return: payload describing the analysis
     :rtype: dict
     """
     import_lines = [
         "import numpy as np",
         "import pandas as pd",
         "import plotly.graph_objs as go",
     ]
     # Import statements are followed by a blank line in the exported code.
     base_code = build_code_export(
         self.data_id, imports="\n".join(import_lines) + "\n\n"
     )
     return_data, code = self.analysis.build(self)

     final_code = build_final_chart_code(base_code + code)
     dtypes = global_state.get_dtypes(self.data_id)
     # ``**return_data`` is kept so that a key clashing with one of the
     # explicit keywords still raises rather than being silently overridden.
     return dict(
         code=final_code,
         query=self.query,
         cols=dtypes,
         dtype=self.dtype,
         chart_type=self.analysis_type,
         **return_data
     )
Ejemplo n.º 3
0
    def __init__(self, data_id, req):
        """
        Load the data for ``data_id``, resolve the column to analyze and
        select the analysis matching the requested type.

        :param data_id: identifier of the dataset to analyze
        :param req: request object the "type" and "col" arguments (and any
                    analysis-specific arguments) are read from
        """
        self.data_id = data_id
        self.analysis_type = get_str_arg(req, "type")

        settings = global_state.get_settings(data_id) or {}
        self.query = build_query(data_id, settings.get("query"))

        frame = load_filterable_data(data_id, req, query=self.query)
        requested_col = get_str_arg(req, "col", "values")
        self.selected_col = find_selected_column(frame, requested_col)

        # Only rows with a non-null value in the selected column are kept.
        self.data = frame[pd.notnull(frame[self.selected_col])]
        self.dtype = find_dtype(self.data[self.selected_col])
        self.classifier = classify_type(self.dtype)

        import_lines = [
            "import numpy as np",
            "import pandas as pd",
            "import plotly.graph_objs as go",
        ]
        self.code = build_code_export(
            data_id, imports="\n".join(import_lines) + "\n"
        )

        # No explicit type requested: fall back on one based on the classifier.
        if self.analysis_type is None:
            if self.classifier in ("F", "I", "D"):
                self.analysis_type = "histogram"
            else:
                self.analysis_type = "value_counts"

        # Factories are lazy so only the selected analysis class is resolved
        # and instantiated.
        factories = {
            "geolocation": lambda: GeolocationAnalysis(req),
            "histogram": lambda: HistogramAnalysis(req),
            "categories": lambda: CategoryAnalysis(req),
            "value_counts": lambda: ValueCountAnalysis(req),
            "word_value_counts": lambda: WordValueCountAnalysis(req),
            "qq": lambda: QQAnalysis(),
        }
        make_analysis = factories.get(self.analysis_type)
        # An unrecognized type leaves ``self.analysis`` unset.
        if make_analysis is not None:
            self.analysis = make_analysis()