Example #1
0
 def build_column(self, data):
     """Build a column of string distances between two columns of ``data``.

     Configuration is read from ``self.cfg``:

     * ``left`` / ``right`` -- names of the two columns to compare.
     * ``algo`` -- one of ``"levenshtein"``, ``"damerau-leveneshtein"``
       (legacy spelling; ``"damerau-levenshtein"`` is accepted too),
       ``"jaro-winkler"`` or ``"jaccard"``.
     * ``normalized`` -- when truthy, use a normalized variant of the
       metric (ignored for ``"jaro-winkler"``).
     * ``k`` -- shingle size handed to the Jaccard metric (default 3).

     :param data: frame holding the ``left`` and ``right`` columns
     :return: ``pd.Series`` named ``self.name`` and indexed like ``data``
     :raises ValueError: if ``algo`` is not one of the supported values
     """
     left_col, right_col, algo = (self.cfg.get(p)
                                  for p in ["left", "right", "algo"])
     normalized = self.cfg.get("normalized", False)
     if algo == "levenshtein":
         if normalized:
             similarity = strsimpy.normalized_levenshtein.NormalizedLevenshtein(
             )
         else:
             similarity = strsimpy.levenshtein.Levenshtein()
     elif algo in ("damerau-leveneshtein", "damerau-levenshtein"):
         # The misspelled key is the historical one and must keep working;
         # the correctly spelled alias is accepted as well.
         similarity = strsimpy.damerau.Damerau()
         if normalized:
             similarity = SimilarityNormalizeWrapper(similarity)
     elif algo == "jaro-winkler":
         # NOTE(review): no normalization wrapper is applied here --
         # presumably JaroWinkler already yields a bounded score; confirm.
         similarity = JaroWinkler()
     elif algo == "jaccard":
         similarity = strsimpy.jaccard.Jaccard(int(self.cfg.get("k", 3)))
         if normalized:
             similarity = SimilarityNormalizeWrapper(similarity)
     else:
         # Fail fast with a readable message instead of an unbound-variable
         # error raised from inside the row-wise lambda below.
         raise ValueError(
             "unsupported similarity algorithm: {!r}".format(algo))
     distances = apply(
         # Missing values are compared as empty strings.
         data[[left_col, right_col]].fillna(""),
         lambda rec: similarity.distance(*rec),
         axis=1,
     )
     return pd.Series(distances, index=data.index, name=self.name)
Example #2
0
    def build(self, parent):
        """Assemble histogram data (and generated code) for the selected column.

        Without ``self.target`` a single histogram plus an optional KDE is
        produced; with a target, one histogram per target group is collected
        under ``"targets"``.  Describe statistics are attached under
        ``"desc"`` in both cases.

        :param parent: analysis holder exposing ``data``, ``data_id``,
                       ``selected_col`` and ``classifier``
        :return: tuple of (payload dict, result of ``self._build_code``)
        """
        col = parent.selected_col
        if parent.classifier == "D":
            # Date columns are rewritten in place via json_timestamp.
            parent.data.loc[:, col] = apply(parent.data[col], json_timestamp)

        kde_code = []
        if self.target is not None:
            payload = {"targets": [], "labels": list(range(self.bins))}
            format_target = find_dtype_formatter(
                find_dtype(parent.data[self.target]))
            grouped = parent.data[[self.target, col]].groupby(self.target)
            for target_value, group in grouped:
                group_hist, _ = self.build_histogram_data(group[col])
                group_hist["target"] = format_target(target_value,
                                                     as_string=True)
                payload["targets"].append(group_hist)
        else:
            payload, hist_labels = self.build_histogram_data(parent.data[col])
            kde, kde_code = build_kde(parent.data[col], hist_labels, col)
            if kde is not None:
                payload["kde"] = kde

        desc, desc_code = load_describe(parent.data[col])
        dtype_info = global_state.get_dtype_info(parent.data_id, col)
        # Copy over skew/kurtosis when the stored dtype info carries them.
        for stat in ("skew", "kurt"):
            if stat in dtype_info:
                desc[stat] = dtype_info[stat]

        payload["desc"] = desc
        return payload, self._build_code(parent, kde_code, desc_code)
Example #3
0
    def build(self, parent):
        """Assemble histogram, KDE and describe data for the selected column.

        :param parent: analysis holder exposing ``data``, ``data_id``,
                       ``selected_col`` and ``classifier``
        :return: tuple of (payload dict, result of ``self._build_code``)
        """
        col = parent.selected_col
        if parent.classifier == "D":
            # Date columns are rewritten in place via json_timestamp.
            parent.data.loc[:, col] = apply(parent.data[col], json_timestamp)

        counts, edges = np.histogram(parent.data[col], bins=self.bins)
        payload = {
            # The first edge is only the minimum, so labels start at edge 1.
            "labels": ["{0:.1f}".format(edge) for edge in edges[1:]],
            "data": [json_float(count) for count in counts],
        }

        kde, kde_code = build_kde(parent.data[col], edges, col)
        if kde is not None:
            payload["kde"] = kde

        desc, desc_code = load_describe(parent.data[col])
        dtype_info = global_state.get_dtype_info(parent.data_id, col)
        # Copy over skew/kurtosis when the stored dtype info carries them.
        for stat in ("skew", "kurt"):
            if stat in dtype_info:
                desc[stat] = dtype_info[stat]

        payload["desc"] = desc
        return payload, self._build_code(parent, kde_code, desc_code)
Example #4
0
    def build(self, parent):
        """Build Q-Q plot data for the selected column.

        The payload holds the formatted (x, y) records under ``"data"`` plus
        the formatted minimum and maximum of the ``x`` values under
        ``"min"`` / ``"max"``.

        :param parent: analysis holder exposing ``data``, ``selected_col``
                       and ``classifier``
        :return: tuple of (payload dict, result of ``self._build_code``)
        """
        s = parent.data[parent.selected_col]
        if parent.classifier == "D":
            s = apply(s, json_timestamp)

        # NOTE(review): ``sts`` is presumably ``scipy.stats`` -- confirm.
        qq_x, qq_y = sts.probplot(s, dist="norm", fit=False)
        qq = pd.DataFrame(dict(x=qq_x, y=qq_y))
        f = grid_formatter(grid_columns(qq), nan_display=None)
        return_data = dict(data=f.format_dicts(qq.itertuples()))

        # Select the "x" column by label.  The previous ``qq.min()[0]`` relied
        # on an integer key falling back to positional access on a
        # string-labelled Series, which newer pandas deprecates.
        x_values = qq["x"]
        # NOTE(review): ``f.fmts[0]`` is assumed to belong to the first
        # column ("x") -- confirm against grid_formatter.
        x_formatter = f.fmts[0][-1]
        return_data["min"] = x_formatter(x_values.min(), None)
        return_data["max"] = x_formatter(x_values.max(), None)
        return return_data, self._build_code(parent)
Example #5
0
    def build(self, parent):
        """Build Q-Q plot data plus an OLS trend line for the selected column.

        The Q-Q points are returned as formatted lists; the trend-line points
        are added under ``"x2"`` and ``"y2"``.

        :param parent: analysis holder exposing ``data``, ``selected_col``
                       and ``classifier``
        :return: tuple of (payload dict, result of ``self._build_code``)
        """
        series = parent.data[parent.selected_col]
        if parent.classifier == "D":
            series = apply(series, json_timestamp)

        qq_x, qq_y = sts.probplot(series, dist="norm", fit=False)
        qq_frame = pd.DataFrame({"x": qq_x, "y": qq_y})
        qq_formatter = grid_formatter(grid_columns(qq_frame), nan_display=None)
        payload = qq_formatter.format_lists(qq_frame)

        # The second trace of the scatter figure carries the fitted line.
        ols_trace = px.scatter(x=qq_x, y=qq_y, trendline="ols").data[1]
        trend_frame = pd.DataFrame({"x": ols_trace["x"], "y": ols_trace["y"]})
        trend_formatter = grid_formatter(
            grid_columns(trend_frame), nan_display=None
        )
        trend_lists = trend_formatter.format_lists(trend_frame)
        payload["x2"] = trend_lists["x"]
        payload["y2"] = trend_lists["y"]
        return payload, self._build_code(parent)
Example #6
0
    def build(self, parent):
        """Assemble histogram data (and generated code) for the selected column.

        Without ``self.target`` a single histogram plus an optional KDE is
        produced.  With a target, the column is cut into ``self.bins`` bins
        (stored on ``parent.data`` as a ``"bin"`` column) and the per-bin
        counts of every target group are collected under ``"targets"``.

        :param parent: analysis holder exposing ``data``, ``data_id``,
                       ``selected_col`` and ``classifier``
        :return: tuple of (payload dict, result of ``self._build_code``)
        """
        col = parent.selected_col
        if parent.classifier == "D":
            # Date columns are rewritten in place via json_timestamp.
            parent.data.loc[:, col] = apply(parent.data[col], json_timestamp)

        kde_code = []
        if self.target is not None:
            binned = pd.cut(parent.data[col], bins=self.bins)
            labels = ["{}".format(category) for category in binned.dtype.categories]
            parent.data.loc[:, "bin"] = binned.astype("str")
            payload = {"targets": [], "labels": labels}
            format_target = find_dtype_formatter(find_dtype(parent.data[self.target]))
            grouped = parent.data[[self.target, "bin"]].groupby(self.target)
            for target_value, group in grouped:
                # Align counts with the label order; absent bins count as 0.
                per_bin = group["bin"].value_counts().reindex(labels, fill_value=0)
                payload["targets"].append(
                    {
                        "target": format_target(target_value, as_string=True),
                        "data": [int(count) for count in per_bin.values],
                    }
                )
        else:
            payload, hist_labels = self.build_histogram_data(parent.data[col])
            kde, kde_code = build_kde(parent.data[col], hist_labels, col)
            if kde is not None:
                payload["kde"] = kde

        desc, desc_code = load_describe(parent.data[col])
        dtype_info = global_state.get_dtype_info(parent.data_id, col)
        # Copy over skew/kurtosis when the stored dtype info carries them.
        for stat in ("skew", "kurt"):
            if stat in dtype_info:
                desc[stat] = dtype_info[stat]

        payload["desc"] = desc
        return payload, self._build_code(parent, kde_code, desc_code)
Example #7
0
def clean(s, cleaner, cfg):
    """Apply one named text cleaner to a series of strings.

    Supported ``cleaner`` values: ``drop_multispace``, ``drop_punctuation``,
    ``stopwords``, ``nltk_stopwords``, ``drop_numbers``, ``keep_alpha``,
    ``normalize_accents``, ``drop_all_space``, ``drop_repeated_words``,
    ``add_word_number_space``, ``drop_repeated_chars`` and ``update_case``.

    :param s: series of strings to clean
    :param cleaner: name of the cleaner to apply
    :param cfg: cleaner options -- ``stopwords`` (list of words),
                ``language`` (for ``nltk_stopwords``, default ``"english"``)
                and ``caseType`` (name of a ``Series.str`` method for
                ``update_case``)
    :return: the cleaned series, or ``None`` for an unrecognised cleaner
    :raises Exception: if ``nltk_stopwords`` is requested but importing
                       ``nltk`` fails
    """
    # All ``str.replace`` calls below pass ``regex=True`` explicitly: the
    # patterns are regular expressions and pandas >= 2.0 otherwise treats
    # them as literal text, turning these cleaners into silent no-ops.
    if cleaner == "drop_multispace":
        return s.str.replace(r"[ ]+", " ", regex=True)
    elif cleaner == "drop_punctuation":
        if six.PY3:
            return apply(
                s, lambda x: x.translate(str.maketrans("", "", string.punctuation))
            )
        return apply(s, lambda x: x.translate(None, string.punctuation))
    elif cleaner == "stopwords":
        stopwords = cfg.get("stopwords") or []

        def clean_stopwords(x):
            return " ".join([w for w in x.split(" ") if w not in stopwords])

        return apply(s, clean_stopwords)
    elif cleaner == "nltk_stopwords":
        language = cfg.get("language") or "english"
        try:
            import nltk

            nltk.download("stopwords")
            nltk.download("punkt")

            nltk_stopwords_set = set(nltk.corpus.stopwords.words(language))

            def clean_nltk_stopwords(x):
                return " ".join(
                    [
                        w
                        for w in nltk.tokenize.word_tokenize(x)
                        if w not in nltk_stopwords_set
                    ]
                )

            return apply(s.fillna(""), clean_nltk_stopwords)
        except ImportError:
            raise Exception(
                "You must install the 'nltk' package in order to use this cleaner!"
            )
    elif cleaner == "drop_numbers":
        return s.str.replace(r"[0-9]+", "", regex=True)
    elif cleaner == "keep_alpha":
        return apply(s, lambda x: "".join(c for c in x if c.isalpha()))
    elif cleaner == "normalize_accents":
        return apply(
            s,
            lambda x: unicodedata.normalize("NFKD", u"{}".format(x))
            .encode("ASCII", "ignore")
            .decode("utf-8"),
        )
    elif cleaner == "drop_all_space":
        return s.str.replace(r"[ ]+", "", regex=True)
    elif cleaner == "drop_repeated_words":

        def drop_repeated_words(val):
            # Keep a word only when it differs from the one right before it.
            words = val.split(" ")
            return " ".join(
                w for i, w in enumerate(words) if i == 0 or words[i - 1] != w
            )

        return apply(s, drop_repeated_words)
    elif cleaner == "add_word_number_space":
        # Surround every integer/decimal number with single spaces.
        return s.str.replace(r"(\d+(\.\d+)?)", r" \1 ", regex=True)
    elif cleaner == "drop_repeated_chars":

        def drop_repeated_chars(val):
            # Keep a character only when it differs from the previous one.
            return "".join(
                c for i, c in enumerate(val) if i == 0 or val[i - 1] != c
            )

        return apply(s, drop_repeated_chars)
    elif cleaner == "update_case":
        case = cfg.get("caseType")
        return getattr(s.str, case)()
    # Unrecognised cleaner names fall through and yield None (unchanged
    # from the original behaviour).
    return None
Example #8
0
    def build_column(self, data):
        """Convert one column of ``data`` from one data type to another.

        Configuration is read from ``self.cfg``:

        * ``col`` -- name of the column to convert.
        * ``from`` -- current dtype name; ``classify_type`` maps it to a
          single-letter classifier (``S``, ``I``, ``F``, ``D`` or ``B``).
        * ``to`` -- target type (e.g. ``"date"``, ``"int"``, ``"float"``,
          ``"bool"``, ``"hex"`` or any dtype accepted by ``Series.astype``).
        * ``fmt`` -- optional date format, used for string -> date parsing
          and date -> string rendering (default ``"%Y%m%d"`` when rendering).
        * ``unit`` -- for int <-> date conversions; ``"YYYYMMDD"`` selects
          digit-encoded dates, otherwise it is passed to ``pd.to_datetime``
          (default ``"D"``).

        :param data: frame holding the column to convert
        :return: ``pd.Series`` named ``self.name`` and indexed like the
                 source column
        :raises NotImplementedError: when ``from`` maps to an unsupported
                                     classifier
        """
        col, from_type, to_type = (self.cfg.get(p) for p in ["col", "from", "to"])
        s = data[col]
        classifier = classify_type(from_type)
        if (
            classifier == "S"
        ):  # col can be (str or category) -> date, int, float, bool, category
            if to_type == "date":
                date_kwargs = {}
                if self.cfg.get("fmt"):
                    date_kwargs["format"] = self.cfg["fmt"]
                else:
                    date_kwargs["infer_datetime_format"] = True
                return pd.Series(
                    pd.to_datetime(s, **date_kwargs), name=self.name, index=s.index
                )
            elif to_type == "int":
                if s.str.startswith("0x").any():

                    def str_hex_to_int(v):
                        return v if pd.isnull(v) else int(v, base=16)

                    return pd.Series(
                        apply(s, str_hex_to_int), name=self.name, index=s.index
                    )
                # Going through float first lets strings like "1.0" convert.
                return pd.Series(
                    s.astype("float").astype("int"), name=self.name, index=s.index
                )
            elif to_type == "float":
                if s.str.startswith("0x").any():

                    def str_hex_to_float(v):
                        # Pass nulls through, mirroring the hex -> int path,
                        # instead of failing inside float.fromhex.
                        return v if pd.isnull(v) else float.fromhex(v)

                    return pd.Series(
                        apply(s, str_hex_to_float), name=self.name, index=s.index
                    )
                return pd.Series(
                    pd.to_numeric(s, errors="coerce"), name=self.name, index=s.index
                )
            # "float" was fully handled above, so the only special case left
            # for mixed columns is bool.
            if from_type.startswith("mixed") and to_type == "bool":

                def _process_mixed_bool(v):
                    if isinstance(v, bool):
                        return v
                    if isinstance(v, six.string_types):
                        return dict(true=True, false=False).get(v.lower(), np.nan)
                    return np.nan

                return pd.Series(
                    apply(s, _process_mixed_bool), name=self.name, index=s.index
                )
            return pd.Series(s.astype(to_type), name=self.name, index=s.index)
        elif classifier == "I":  # date, float, category, str, bool
            if to_type == "date":
                unit = self.cfg.get("unit") or "D"
                if unit == "YYYYMMDD":
                    return pd.Series(
                        apply(s.astype("str"), pd.Timestamp),
                        name=self.name,
                        index=s.index,
                    )
                return pd.Series(
                    pd.to_datetime(s, unit=unit), name=self.name, index=s.index
                )
            elif to_type == "hex":

                def int_to_hex(v):
                    return v if pd.isnull(v) else hex(v)

                return pd.Series(apply(s, int_to_hex), name=self.name, index=s.index)
            return pd.Series(s.astype(to_type), name=self.name, index=s.index)
        elif classifier == "F":  # str, int
            if to_type == "hex":
                return pd.Series(apply(s, float.hex), name=self.name, index=s.index)
            return pd.Series(s.astype(to_type), name=self.name, index=s.index)
        elif classifier == "D":  # str, int
            if to_type == "int":
                unit = self.cfg.get("unit")
                if unit == "YYYYMMDD":
                    return pd.Series(
                        s.dt.strftime("%Y%m%d").astype(int),
                        name=self.name,
                        index=s.index,
                    )
                # Epoch seconds; name and index are set like every other
                # branch so the result lines up with the source column.
                return pd.Series(
                    apply(s, lambda x: time.mktime(x.timetuple())).astype(int),
                    name=self.name,
                    index=s.index,
                )
            return pd.Series(
                s.dt.strftime(self.cfg.get("fmt") or "%Y%m%d"),
                name=self.name,
                index=s.index,
            )
        elif classifier == "B":
            return pd.Series(s.astype(to_type), name=self.name, index=s.index)
        raise NotImplementedError(
            "data type conversion not supported for dtype: {}".format(from_type)
        )