Esempio n. 1
0
    def detect(df, max_avg_length=30, columns_ignore=None):
        """
        Find short text columns that NumAlphaParser reports as num-alpha.

        A candidate column is reported when the average length of its
        non-empty stringified cells is below max_avg_length,
        NumAlphaParser.num_check is falsy for it and
        NumAlphaParser.is_num_alpha is truthy for it.

        params:
        - df [DataFrame]: data whose columns are inspected by position
        - max_avg_length [int]: exclusive upper bound on the average cell length
        - columns_ignore [List]: column indices to skip; None (default) skips none

        returns:
        - dict with two parallel lists: "columns_to_perform" holds the selected
          column indices and "split_to" holds len() of the value returned by
          NumAlphaParser.num_alpha_splitter for the same column
        """
        # None instead of a mutable list() default; behaves like the old default.
        ignored = set() if columns_ignore is None else set(columns_ignore)

        positive_semantic_types = {"http://schema.org/Text"}
        # NOTE(review): presumably the indices of columns tagged with one of
        # these semantic types -- confirm against HelperFunction.cols_to_clean.
        cols_to_detect = HelperFunction.cols_to_clean(df,
                                                      positive_semantic_types)
        require_checking = list(set(cols_to_detect).difference(ignored))

        extends = {"columns_to_perform": [], "split_to": []}
        for one_column in require_checking:
            # Slice the column once and reuse it for every check below.
            column = df.iloc[:, one_column]
            # Stringify each cell once; empty strings are left out of the mean.
            lengths = [len(text) for text in map(str, column) if text]
            if not lengths:
                continue

            avg_len = sum(lengths) / len(lengths)
            # Short-circuit order matters: the parser checks only run for
            # columns that already passed the length test.
            is_candidate = (
                avg_len < max_avg_length
                and not NumAlphaParser.num_check(column)
                and NumAlphaParser.is_num_alpha(column)
            )
            if is_candidate:
                result = NumAlphaParser.num_alpha_splitter(column)
                extends["columns_to_perform"].append(one_column)
                extends["split_to"].append(len(result))

        return extends
Esempio n. 2
0
    def detect(df, columns_ignore=None):
        """
        Find text columns for which PhoneParser.is_phone is truthy.

        params:
        - df [DataFrame]: data whose columns are inspected by position
        - columns_ignore [List]: column indices to skip; None (default) skips none

        returns:
        - dict with "columns_to_perform" (the selected column indices) and
          "split_to", which this method always leaves as an empty list
        """
        # None instead of a mutable list() default; behaves like the old default.
        ignored = set() if columns_ignore is None else set(columns_ignore)

        positive_semantic_types = {"http://schema.org/Text"}
        # NOTE(review): presumably the indices of columns tagged with one of
        # these semantic types -- confirm against HelperFunction.cols_to_clean.
        cols_to_detect = HelperFunction.cols_to_clean(df,
                                                      positive_semantic_types)
        require_checking = list(set(cols_to_detect).difference(ignored))

        phone_columns = [
            one_column for one_column in require_checking
            if PhoneParser.is_phone(df.iloc[:, one_column])
        ]
        return {"columns_to_perform": phone_columns, "split_to": []}
    def detect_date_columns(self, sampled_df, except_list=None):
        """
        Detects date columns in the sampled_df and returns a list of column indices which have dates

        A column is reported when self._parse_column(sampled_df, idx) returns
        something other than None.

        params:
        - sampled_df [DataFrame]: a sample of rows from the original dataframe for detecting dates
        - except_list [List]: list of column indices to be ignored; None (default) ignores none

        returns:
        - list of column indices, in the order given by HelperFunction.cols_to_clean
        """
        # None instead of a mutable list() default; behaves like the old default.
        excluded = () if except_list is None else except_list

        positive_semantic_types = {
            "https://metadata.datadrivendiscovery.org/types/Time",
            "http://schema.org/Text",
        }
        # NOTE(review): presumably the indices of columns tagged with one of
        # these semantic types -- confirm against HelperFunction.cols_to_clean.
        cols_to_detect = HelperFunction.cols_to_clean(sampled_df,
                                                      positive_semantic_types)

        # The exclusion test comes first so ignored columns are never parsed.
        return [
            idx for idx in cols_to_detect
            if idx not in excluded
            and self._parse_column(sampled_df, idx) is not None
        ]