def _prepare_df_for_cleaning(self, df: pd.DataFrame, text_column: AnyStr,
                                 language_column: AnyStr,
                                 language: AnyStr) -> None:
        """Private method to prepare a Pandas dataframe in-place before feeding it to the `self.clean_df` method

        Tokenizes the content of the text column into a new column containing spaCy documents
        Adds new columns to hold the future outputs of the cleaner method

        Args:
            df: Input pandas DataFrame
            text_column: Name of the column containing text data
            language_column: Name of the column with language codes in ISO 639-1 format
            language: Language code in ISO 639-1 format
                If equal to "language_column" this parameter is ignored in favor of language_column

        """
        self.output_column_descriptions = {}
        for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
            if k == "cleaned":
                column_name = generate_unique(k, df.keys(), text_column)
                self.output_column_descriptions[column_name] = v
            elif k in self.token_filters and self.keep_filtered_tokens:
                column_name = generate_unique(f"{v.lower()}s", df.keys(),
                                              text_column)
                self.output_column_descriptions[
                    column_name] = f"{v}s in the original text"
        self.tokenizer.tokenize_df(df, text_column, language_column, language)
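
The `generate_unique` helper used throughout these snippets is not shown here; a minimal sketch of the behavior it is assumed to have (prefix the name, then de-duplicate against existing columns) could look like:

import pandas as pd

def generate_unique(name, existing_names, prefix):
    # Assumed behavior, not the actual implementation: combine prefix and
    # name, then append a counter until there is no collision with an
    # existing column name.
    candidate = f"{prefix}_{name}"
    existing = set(existing_names)
    suffix = 1
    while candidate in existing:
        candidate = f"{prefix}_{name}_{suffix}"
        suffix += 1
    return candidate

df = pd.DataFrame({"review_cleaned": ["..."]})
print(generate_unique("cleaned", df.keys(), "review"))  # "review_cleaned_1"
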
 def __init__(
     self,
     input_df: pd.DataFrame,
     input_folder: dataiku.Folder = None,
     minimum_score: float = 0.0,
     orientation_correction: bool = True,
     column_prefix: AnyStr = "text_api",
     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
     parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
 ):
     super().__init__(
         input_df=input_df,
         input_folder=input_folder,
         column_prefix=column_prefix,
         error_handling=error_handling,
         parallel_workers=parallel_workers,
     )
     self.minimum_score = float(minimum_score)
     self.orientation_correction = bool(orientation_correction)
     self.orientation_column = generate_unique("orientation_correction",
                                               input_df.keys(),
                                               column_prefix)
     self.text_column_list = generate_unique("detections_list",
                                             input_df.keys(), column_prefix)
     self.text_column_concat = generate_unique("detections_concat",
                                               input_df.keys(),
                                               column_prefix)
     self._compute_column_description()
 def __init__(
     self,
     input_df: pd.DataFrame,
     category_level: UnsafeContentCategoryLevelEnum = UnsafeContentCategoryLevelEnum.TOP,
     content_categories_top_level: List[
         UnsafeContentCategoryTopLevelEnum] = [],
     content_categories_second_level: List[
         UnsafeContentCategorySecondLevelEnum] = [],
     column_prefix: AnyStr = "moderation_api",
     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
 ):
     super().__init__(
         input_df=input_df,
         column_prefix=column_prefix,
         error_handling=error_handling,
     )
     self.category_level = category_level
     if self.category_level == UnsafeContentCategoryLevelEnum.TOP:
         self.content_category_enum = UnsafeContentCategoryTopLevelEnum
         self.content_categories = content_categories_top_level
     else:
         self.content_category_enum = UnsafeContentCategorySecondLevelEnum
         self.content_categories = content_categories_second_level
     self.is_unsafe_column = generate_unique("unsafe_content",
                                             self.input_df.keys(),
                                             self.column_prefix)
     self.unsafe_list_column = generate_unique("unsafe_categories",
                                               self.input_df.keys(),
                                               self.column_prefix)
     self._compute_column_description()
 def __init__(
     self,
     input_df: pd.DataFrame,
     num_objects: int,
     orientation_correction: bool = True,
     input_folder: dataiku.Folder = None,
     column_prefix: AnyStr = "object_api",
     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
     parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
 ):
     super().__init__(
         input_df=input_df,
         input_folder=input_folder,
         column_prefix=column_prefix,
         error_handling=error_handling,
         parallel_workers=parallel_workers,
     )
     self.num_objects = int(num_objects)
     self.orientation_correction = bool(orientation_correction)
     self.orientation_column = generate_unique("orientation_correction",
                                               input_df.keys(),
                                               column_prefix)
     self.label_list_column = generate_unique("label_list", input_df.keys(),
                                              column_prefix)
     self.label_name_columns = [
         generate_unique("label_" + str(n + 1) + "_name", input_df.keys(),
                         column_prefix) for n in range(num_objects)
     ]
     self.label_score_columns = [
         generate_unique("label_" + str(n + 1) + "_score", input_df.keys(),
                         column_prefix) for n in range(num_objects)
     ]
     self._compute_column_description()
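
For illustration only, with `num_objects=2` and the default `column_prefix` the constructor above reserves one name column and one score column per detected object; assuming `generate_unique` simply prefixes names as sketched earlier, the reserved columns would be roughly:

num_objects = 2
column_prefix = "object_api"
# Actual names depend on generate_unique and on collisions with input_df columns
label_name_columns = [f"{column_prefix}_label_{n + 1}_name" for n in range(num_objects)]
label_score_columns = [f"{column_prefix}_label_{n + 1}_score" for n in range(num_objects)]
print(label_name_columns)   # ['object_api_label_1_name', 'object_api_label_2_name']
print(label_score_columns)  # ['object_api_label_1_score', 'object_api_label_2_score']
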
 def _compute_column_description(self):
     """Compute output column names and descriptions"""
     self.score_column = generate_unique("score", self.input_df.keys(), self.column_prefix)
     self.column_description_dict[self.score_column] = "Confidence score in the crop hint from 0 to 1"
     self.importance_column = generate_unique("importance_fraction", self.input_df.keys(), self.column_prefix)
     self.column_description_dict[
         self.importance_column
     ] = "Importance of the crop hint with respect to the original image from 0 to 1"
 def _compute_column_description(self):
     """Compute output column names and descriptions"""
     self.text_column_concat = generate_unique("detections_concat", self.input_df.keys(), self.column_prefix)
     self.column_description_dict[self.text_column_concat] = "Concatenated text detections from the API"
     self.language_code_column = generate_unique("language_code", self.input_df.keys(), self.column_prefix)
     self.column_description_dict[self.language_code_column] = "Detected language code from the API"
     self.language_score_column = generate_unique("language_score", self.input_df.keys(), self.column_prefix)
     self.column_description_dict[
         self.language_score_column
     ] = "Confidence score in the detected language from 0 to 1"
 def __init__(
     self,
     input_df: pd.DataFrame,
     sentiment_scale: AnyStr = "ternary",
     column_prefix: AnyStr = "sentiment_api",
     error_handling: ErrorHandlingEnum = ErrorHandlingEnum.LOG,
 ):
     super().__init__(input_df, column_prefix, error_handling)
     self.sentiment_scale = sentiment_scale
     self.sentiment_score_column = generate_unique("score", input_df.keys(), self.column_prefix)
     self.sentiment_score_scaled_column = generate_unique("score_scaled", input_df.keys(), column_prefix)
     self.sentiment_magnitude_column = generate_unique("magnitude", input_df.keys(), column_prefix)
     self._compute_column_description()
 def _compute_column_description(self):
     for n in range(self.num_categories):
         category_column = generate_unique(
             "category_" + str(n + 1) + "_name", self.input_df.keys(), self.column_prefix,
         )
         confidence_column = generate_unique(
             "category_" + str(n + 1) + "_confidence", self.input_df.keys(), self.column_prefix,
         )
         self.column_description_dict[category_column] = "Name of the category {} representing the document".format(
             str(n + 1)
         )
         self.column_description_dict[confidence_column] = "Classifier's confidence in the category {}".format(
             str(n + 1)
         )
    def clean_df(
        self,
        df: pd.DataFrame,
        text_column: AnyStr,
        language_column: AnyStr = "",
        language: AnyStr = "language_column",
    ) -> pd.DataFrame:
        """Public method to clean a text column in a pandas DataFrame, given language information

        Prepare the dataframe with `self._prepare_df_for_cleaning` to obtain a new column with spaCy documents
        Run `self.clean_document` on all documents with multithreading
        Format the output dataframe

        Args:
            df: Input pandas DataFrame
            text_column: Name of the column containing text data
            language_column: Name of the column with language codes in ISO 639-1 format
            language: Language code in ISO 639-1 format
                If equal to "language_column" this parameter is ignored in favor of language_column

        Returns:
            Input dataframe with new columns at the end:
                - Cleaned text after filter, lemmatization, lowercase and unicode normalization steps
                - One column for each selected `self.token_filters` with a concatenation of filtered tokens

        """
        self._prepare_df_for_cleaning(df, text_column, language_column,
                                      language)
        start = perf_counter()
        logging.info(f"Cleaning {len(df.index)} document(s)...")
        output = [{}] * len(df.index)
        doc_iterator = (doc for doc in df[self.tokenizer.tokenized_column])
        with ThreadPoolExecutor(
                max_workers=self.DEFAULT_NUM_THREADS) as executor:
            output = list(
                executor.map(lambda x: self.clean_document(x), doc_iterator))
        for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
            if k == "cleaned":
                column_name = generate_unique(k, df.keys(), text_column)
                df[column_name] = [d.get(k, "") for d in output]
            elif k in self.token_filters and self.keep_filtered_tokens:
                column_name = generate_unique(f"{v.lower()}s", df.keys(),
                                              text_column)
                df[column_name] = [d.get(k, "") for d in output]
        logging.info(
            f"Cleaning {len(df.index)} document(s): done in {perf_counter() - start:.2f} seconds"
        )
        del df[self.tokenizer.tokenized_column]
        return df
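
The core of `clean_df` is a multithreaded map over the tokenized column followed by re-assembly into output columns; a self-contained sketch of that pattern, with a stand-in for `self.clean_document`, is:

from concurrent.futures import ThreadPoolExecutor

import pandas as pd

def clean_document(doc) -> dict:
    # Stand-in for self.clean_document: the real method returns a dict keyed
    # by the entries of OUTPUT_COLUMN_DESCRIPTIONS.
    return {"cleaned": str(doc).strip().lower()}

df = pd.DataFrame({"text": ["  Hello World  ", "Some TEXT"]})
with ThreadPoolExecutor(max_workers=4) as executor:
    output = list(executor.map(clean_document, df["text"]))
df["text_cleaned"] = [d.get("cleaned", "") for d in output]
print(df)
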
 def format_row(self, row: Dict) -> Dict:
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
     moderation_labels = response.get("ModerationLabels", [])
     row[self.is_unsafe_column] = False
     row[self.unsafe_list_column] = ""
     unsafe_list = []
     for category in self.content_categories:
         confidence_column = generate_unique(
             category.name.lower() + "_score", self.input_df.keys(),
             self.column_prefix)
         row[confidence_column] = ""
         if self.category_level == UnsafeContentCategoryLevelEnum.TOP:
             scores = [
                 l.get("Confidence") for l in moderation_labels
                 if l.get("ParentName", "") == category.value
             ]
         else:
             scores = [
                 l.get("Confidence") for l in moderation_labels
                 if l.get("Name", "") == category.value
             ]
         if len(scores) != 0:
             unsafe_list.append(str(category.value))
             row[confidence_column] = scores[0]
     if len(unsafe_list) != 0:
         row[self.is_unsafe_column] = True
         row[self.unsafe_list_column] = unsafe_list
     return row
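
A `format_row` method like the one above is typically applied row by row over the raw API responses; the following is a hedged, self-contained illustration with simplified column names, not the plugin's actual wiring:

import json

import pandas as pd

def format_row(row: dict) -> dict:
    # Simplified: parse the raw response and flag rows carrying any moderation label
    response = json.loads(row["response"] or "{}")
    labels = response.get("ModerationLabels", [])
    row["unsafe_content"] = len(labels) != 0
    row["unsafe_categories"] = [label.get("Name", "") for label in labels]
    return row

df = pd.DataFrame({
    "response": ['{"ModerationLabels": [{"Name": "Violence", "Confidence": 97.2}]}', "{}"]
})
formatted_df = df.apply(lambda row: format_row(row.to_dict()), axis=1, result_type="expand")
print(formatted_df)
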
 def _compute_column_description(self):
     """Compute output column names and descriptions"""
     for name, member in UnsafeContentCategory.__members__.items():
         category_column = generate_unique(name.lower() + "_likelihood", self.input_df.keys(), self.column_prefix)
         self.column_description_dict[
             category_column
         ] = f"Likelihood of category '{member.value}' from 1 (VERY_UNLIKELY) to 5 (VERY_LIKELY)"
 def format_row(self, row: Dict) -> Dict:
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
     categories = sorted(
         response.get("categories", []), key=lambda x: x.get("confidence", 0), reverse=True
     )
     for n in range(self.num_categories):
         category_column = generate_unique("category_" + str(n + 1) + "_name", row.keys(), self.column_prefix)
         confidence_column = generate_unique(
             "category_" + str(n + 1) + "_confidence", row.keys(), self.column_prefix
         )
         if len(categories) > n:
             row[category_column] = categories[n].get("name", "")
             row[confidence_column] = categories[n].get("confidence")
         else:
             row[category_column] = ""
             row[confidence_column] = None
     return row
 def _compute_column_description(self):
     for n, m in EntityTypeEnum.__members__.items():
         entity_type_column = generate_unique("entity_type_" + n.lower(),
                                              self.input_df.keys(),
                                              self.column_prefix)
         self.column_description_dict[
             entity_type_column] = "List of '{}' entities recognized by the API".format(
                 str(m.value))
 def _compute_column_description(self):
     """Compute output column names and descriptions"""
     if vision.Feature.Type.LABEL_DETECTION in self.content_categories:
         self.label_list_column = generate_unique("label_list", self.input_df.keys(), self.column_prefix)
         self.column_description_dict[self.label_list_column] = "List of labels from the API"
     if vision.Feature.Type.OBJECT_LOCALIZATION in self.content_categories:
         self.object_list_column = generate_unique("object_list", self.input_df.keys(), self.column_prefix)
         self.column_description_dict[self.object_list_column] = "List of objects from the API"
     if vision.Feature.Type.LANDMARK_DETECTION in self.content_categories:
         self.landmark_list_column = generate_unique("landmark_list", self.input_df.keys(), self.column_prefix)
         self.column_description_dict[self.landmark_list_column] = "List of landmarks from the API"
     if vision.Feature.Type.LOGO_DETECTION in self.content_categories:
         self.logo_list_column = generate_unique("logo_list", self.input_df.keys(), self.column_prefix)
         self.column_description_dict[self.logo_list_column] = "List of logos from the API"
     if vision.Feature.Type.WEB_DETECTION in self.content_categories:
         self.web_label_column = generate_unique("web_label", self.input_df.keys(), self.column_prefix)
         self.column_description_dict[self.web_label_column] = "Web label from the API"
         self.web_entity_list_column = generate_unique("web_entity_list", self.input_df.keys(), self.column_prefix)
         self.column_description_dict[self.web_entity_list_column] = "List of Web entities from the API"
         self.web_full_matching_image_list_column = generate_unique(
             "web_full_matching_image_list", self.input_df.keys(), self.column_prefix
         )
         self.column_description_dict[
             self.web_full_matching_image_list_column
         ] = "List of Web images fully matching the input image"
         self.web_partial_matching_image_list_column = generate_unique(
             "web_partial_matching_image_list", self.input_df.keys(), self.column_prefix
         )
         self.column_description_dict[
             self.web_partial_matching_image_list_column
         ] = "List of Web images partially matching the input image"
         self.web_page_match_list_column = generate_unique(
             "web_page_match_list", self.input_df.keys(), self.column_prefix
         )
         self.column_description_dict[
             self.web_page_match_list_column
         ] = "List of Web pages with images matching the input image"
         self.web_similar_image_list_column = generate_unique(
             "web_similar_image_list", self.input_df.keys(), self.column_prefix
         )
         self.column_description_dict[
             self.web_similar_image_list_column
         ] = "List of Web images visually similar to the input image"
 def format_row(self, row: Dict) -> Dict:
     """Extract the likelihood of each unsafe content category from a row with an API response"""
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
     moderation_labels = response.get("safeSearchAnnotation", {})
     for category in self.unsafe_content_categories:
         category_column = generate_unique(
             category.name.lower() + "_likelihood", self.input_df.keys(), self.column_prefix
         )
         row[category_column] = moderation_labels.get(category.name.lower(), "")
     return row
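
For reference, the method above expects a parsed Google Vision response containing a `safeSearchAnnotation` block keyed by lowercase category name; the values below are illustrative only:

example_response = {
    "safeSearchAnnotation": {
        "adult": "VERY_UNLIKELY",
        "violence": "POSSIBLE",
        "racy": "UNLIKELY",
    }
}
# For an enum member named ADULT, the method reads
# example_response["safeSearchAnnotation"]["adult"] into the column
# generated from "adult_likelihood".
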
    def tokenize_df(self,
                    df: pd.DataFrame,
                    text_column: AnyStr,
                    language_column: AnyStr = "",
                    language: AnyStr = "language_column") -> pd.DataFrame:
        """Public method to tokenize a text column in a pandas DataFrame, given language information

        This method adds a new column to the DataFrame, whose name is saved as the `tokenized_column` attribute

        Args:
            df: Input pandas DataFrame
            text_column: Name of the column containing text data
            language_column: Name of the column with language codes in ISO 639-1 format
            language: Language code in ISO 639-1 format, cf. https://spacy.io/usage/models#languages
                if equal to "language_column" this parameter is ignored in favor of language_column

        Returns:
            DataFrame with all columns from the input, plus a new column with tokenized spaCy documents

        """
        self.tokenized_column = generate_unique("tokenized", df.keys(),
                                                text_column)
        # Initialize the tokenized column to empty documents
        df[self.tokenized_column] = pd.Series([Doc(Vocab())] * len(df.index),
                                              dtype="object", index=df.index)
        if language == "language_column":
            languages = df[language_column].dropna().unique()
            unsupported_languages = set(languages) - set(
                SUPPORTED_LANGUAGES_SPACY.keys())
            if unsupported_languages:
                raise TokenizationError(
                    f"Found {len(unsupported_languages)} unsupported languages in dataset: {unsupported_languages}"
                )
            for lang in languages:  # iterate over languages
                language_indices = df[language_column] == lang
                text_slice = df.loc[
                    language_indices,
                    text_column]  # slicing input df by language
                if len(text_slice) != 0:
                    tokenized_list = self.tokenize_list(text_list=text_slice,
                                                        language=lang)
                    df.loc[
                        language_indices, self.tokenized_column] = pd.Series(
                            tokenized_list,
                            dtype="object",
                            index=text_slice.index,  # keep index (important)
                        )
        else:
            tokenized_list = self.tokenize_list(text_list=df[text_column],
                                                language=language)
            df[self.tokenized_column] = tokenized_list
        return df
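
The per-language slicing in `tokenize_df` can be illustrated with blank spaCy pipelines (no model download needed); this is a simplified sketch, not the class's actual `tokenize_list`:

import pandas as pd
import spacy

df = pd.DataFrame({
    "text": ["The cats are running", "Les chats courent"],
    "lang": ["en", "fr"],
})
df["tokenized"] = pd.Series([None] * len(df.index), index=df.index, dtype="object")
for lang in df["lang"].dropna().unique():
    language_indices = df["lang"] == lang
    text_slice = df.loc[language_indices, "text"]
    docs = list(spacy.blank(lang).pipe(text_slice))
    # Keep the original index so rows stay aligned after slicing
    df.loc[language_indices, "tokenized"] = pd.Series(docs, dtype="object", index=text_slice.index)
print(df["tokenized"].apply(len))  # number of tokens per document
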
 def _compute_column_description(self):
     self.column_description_dict[self.is_unsafe_column] = "Unsafe content detected by the API"
     self.column_description_dict[self.unsafe_list_column] = "List of unsafe content categories detected by the API"
     for n, m in self.content_category_enum.__members__.items():
         confidence_column = generate_unique(n.lower() + "_score",
                                             self.input_df.keys(),
                                             self.column_prefix)
         self.column_description_dict[
             confidence_column] = "Confidence score in category '{}' from 0 to 1".format(
                 m.value)
 def detect_languages_df(self, df: pd.DataFrame,
                         text_column: AnyStr) -> pd.DataFrame:
     self.column_description_dict = OrderedDict()
     for k, v in self.COLUMN_DESCRIPTION_DICT.items():
         self.column_description_dict[generate_unique(
             k, df.keys(), text_column)] = v
     doc_iterator = (doc
                     for _, doc in df[text_column].astype(str).iteritems())
     output_df = df.copy()
     with ThreadPoolExecutor(max_workers=self.NUM_THREADS) as executor:
         lang_output_tuple_list = list(
             executor.map(self.detect_language_doc, doc_iterator))
     for i, col in enumerate(self.column_description_dict.keys()):
         output_df[col] = [t[i] for t in lang_output_tuple_list]
     return output_df
 def format_row(self, row: Dict) -> Dict:
     raw_response = row[self.api_column_names.response]
     response = safe_json_loads(raw_response, self.error_handling)
     entities = response.get("entities", [])
     selected_entity_types = sorted([e.name for e in self.entity_types])
     for n in selected_entity_types:
         entity_type_column = generate_unique("entity_type_" + n.lower(), row.keys(), self.column_prefix)
         row[entity_type_column] = [
             e.get("name")
             for e in entities
             if e.get("type", "") == n and float(e.get("salience", 0)) >= self.minimum_score
         ]
         if len(row[entity_type_column]) == 0:
             row[entity_type_column] = ""
     return row
 def detect_languages_df(self, df: pd.DataFrame,
                         text_column: AnyStr) -> pd.DataFrame:
     """Apply the `detect_language_doc` method to a pandas DataFrame with a text column, with multithreading"""
     self.column_descriptions = {}
     for k, v in self.COLUMN_DESCRIPTIONS.items():
         self.column_descriptions[generate_unique(k, df.keys(),
                                                  text_column)] = v
     doc_iterator = (doc
                     for _, doc in df[text_column].astype(str).iteritems())
     output_df = df.copy()
     with ThreadPoolExecutor(max_workers=self.NUM_THREADS) as executor:
         lang_output_tuple_list = list(
             executor.map(self.detect_language_doc, doc_iterator))
     for i, col in enumerate(self.column_descriptions):
         output_df[col] = [t[i] for t in lang_output_tuple_list]
     return output_df
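
The same fan-out pattern runs end to end with a stand-in detector returning a `(language_code, score)` tuple; the detector below is purely illustrative, not the class's actual `detect_language_doc`:

from concurrent.futures import ThreadPoolExecutor

import pandas as pd

def detect_language_doc(doc: str):
    # Illustrative stand-in: the real method wraps a language-identification model
    return ("en", 0.99) if doc.isascii() else ("und", 0.0)

df = pd.DataFrame({"text": ["hello world", "héllo wörld"]})
output_columns = ["text_language_code", "text_language_score"]
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(detect_language_doc, df["text"].astype(str)))
output_df = df.copy()
for i, col in enumerate(output_columns):
    output_df[col] = [t[i] for t in results]
print(output_df)
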
    def _prepare_df_for_spellchecker(self, df: pd.DataFrame,
                                     text_column: AnyStr,
                                     language_column: AnyStr,
                                     language: AnyStr) -> None:
        """Private method to prepare a Pandas dataframe in-place before feeding it to the spellchecker

        Tokenize the content of the text column into a new column containing spaCy documents
        Add new columns to hold the future outputs of the spellchecker

        Args:
            df: Input pandas DataFrame
            text_column: Name of the column containing text data
            language_column: Name of the column with language codes in ISO 639-1 format
            language: Language code in ISO 639-1 format
                If equal to "language_column" this parameter is ignored in favor of language_column

        """
        self.output_column_descriptions = {}
        for k, v in self.OUTPUT_COLUMN_DESCRIPTIONS.items():
            column_name = generate_unique(k, df.keys(), text_column)
            df[column_name] = pd.Series([""] * len(df.index))
            self.output_column_descriptions[column_name] = v
        self.tokenizer.tokenize_df(df, text_column, language_column, language)
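
The pre-allocation loop above can be sketched in isolation; `OUTPUT_COLUMN_DESCRIPTIONS` below is a hypothetical stand-in for the class constant, which is not shown:

import pandas as pd

OUTPUT_COLUMN_DESCRIPTIONS = {  # hypothetical keys, for illustration only
    "corrected": "Corrected text",
    "misspellings": "Misspelled words in the original text",
}
df = pd.DataFrame({"text": ["helo wrld", "speling"]})
for key in OUTPUT_COLUMN_DESCRIPTIONS:
    df[f"text_{key}"] = pd.Series([""] * len(df.index), index=df.index)
print(df.columns.tolist())  # ['text', 'text_corrected', 'text_misspellings']
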