Example #1
    def train_and_save_on_tokens(self,
                                 tokens: List[str],
                                 save_path: str,
                                 settings: DetectingSettings,
                                 train_sample_df: pandas.DataFrame,
                                 punc_set: str = ".,/-",
                                 symbol_set: Optional[str] = None,
                                 string_checks: bool = False,
                                 compress: bool = False):
        self.model = BaseTokenSequenceClassifierModel.get_classifier(
            settings.use_spacy,
            pre_window=settings.pre_window, post_window=settings.post_window,
            match_tokens=tokens, letter_set=string.ascii_letters,
            digit_set=string.digits, punc_set=punc_set, symbol_set=symbol_set,
            string_checks=string_checks)

        # build feature and target training sample
        train_feature_data, train_target_data = self.process_sample(
            train_sample_df, build_target_data=True)
        # initialize the sklearn model based on the requested type
        if settings.model_type == 'extra_trees':
            import sklearn.ensemble
            model = sklearn.ensemble.ExtraTreesClassifier(class_weight="balanced")
        elif settings.model_type == 'random_forest':
            import sklearn.ensemble
            model = sklearn.ensemble.RandomForestClassifier(class_weight="balanced")
        else:
            raise ValueError("unsupported model_type: %r" % settings.model_type)
        # train
        self.model.train_model(model, train_feature_data, train_target_data)
        if compress:
            self.save_compressed_model(save_path)
        else:
            self.save_model(save_path)
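
A minimal usage sketch for the method above. The `TokenSequenceDetector` owner class, the `DetectingSettings` constructor arguments, and the toy dataframe are assumptions for illustration; none of them appear in the original snippet.

import pandas

# hypothetical wrapper class that defines train_and_save_on_tokens()
detector = TokenSequenceDetector()

# hypothetical DetectingSettings values; the real constructor may differ
settings = DetectingSettings(use_spacy=False, pre_window=2, post_window=2,
                             model_type='random_forest')

# toy training sample with the columns the trainer expects
train_df = pandas.DataFrame({
    "sentence": ["Add 2 cups of flour.", "Stir in 1 tsp of salt."],
    "quantity_formatted": ["2 cups", "1 tsp"],
})

detector.train_and_save_on_tokens(
    tokens=["cups", "tsp"],
    save_path="quantity_model.pkl.gz",
    settings=settings,
    train_sample_df=train_df,
    compress=True)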
Example #2
def process_sample(
    sample_df: pandas.DataFrame,
    s: BaseTokenSequenceClassifierModel,
    build_target_data: bool = True,
    pre_alloc_multiple: int = 30,
    column_name_formatted: str = 'quantity_formatted',
    outer_class: int = 0,
    start_class: int = 1,
    inner_class: int = 2,
    end_class: int = 3,
    get_target_start_end: Callable[[str, str, Any], List[Tuple[
        int, int]]] = get_target_start_end_from_text,
    feature_mask_column: Optional[str] = None
) -> Union[numpy.ndarray, Tuple[numpy.ndarray, numpy.ndarray]]:
    """
    Process a sample file to create feature and target data.
    :param sample_df: dataframe with at least 'sentence' column
    :param s: TokenSequenceClassifierModel or SpacyTokenSequenceClassifierModel
    :param build_target_data: build target data vector (if true)
    :param pre_alloc_multiple:
    :param column_name_formatted: "quantity_formatted" or "noun_phrase_formatted" ...
    :param outer_class:
    :param start_class:
    :param inner_class:
    :param end_class:
    :return: (feature_data, target_data) if build_target_data = True or just feature_data
    """

    # pre-allocate feature data approximately based on conservative sentence token count
    num_token_guess = sample_df.shape[0] * pre_alloc_multiple
    num_token = 0
    feature_data = numpy.zeros((num_token_guess, len(s.feature_list)),
                               dtype=numpy.int8)
    if build_target_data:
        target_data = numpy.zeros((num_token_guess, ))

    # iterate through rows; row_pos is the positional row number
    for row_pos, (_, row) in enumerate(sample_df.iterrows()):
        # set key variables
        text = row["sentence"]
        if build_target_data:
            # quantity_formatted or noun_phrase_formatted
            entity_coords = get_target_start_end(text, column_name_formatted,
                                                 row)

        # set feature rows
        feature_mask = row[feature_mask_column] if feature_mask_column else None
        row_feature_data, row_tokens = s.get_feature_data(
            text, feature_mask=feature_mask)
        row_num_tokens = row_feature_data.shape[0]

        # check if we are within initial allocation
        if num_token + row_num_tokens <= feature_data.shape[0]:
            feature_data[num_token:(num_token +
                                    row_num_tokens), :] = row_feature_data
        else:
            # grow the arrays: extrapolate the final size from the fraction of
            # rows processed so far (guarding against division by zero on the
            # first row), and make sure the current row always fits
            rescale_multiple = sample_df.shape[0] / float(max(row_pos, 1))
            rescale_size = max(
                int(numpy.ceil(feature_data.shape[0] * rescale_multiple)),
                num_token + row_num_tokens)
            feature_data.resize((rescale_size, feature_data.shape[1]),
                                refcheck=False)
            if build_target_data:
                target_data.resize((rescale_size, ), refcheck=False)
            feature_data[num_token:(num_token +
                                    row_num_tokens), :] = row_feature_data

        # set target vector entries using a BIOE-style scheme: outer (default),
        # start, inner, and end classes per token
        if build_target_data:
            for i in range(row_num_tokens):
                token_start, token_end = row_tokens[i]

                target_data[num_token + i] = outer_class
                for start_pos, end_pos in entity_coords:
                    if token_start <= start_pos < token_end:
                        # entity starts within this token
                        target_data[num_token + i] = start_class
                    elif token_start < end_pos <= token_end:
                        # entity ends within this token
                        target_data[num_token + i] = end_class
                    elif start_pos < token_end and token_start < end_pos:
                        # token lies strictly inside the entity span
                        target_data[num_token + i] = inner_class

        num_token += row_num_tokens

    if build_target_data:
        return feature_data[0:num_token], target_data[0:num_token]
    return feature_data[0:num_token]
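
To make the class assignment concrete, here is a self-contained sketch of the labeling rule used in the loop above. The sentence, token offsets, and the default class values 0 through 3 are illustrative only.

# character spans of the tokens in "Add 2 cups of flour"
tokens = [(0, 3), (4, 5), (6, 10), (11, 13), (14, 19)]
# character span of the entity "2 cups"
entity_coords = [(4, 10)]

labels = []
for token_start, token_end in tokens:
    label = 0                                    # outer_class
    for start_pos, end_pos in entity_coords:
        if token_start <= start_pos < token_end:
            label = 1                            # start_class
        elif token_start < end_pos <= token_end:
            label = 3                            # end_class
        elif start_pos < token_end and token_start < end_pos:
            label = 2                            # inner_class
    labels.append(label)

print(labels)  # [0, 1, 3, 0, 0]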
Example #3
    def load_from_stream(self, stream: Any):
        self.model = BaseTokenSequenceClassifierModel.load_from_stream(stream)
Example #4
    def load_compressed(self, file_path: str):
        self.model = BaseTokenSequenceClassifierModel.load_from_file_compressed(
            file_path)
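
A minimal round-trip sketch for the two loaders above; the `TokenSequenceDetector` wrapper and the file names are the same assumptions used in the Example #1 sketch.

detector = TokenSequenceDetector()

# load a model previously written with compress=True
detector.load_compressed("quantity_model.pkl.gz")

# or restore from an already-open binary stream
with open("quantity_model.pkl", "rb") as stream:
    detector.load_from_stream(stream)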