def predict_into_collection(self, collection: TargetTextCollection,
                            key_mapping: Dict[str, str],
                            batch_size: Optional[int] = None,
                            append_if_exists: bool = True
                            ) -> TargetTextCollection:
    '''
    Predicts on the given collection and stores the predictions back into
    the collection under the keys given by `key_mapping`.

    :param collection: The TargetTextCollection that is to be predicted on
                       and to be the store of the predicted data.
    :param key_mapping: Dictionary mapping the prediction keys that contain
                        the prediction values to the keys that will store
                        those prediction values within the collection that
                        has been predicted on.
    :param batch_size: Specify the batch size to predict on. If left None
                       defaults to 64 unless it is specified in the
                       `model_param_fp` within the constructor then the
                       batch size from the param file is used.
    :param append_if_exists: If False and a TargetText within the collection
                             already has a prediction within the given key
                             based on the `key_mapping` then KeyError is
                             raised.
    :returns: The collection that was predicted on with the new predictions
              within the collection stored in keys that are the values of
              the `key_mapping` argument. Note that all predictions are
              stored within Lists within their respective keys in the
              collection.
    :raises KeyError: If the keys from `key_mapping` are not within the
                      prediction dictionary.
    :raises KeyError: If `append_if_exists` is False and a TargetText within
                      the collection already has a prediction within the
                      given key based on the `key_mapping`.
    '''
    for prediction, original_target in self._predict_iter(
            collection.dict_iterator(), batch_size=batch_size,
            yield_original_target=True):
        text_id = original_target['text_id']
        # Validate every mapped key first so that an error is raised before
        # any data is added to the TargetTextCollection.
        for prediction_key in key_mapping:
            if prediction_key not in prediction:
                raise KeyError(f'The key {prediction_key} from `key_mapping`'
                               f' {key_mapping} is not within the prediction'
                               f' {prediction} for the following TargetText'
                               f' {original_target}')
        for prediction_key, collection_key in key_mapping.items():
            if collection_key not in collection[text_id]:
                collection[text_id][collection_key] = []
            elif not append_if_exists:
                raise KeyError(f'The key {collection_key} from `key_mapping`'
                               f' {key_mapping} already exists within the'
                               f' following TargetText {original_target}')
            collection[text_id][collection_key].append(
                prediction[prediction_key])
    return collection
def text_classification_prediction(model: AllenNLPModel,
                                   dataset: TargetTextCollection,
                                   prediction_key: str) -> None:
    '''
    Predicts a sentence level sentiment for each sample in the dataset and
    stores that label, repeated once per target in the sample, as a List
    appended to `prediction_key` within the relevant TargetText.

    :param model: Model to predict with.
    :param dataset: Collection to predict on and to store predictions in.
    :param prediction_key: Key to store the predicted sentiments under.
    '''
    prediction_iter = model._predict_iter(dataset.dict_iterator(),
                                          yield_original_target=True)
    for prediction, original in prediction_iter:
        label = prediction['label']
        # The single sentence-level label is repeated so that there is one
        # predicted sentiment per target in the sample.
        repeated_labels = [label] * len(original['target_sentiments'])
        sample = dataset[original['text_id']]
        if prediction_key not in sample:
            sample[prediction_key] = []
        sample[prediction_key].append(repeated_labels)
def tokens_per_sentence(collection: TargetTextCollection,
                        tokeniser: Callable[[str], List[str]]
                        ) -> Dict[int, int]:
    '''
    :param collection: The collection to generate the statistic for.
    :param tokeniser: The tokenizer to use to split the sentences/texts
                      into tokens. If the collection has already been
                      tokenised then it will use the tokens in the
                      `tokenized_text` key within each sample in the
                      collection else it will produce the tokens within
                      this function and save them to that key as well.
                      For a module of compatible tokenisers
                      :py:mod:`target_extraction.tokenizers`
    :returns: A dictionary of sentence lengths and their frequency.
              **This is a defaultdict where the value will be 0 if the
              key does not exist.**
    '''
    # Only the first sample is inspected; tokenising is assumed to be
    # all-or-nothing across the collection.
    is_tokenised = 'tokenized_text' in next(collection.dict_iterator())
    if not is_tokenised:
        # Side effect: saves tokens to the `tokenized_text` key of every
        # sample in the collection.
        collection.tokenize(tokeniser)
    length_count: Dict[int, int] = defaultdict(int)
    for target_text in collection.values():
        length_count[len(target_text['tokenized_text'])] += 1
    return length_count