def get_target_sentiment_distribution(dataset: TargetTextCollection) -> Dict[str, float]:
    '''
    :param dataset: Collection whose targets' sentiment labels are counted.
    :returns: A dictionary mapping each sentiment label to the percentage
              (0-100, rounded to 2 decimal places) of targets in the
              collection that carry that label.
    '''
    target_sentiment_distribution = Counter()
    for target_text in dataset.values():
        # Skip samples with no sentiment labels (Counter.update(None) raises).
        if target_text['target_sentiments'] is not None:
            target_sentiment_distribution.update(target_text['target_sentiments'])
    number_targets = dataset.number_targets()
    for key, value in target_sentiment_distribution.items():
        # Round AFTER scaling to a percentage. Rounding the raw fraction to
        # 2 d.p. first (the previous behaviour) truncated e.g. 1/3 to
        # 33.0 instead of 33.33.
        target_sentiment_distribution[key] = round((value / number_targets) * 100, 2)
    return dict(target_sentiment_distribution)
def average_target_per_sentences(collection: TargetTextCollection,
                                 sentence_must_contain_targets: bool) -> float:
    '''
    Compute the Average Targets per Sentence (ATS) for a collection.

    :param collection: Collection to calculate the ATS on.
    :param sentence_must_contain_targets: When True, only sentences that
        contain at least one target count towards the denominator;
        otherwise every sentence in the collection is counted.
    :returns: Number of targets / number of sentences.
    '''
    if sentence_must_contain_targets:
        sentences = collection.samples_with_targets()
    else:
        sentences = collection
    return float(collection.number_targets()) / float(len(sentences))
def get_sentiment_counts(collection: TargetTextCollection, sentiment_key: str,
                         normalised: bool = True) -> Dict[str, float]:
    '''
    :param collection: The collection containing the sentiment data
    :param sentiment_key: The key in each TargetText within the collection
                          that contains the True sentiment value.
    :param normalised: Whether to normalise the values in the dictionary
                       by the number of targets in the collection.
    :returns: A dictionary where keys are sentiment values and the values
              are the number of times they occur in the collection
              (fractions of the total when `normalised` is True).
    :raises ValueError: If the number of sentiment labels found does not
                        match `collection.number_targets()`.
    '''
    sentiment_count = defaultdict(int)
    for target_text in collection.values():
        sentiments = target_text[sentiment_key]
        if sentiments is None:
            continue
        for sentiment_value in sentiments:
            sentiment_count[sentiment_value] += 1
    number_targets = collection.number_targets()
    # The previous version used a bare `assert`, which is silently stripped
    # when Python runs with -O; raise explicitly so the sanity check always
    # fires.
    total_counted = sum(sentiment_count.values())
    if number_targets != total_counted:
        raise ValueError(f'Number of targets ({number_targets}) does not '
                         f'match the number of sentiment labels counted '
                         f'({total_counted})')
    if normalised:
        for sentiment, count in sentiment_count.items():
            sentiment_count[sentiment] = float(count) / float(number_targets)
    return dict(sentiment_count)
def dataset_length(task: str, dataset: TargetTextCollection) -> int:
    '''
    Return the size of `dataset` relevant to the given task.

    :param task: Either 'extraction' (sentence-level) or 'sentiment'
                 (target-level).
    :param dataset: The collection to measure.
    :returns: The number of sentences for 'extraction', the number of
              targets for 'sentiment', and 0 for any other task name.
    '''
    length_getters = {
        'extraction': lambda: len(dataset),
        'sentiment': dataset.number_targets,
    }
    getter = length_getters.get(task)
    return getter() if getter is not None else 0
            # Augmentation succeeded: keep the offset-adjusted target object.
            all_targets.append(aug_target_object)
        except OverLappingTargetsError:
            # This needs to be skipped as when targets overlap it is very
            # difficult to easily calculate all possible span offsets
            # for all other targets. Furthermore there are only 3
            # occasions this happens so it is a very rare occurrence.
            continue
    return all_targets


if __name__ == '__main__':
    # CLI entry point: loads an augmented dataset, expands each sample's
    # augmented targets into stand-alone samples via add_augmented_targets,
    # and saves the re-formatted collection as JSON.
    parser = argparse.ArgumentParser()
    parser.add_argument("augmented_dataset", type=parse_path,
                        help='File path the augmented dataset')
    parser.add_argument("save_fp", type=parse_path,
                        help='File path to save the new re-formated augmented dataset')
    args = parser.parse_args()

    augmented_data_fp = args.augmented_dataset
    save_fp = args.save_fp

    augmented_dataset = TargetTextCollection.load_json(augmented_data_fp)
    new_dataset = []
    for target_object in augmented_dataset.values():
        # NOTE(review): remove_repeats=True presumably drops duplicate
        # augmented targets -- confirm against add_augmented_targets.
        augmented_targets = add_augmented_targets(target_object,
                                                  remove_repeats=True)
        new_dataset.extend(augmented_targets)
    new_dataset = TargetTextCollection(new_dataset)
    # Sample count here is the number of TARGETS, not sentences.
    number_samples = new_dataset.number_targets()
    print(f'The number of samples in the dataset {number_samples}')
    new_dataset.to_json_file(save_fp)