from collections import Counter
from typing import Dict

from target_extraction.data_types import TargetTextCollection

def get_target_sentiment_distribution(dataset: TargetTextCollection) -> Dict[str, float]:
    target_sentiment_distribution = Counter()
    for target_text in dataset.values():
        target_sentiment_distribution.update(target_text['target_sentiments'])
    for key, value in target_sentiment_distribution.items():
        # Scale to a percentage before rounding; rounding first throws the 
        # two decimal places away as soon as the value is multiplied by 100.
        target_sentiment_distribution[key] = round((value / dataset.number_targets()) * 100, 2)
    return dict(target_sentiment_distribution)
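A minimal sketch of how the distribution is computed; the DummyCollection class below is a hypothetical stand-in for TargetTextCollection that mimics only the two methods the function relies on, values() and number_targets():

# Hypothetical stand-in for TargetTextCollection.
class DummyCollection:
    def __init__(self, samples):
        self._samples = samples
    def values(self):
        return self._samples
    def number_targets(self):
        return sum(len(s['target_sentiments']) for s in self._samples)

samples = [{'target_sentiments': ['positive', 'negative']},
           {'target_sentiments': ['positive', 'neutral']}]
print(get_target_sentiment_distribution(DummyCollection(samples)))
# {'positive': 50.0, 'negative': 25.0, 'neutral': 25.0}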
Example #2
def average_target_per_sentences(collection: TargetTextCollection, 
                                 sentence_must_contain_targets: bool) -> float:
    '''
    :param collection: Collection to calculate the average targets per 
                       sentence (ATS) on.
    :param sentence_must_contain_targets: Whether or not the sentences within 
                                          the collection must contain at least 
                                          one target. This filtering affects 
                                          the value of the denominator stated 
                                          in the returns.
    :returns: The ATS for the given collection, which is: 
              number of targets / number of sentences
    '''
    number_targets = float(collection.number_targets())
    if sentence_must_contain_targets:
        number_sentences = len(collection.samples_with_targets())
    else:
        number_sentences = len(collection)
    return number_targets / float(number_sentences)
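A worked example of how the flag changes the denominator; ToyCollection is a hypothetical stand-in that mimics only the methods the function calls:

# Hypothetical stand-in: 3 sentences, 4 targets, one sentence with no targets.
class ToyCollection:
    def __init__(self, targets_per_sentence):
        self._tps = targets_per_sentence
    def __len__(self):
        return len(self._tps)
    def number_targets(self):
        return sum(self._tps)
    def samples_with_targets(self):
        return [n for n in self._tps if n > 0]

collection = ToyCollection([2, 2, 0])
print(average_target_per_sentences(collection, False))  # 4 / 3 = 1.33...
print(average_target_per_sentences(collection, True))   # 4 / 2 = 2.0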
Example #3
from collections import defaultdict

def get_sentiment_counts(collection: TargetTextCollection,
                         sentiment_key: str,
                         normalised: bool = True) -> Dict[str, float]:
    '''
    :param collection: The collection containing the sentiment data
    :param sentiment_key: The key in each TargetText within the collection that 
                          contains the true sentiment values.
    :param normalised: Whether to normalise the values in the dictionary 
                       by the number of targets in the collection.
    :returns: A dictionary where the keys are sentiment values and the values 
              are the number of times they occur in the collection (or the 
              fraction of all targets if `normalised` is True).
    '''
    sentiment_count = defaultdict(int)
    for target_text in collection.values():
        if target_text[sentiment_key] is not None:
            for sentiment_value in target_text[sentiment_key]:
                sentiment_count[sentiment_value] += 1
    number_targets = collection.number_targets()
    # Sanity check: every target in the collection should have contributed 
    # exactly one sentiment value.
    assert number_targets == sum(sentiment_count.values())
    if normalised:
        for sentiment, count in sentiment_count.items():
            sentiment_count[sentiment] = float(count) / float(number_targets)
    return dict(sentiment_count)
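A quick sketch of both the raw and normalised modes; TinyCollection is again a hypothetical stand-in for TargetTextCollection:

class TinyCollection:
    # Hypothetical stand-in exposing only values() and number_targets().
    def __init__(self, samples):
        self._samples = samples
    def values(self):
        return self._samples
    def number_targets(self):
        return sum(len(s['target_sentiments']) for s in self._samples)

tiny = TinyCollection([{'target_sentiments': ['positive', 'positive', 'negative']}])
print(get_sentiment_counts(tiny, 'target_sentiments', normalised=False))
# {'positive': 2, 'negative': 1}
print(get_sentiment_counts(tiny, 'target_sentiments'))
# {'positive': 0.666..., 'negative': 0.333...}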
def dataset_length(task: str, dataset: TargetTextCollection) -> int:
    '''
    :param task: Either `extraction` or `sentiment`.
    :param dataset: The collection to measure.
    :returns: The number of sentences for `extraction`, the number of 
              targets for `sentiment`, and 0 for an unknown task.
    '''
    if task == 'extraction':
        return len(dataset)
    elif task == 'sentiment':
        return dataset.number_targets()
    return 0
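For example, with a hypothetical two-sentence, five-target collection:

# Hypothetical stand-in: 2 sentences, 5 targets in total.
class MiniCollection:
    def __len__(self):
        return 2
    def number_targets(self):
        return 5

print(dataset_length('extraction', MiniCollection()))  # 2
print(dataset_length('sentiment', MiniCollection()))   # 5
print(dataset_length('other', MiniCollection()))       # 0 for an unknown task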
                all_targets.append(aug_target_object)
            except OverLappingTargetsError:
                # Skipped because when targets overlap it is very difficult 
                # to calculate all possible span offsets for the other 
                # targets. Furthermore, this only happens on 3 occasions, 
                # so it is a very rare occurrence.
                continue
    return all_targets

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("augmented_dataset", type=parse_path, 
                        help='File path the augmented dataset')
    parser.add_argument("save_fp", type=parse_path, 
                        help='File path to save the new re-formated augmented dataset')
    args = parser.parse_args()

    augmented_data_fp = args.augmented_dataset
    save_fp = args.save_fp

    augmented_dataset = TargetTextCollection.load_json(augmented_data_fp)
    new_dataset = []

    for target_object in augmented_dataset.values():
        augmented_targets = add_augmented_targets(target_object, 
                                                  remove_repeats=True)
        new_dataset.extend(augmented_targets)
    new_dataset = TargetTextCollection(new_dataset)
    number_samples = new_dataset.number_targets()
    print(f'The number of samples in the dataset: {number_samples}')
    new_dataset.to_json_file(save_fp)
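Assuming the script were saved as reformat_augmented.py (a hypothetical file name) with hypothetical input and output paths, it would be run as:

python reformat_augmented.py augmented_dataset.json reformatted_dataset.json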