Example #1
    def test_targetcoll_get(self):
        '''
        Test the __getitem__ function of TargetCollection
        '''

        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        target_example_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', 1)
        target_col = TargetCollection([target_example_0, target_example_1,
                                       target_example_2])

        # Test normal case
        self.assertEqual(target_example_2, target_col['2'], msg='Cannot access '\
                         'data using keys. key used {} collection {}'\
                         .format('2', target_col))
        self.assertEqual(target_example_1, target_col.get('3'), msg='Cannot '\
                         'access data using the get method.')
        self.assertEqual(None, target_col.get('5'), msg='Default value for '\
                         'get not working should be None not {}'\
                         .format(target_col.get('5')))
        # Test that it raises a KeyError when key does not exist
        with self.assertRaises(KeyError, msg='Should produce a key error when '\
                               'the data does not exist'):
            target_col['5']
Example #2
def dong(file_path):
    '''
    Given a file path to the
    `Li Dong <https://github.com/bluemonk482/tdparse/tree/master/data/lidong>`_
    sentiment data it will parse the data and return it as a TargetCollection
    of Target instances.

    :param file_path: File Path to the annotated data
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''

    file_path = os.path.abspath(file_path)
    if not os.path.isfile(file_path):
        raise FileNotFoundError('This file does not exist {}'.format(file_path))
    file_name, _ = os.path.splitext(os.path.basename(file_path))
    sentiment_range = [-1, 0, 1]

    sentiment_data = TargetCollection()
    with open(file_path, 'r') as dong_file:
        sent_dict = {}
        for index, line in enumerate(dong_file):
            divisible = index + 1
            line = line.strip()
            if divisible % 3 == 1:
                sent_dict['text'] = line
            elif divisible % 3 == 2:
                sent_dict['target'] = line
            elif divisible % 3 == 0:
                sentiment = int(line)
                if sentiment not in sentiment_range:
                    raise ValueError('The sentiment has to be one of the '\
                                     'following values {} not {}'\
                                     .format(sentiment_range, sentiment))
                sent_dict['sentiment'] = sentiment
                text = sent_dict['text'].lower()
                target = sent_dict['target'].lower()
                offsets = [match.span() for match in re.finditer(target, text)]
                if len(target.split()) > 1:
                    joined_target = ''.join(target.split())
                    offsets.extend([match.span()
                                    for match in re.finditer(joined_target, text)])
                sent_dict['spans'] = offsets
                sent_id = file_name + str(len(sentiment_data))
                # Sentence ID is the same as the target as there is only one
                # target per sentence
                sent_dict['sentence_id'] = sent_id
                sent_dict['target_id'] = sent_id
                sent_target = Target(**sent_dict)
                sentiment_data.add(sent_target)
                sent_dict = {}
            else:
                raise Exception('Unreachable: a line index modulo 3 is always 0, 1 or 2')
    return sentiment_data
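
# A minimal usage sketch of the parser above. It assumes `dong`, `Target` and
# `TargetCollection` are defined as in these examples, and that the file uses the
# three-line-per-sample layout the loop expects (text, target, sentiment in
# {-1, 0, 1}) with the target appearing verbatim in the text so that the
# re.finditer span search succeeds. The file name and samples are invented.
from pathlib import Path

example_lines = ['i love my new iphone but hate the battery', 'iphone', '1',
                 'the battery life is terrible', 'battery life', '-1']
Path('dong_sample.txt').write_text('\n'.join(example_lines))

dong_data = dong('dong_sample.txt')
print(len(dong_data))             # 2 Target instances
print(dong_data['dong_sample0'])  # the first parsed Target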
Example #3
    def test_targetcoll_add(self):
        '''
        Test the add function of TargetCollection
        '''

        target_col = TargetCollection()
        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        # Ensure the normal case works
        target_col.add(target_example_0)
        self.assertEqual(target_col['1'], target_example_0, msg='Test that {}' \
                         ' has been added to {}'\
                         .format(target_example_0, target_col))

        with self.assertRaises(TypeError, msg='Should not be able to add a dict'):
            target_col.add({'target_id' : '2'})

        with self.assertRaises(ValueError, msg='Should not be able to add a '\
                               'Target that has no `id`'):
            del target_example_1['target_id']
            if 'target_id' in target_example_1:
                raise KeyError('{} should not contain `id` key'\
                .format(target_example_1))
            target_col.add(target_example_1)
Example #4
def same_one_sentiment(test_dataset: TargetCollection,
                       train_dataset: TargetCollection,
                       lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset sample
    ids that contain targets that have the same single sentiment label
    associated with them in both the train and test sets.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_target_sentiments = target_sentiments(train_dataset, lower)
    test_target_sentiments = target_sentiments(test_dataset, lower)

    same_one_sentiments = set()
    for data in test_dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if (target in train_target_sentiments
                and target in test_target_sentiments):
            train_sentiments = train_target_sentiments[target]
            test_sentiments = test_target_sentiments[target]
            if (len(train_sentiments) == 1 and len(test_sentiments) == 1):
                if train_sentiments == test_sentiments:
                    same_one_sentiments.add(target)

    same_one_samples = targets_to_samples(test_dataset, same_one_sentiments,
                                          lower)
    same_one_ids = [sample['target_id'] for sample in same_one_samples]
    return same_one_ids
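
# A small, hypothetical sketch of calling the error-split helper above. It
# assumes `Target`, `TargetCollection`, `target_sentiments` and
# `targets_to_samples` are available as defined in these examples; the targets
# and ids are invented. `camera` carries the single sentiment 1 in both splits,
# so only its test sample id should be returned.
train = TargetCollection([Target([(0, 6)], 't1', 'camera', 'camera is great', 1),
                          Target([(0, 6)], 't2', 'screen', 'screen is fine', 1)])
test = TargetCollection([Target([(0, 6)], 'e1', 'camera', 'camera works well', 1),
                         Target([(0, 6)], 'e2', 'screen', 'screen is awful', -1)])
print(same_one_sentiment(test, train))  # expected: ['e1']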
Example #5
def different_sentiment(test_dataset: TargetCollection,
                        train_dataset: TargetCollection,
                        lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset sample
    ids that contain targets whose sentiment labels in the test set have no
    overlap with their sentiment labels in the train set.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_target_sentiments = target_sentiments(train_dataset, lower)
    test_target_sentiments = target_sentiments(test_dataset, lower)

    different_sentiments = set()
    for data in test_dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if (target in train_target_sentiments
                and target in test_target_sentiments):
            train_sentiments = train_target_sentiments[target]
            test_sentiments = test_target_sentiments[target]
            if not test_sentiments.intersection(train_sentiments):
                different_sentiments.add(target)
    different_samples = targets_to_samples(test_dataset, different_sentiments,
                                           lower)
    different_ids = [sample['target_id'] for sample in different_samples]
    return different_ids
Example #6
def unknown_targets(test_dataset: TargetCollection,
                    train_dataset: TargetCollection,
                    lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset sample 
    ids that contain targets that did not exist in the training data.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_target_sentiments = target_sentiments(train_dataset, lower)
    test_target_sentiments = target_sentiments(test_dataset, lower)

    unknowns = set()
    for data in test_dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if (target in train_target_sentiments
                and target in test_target_sentiments):
            continue
        else:
            unknowns.add(target)

    unknown_samples = targets_to_samples(test_dataset, unknowns, lower)
    unknown_ids = [sample['target_id'] for sample in unknown_samples]
    return unknown_ids
Example #7
def semeval_15_16(file_path, sep_16_from_15=False):
    '''
    Parser for the SemEval 2015 and 2016 datasets.

    :param file_path: File path to the SemEval 2015/2016 data
    :param sep_16_from_15: Ensure that the SemEval 2016 test set is completely \
    separate from the SemEval 2015 test set
    :type file_path: String
    :type sep_16_from_15: bool. Default False
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''

    file_path = os.path.abspath(file_path)
    file_name, _ = os.path.splitext(os.path.basename(file_path))

    tree = ET.parse(file_path)
    reviews = tree.getroot()
    all_aspect_term_data = []
    if reviews.tag != 'Reviews':
        raise ValueError('The root of all SemEval 15/16 XML files should '\
                         'be `Reviews` and not {}'\
                         .format(reviews.tag))
    for review in reviews:
        review_id = review.attrib['rid']
        for sentences in review:
            if sep_16_from_15:
                ids_to_skip = ["en_SnoozeanAMEatery_480032670:4"]
                review_targets = _semeval_extract_data(sentences, file_name,
                                                       sentence_ids_skip=ids_to_skip)
                all_aspect_term_data.extend(review_targets.data())
            else:
                review_targets = _semeval_extract_data(sentences, file_name).data()
                all_aspect_term_data.extend(review_targets)
    return TargetCollection(all_aspect_term_data)
Example #8
def get_data(id_file, tweets_data, annos_data):
    targets = []
    with open(id_file, 'r') as id_data:
        for tweet_id in id_data:
            tweet_id = tweet_id.strip()
            tweet_data = tweets_data[tweet_id]
            anno_data = annos_data[tweet_id]
            targets.extend(parse_tweet(tweet_data, anno_data, tweet_id))
    return TargetCollection(targets)
Example #9
    def test_targetcoll_stored_sent(self):
        '''
        Test the stored_sentiments function of TargetCollection
        '''

        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        target_example_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', -1)
        target_col = TargetCollection([target_example_0, target_example_1,
                                       target_example_2])
        valid_sentiments = set([1, -1])
        test_sentiments = target_col.stored_sentiments()
        self.assertEqual(valid_sentiments, test_sentiments, msg='The unique '\
                         'sentiments in the TargetCollection should be {} and '\
                         'not {}'.format(valid_sentiments, test_sentiments))
Example #10
    def test_target_coll_subset_by_sent(self):
        '''
        Test the subset_by_sentiment function of TargetCollection
        '''

        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 'pos', sentence_id='4')
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='4')
        target_example_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='5')
        target_example_3 = Target([(1, 2)], '4', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='5')
        target_example_4 = Target([(1, 2)], '5', 'Iphone',
                                  'text with Iphone', 'pos', sentence_id='6')
        target_example_5 = Target([(1, 2)], '6', 'Iphone',
                                  'text with Iphone', 'neu', sentence_id='6')
        target_example_6 = Target([(1, 2)], '7', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='6')
        target_example_7 = Target([(1, 2)], '8', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='7')
        target_example_8 = Target([(1, 2)], '9', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='8')
        target_example_9 = Target([(1, 2)], '10', 'Iphone',
                                  'text with Iphone', 'neg', sentence_id='8')
        target_example_10 = Target([(1, 2)], '11', 'Iphone',
                                  'text with Iphone', 'pos', sentence_id='8')
        all_targets = [target_example_0, target_example_1, target_example_2,
                       target_example_3, target_example_4, target_example_5,
                       target_example_6, target_example_7, target_example_8,
                       target_example_9, target_example_10]
        target_col = TargetCollection(all_targets)

        # Test for 2 unique sentiments per sentence
        test_col = target_col.subset_by_sentiment(2)
        valid_col = TargetCollection([target_example_0, target_example_1,
                                      target_example_8, target_example_9,
                                      target_example_10])
        self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                         ' but has returned this {}'.format(valid_col, test_col))
        # Test for 1 unique sentiments per sentence
        test_col = target_col.subset_by_sentiment(1)
        valid_col = TargetCollection([target_example_7, target_example_2,
                                      target_example_3])
        self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                         ' but has returned this {}'.format(valid_col, test_col))
        # Test for 3 unique sentiments per sentence
        test_col = target_col.subset_by_sentiment(3)
        valid_col = TargetCollection([target_example_4, target_example_5,
                                      target_example_6])
        self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                         ' but has returned this {}'.format(valid_col, test_col))
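Example #11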
def generate_stats(data_path: Path) -> Dict[str, Union[int, float]]:
    target_data = []
    with data_path.open('r') as data_lines:
        for line in data_lines:
            line = json.loads(line)
            line['spans'] = [tuple(span) for span in line['spans']]
            target_data.append(Target(**line))
    target_data = TargetCollection(target_data)
    target_stats = defaultdict(lambda: 0)
    data_size = len(target_data)
    target_stats['size'] = data_size
    for i in range(1, 3):
        target_stats[f'Distinct sentiment {i}'] = len(
            target_data.subset_by_sentiment(i))
    for data in target_data.data_dict():
        target_stats[data['sentiment']] += 1
    for key, value in target_stats.items():
        if key == 'size':
            continue
        target_stats[key] = value / data_size
    return target_stats
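
# A hedged usage sketch for generate_stats. It assumes the JSON-lines layout
# implied by the loop above (one serialised Target dict per line, spans stored
# as [start, end] lists) and the `Target`/`TargetCollection` API shown in the
# other examples. The file path and samples are invented.
import json
from pathlib import Path

samples = [{'spans': [[0, 6]], 'target_id': '1', 'target': 'camera',
            'text': 'camera is great', 'sentiment': 'pos', 'sentence_id': 's1'},
           {'spans': [[0, 6]], 'target_id': '2', 'target': 'screen',
            'text': 'screen is awful', 'sentiment': 'neg', 'sentence_id': 's1'}]
stats_fp = Path('stats_sample.json')
with stats_fp.open('w') as stats_file:
    stats_file.write('\n'.join(json.dumps(sample) for sample in samples))

stats = generate_stats(stats_fp)
print(stats['size'])  # 2
print(stats['pos'])   # 0.5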
Example #12
    def test_targetcoll_constructor(self):
        '''
        Tests TargetCollection constructor
        '''

        target_example = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                'text with Iphone', 1)
        # Test that it will not accept anything but Target instances
        with self.assertRaises(TypeError, msg='The constructor should only '\
                               'accept an iterator as an argument'):
            TargetCollection(1)
        with self.assertRaises(TypeError, msg='The constructor should only '\
                               'accept an iterator of Target instances'):
            TargetCollection([1, 2, 3, 4])
        # Should accept the following without any problems
        TargetCollection([])
        TargetCollection([target_example])
        TargetCollection()

        # Testing the case where the list of Targets contains duplicate keys
        another_example = Target([(3, 4)], '2', 'Keys',
                                 'text with Keys', -1)
        dup_example = Target([(3, 10)], '1', 'Pixel',
                             'text with Pixels', 0)
        with self.assertRaises(KeyError, msg='Should raise an error as two of '\
                               'the target instances have the same key'):
            TargetCollection([target_example, another_example, dup_example])
Example #13
        def split_tests(data_: TargetCollection, train_: TargetCollection, 
                        test_: TargetCollection, test_split: float):
            data_size = len(data_)
            train_size = len(train_)
            test_size = len(test_)
            assert train_size == (data_size - int(data_size * test_split))
            assert test_size == int(data_size * test_split)
            assert data_size == (train_size + test_size)

            train_ids = []
            test_ids = []
            for data in train_.data_dict():
                train_ids.append(re.findall(r'\d+', data['target_id'])[0])
            for data in test_.data_dict():
                test_ids.append(re.findall(r'\d+', data['target_id'])[0])
            assert len(train_ids) == len(set(train_ids))
            assert len(test_ids) == len(set(test_ids))
            for train_id in train_ids:
                assert train_id not in test_ids
            for test_id in test_ids:
                assert test_id not in train_ids
            return train_ids, test_ids
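
        # A hedged sketch of calling the nested helper above, assuming it is in
        # scope together with the `Target`/`TargetCollection` API from the other
        # examples. The targets are invented and the numeric part of each
        # `target_id` is unique, which is what the id checks rely on.
        full_targets = [Target([(0, 4)], f'id_{i}', 'food', 'food was good', 1)
                        for i in range(4)]
        data_ = TargetCollection(full_targets)
        train_ = TargetCollection(full_targets[:3])
        test_ = TargetCollection(full_targets[3:])
        train_ids, test_ids = split_tests(data_, train_, test_, test_split=0.25)
        print(train_ids, test_ids)  # ['0', '1', '2'] ['3']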
Example #14
    def _data_to_json(data: TargetCollection, file_path: Path) -> None:
        '''
        Converts the data into json format and saves it to the given file path. 
        The AllenNLP models read the data from json formatted files.

        :param data: data to be saved into json format.
        :param file_path: file location to save the data to.
        '''
        target_data = data.data_dict()
        with file_path.open('w+') as json_file:
            for index, sample in enumerate(target_data):
                if 'epoch_number' in sample:
                    sample['epoch_number'] = list(sample['epoch_number'])
                json_encoded_data = json.dumps(sample)
                if index != 0:
                    json_encoded_data = f'\n{json_encoded_data}'
                json_file.write(json_encoded_data)
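
# A short, hedged sketch of what the helper above produces: newline-delimited
# JSON, one serialised Target per line. It assumes `_data_to_json` is reachable
# as a plain function (here it is shown as a private helper) and uses the
# `Target`/`TargetCollection` API from the other examples; the path and targets
# are invented.
from pathlib import Path

targets = TargetCollection([Target([(0, 6)], '1', 'camera', 'camera is great', 'pos'),
                            Target([(0, 6)], '2', 'screen', 'screen is awful', 'neg')])
json_fp = Path('train_data.json')
_data_to_json(targets, json_fp)
print(json_fp.read_text().splitlines()[0])  # the first sample as a JSON object

Example #15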
def augmented_dataset(target_related_words_sim: Dict[str, List[Tuple[str,
                                                                     float]]],
                      dataset: TargetCollection, save_fp: Path,
                      lower: bool) -> None:
    '''
    Given a dictionary mapping each target word from the training dataset to
    all of its related words and their similarity scores, and the TDSA
    training dataset, it will for each sample in the training set check if
    the sample's target exists as a key in the given dictionary and if so
    write the sample to the save file along with the related targets and
    similarity scores under the following keys:
    `alternative_targets` and `alternative_similarity`
    '''
    training_targets_in_embeddings = set(list(target_related_words_sim.keys()))
    with save_fp.open('w+') as save_file:
        count = 0
        for target_dict in dataset.data_dict():
            original_target = target_dict['target']
            if lower:
                original_target = original_target.lower()
            if original_target in training_targets_in_embeddings:
                alt_targets_similarity = target_related_words_sim[
                    original_target]
                alt_targets_similarity = sorted(alt_targets_similarity,
                                                key=lambda x: x[1],
                                                reverse=True)
                different_targets = [
                    target for target, _ in alt_targets_similarity
                ]
                alternative_similarity = [
                    similarity for _, similarity in alt_targets_similarity
                ]
                target_dict['alternative_targets'] = different_targets
                target_dict['alternative_similarity'] = alternative_similarity
                target_dict['epoch_number'] = list(target_dict['epoch_number'])
                json_target_dict = json.dumps(target_dict)
                if count != 0:
                    json_target_dict = f'\n{json_target_dict}'
                count += 1
                save_file.write(json_target_dict)
Example #16
    def test_targetcoll_data(self):
        '''
        Test the data function of TargetCollection
        '''

        target_col = TargetCollection()
        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        target_col.add(target_example_0)
        target_col.add(target_example_1)

        all_data = target_col.data()
        self.assertEqual(target_example_0, all_data[0], msg='First data '\
                         'returned should be the first inserted {} and not '\
                         '{}'.format(target_example_0, all_data[0]))
        self.assertEqual(target_example_1, all_data[1], msg='Second data '\
                         'returned should be the second inserted {} and not '\
                         '{}'.format(target_example_1, all_data[1]))

        target_example_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', 1)
        del target_col['1']
        target_col.add(target_example_2)
        all_data = target_col.data()
        self.assertEqual(target_example_1, all_data[0], msg='First data '\
                         'returned should be the second inserted {} and not '\
                         '{} as the first has been removed'\
                         .format(target_example_1, all_data[0]))
        self.assertEqual(target_example_2, all_data[1], msg='Second data '\
                         'returned should be the third inserted {} and not '\
                         '{} as the first has been removed'\
                         .format(target_example_2, all_data[1]))
        self.assertEqual(2, len(all_data), msg='The length of the data returned '\
                         'should be 2 and not {}'.format(len(all_data)))
Example #17
    def test_targetcoll_set(self):
        '''
        Test the __setitem__ function of TargetCollection
        '''

        target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                                  'text with Iphone', 1)
        target_example_1 = Target([(1, 5)], '3', 'Iphone',
                                  'text with Iphone', 1)
        target_example_2 = Target([(1, 2)], '2', 'Iphone',
                                  'text with Iphone', 1)
        target_col = TargetCollection([target_example_0, target_example_1,
                                       target_example_2])
        target_example_3 = Target([(2, 4)], '5', 'new', 'new text', 0)
        target_example_4 = Target([(1, 3)], '6', 'another', 'another text', 1)
        target_example_5 = Target([(1, 3)], '7', 'another', 'another text', 1)
        target_diff_1 = Target([(4, 5)], '3', 'test', 'test text', 0)

        # Normal case adding a new value
        target_col['5'] = target_example_3
        self.assertEqual(target_col['5'], target_example_3, msg='Cannot add '\
                         'new value. store {} value added {}'\
                         .format(target_col, target_example_3))
        # If key already exists it cannot be added
        with self.assertRaises(KeyError, msg='Should not be able to add value '\
                               '{} as its key {} already exists {}'\
                               .format(target_diff_1, '3', target_col)):
            target_col['3'] = target_diff_1
        with self.assertRaises(KeyError, msg='Value with a different `id` to '\
                               'the key should fail. Key {} Value {}'\
                               .format('7', target_example_4)):
            target_col['7'] = target_example_4
        # Should accept Target instance with no `id`
        del target_example_5['target_id']
        if 'target_id' in target_example_5:
            raise KeyError('{} should not contain `id` key'\
            .format(target_example_5))
        target_col['8'] = target_example_5
Example #18
def multi_word_targets(dataset: TargetCollection, 
                       tokeniser: Callable[[str], List[str]],
                       lower: bool = True) -> List[str]:
    '''
    Given a dataset it will return all of the targets
    tokenised and then re-combined with `_` to show
    the multiple words that make up the target if it
    contains multiple words.

    :param dataset: The dataset that contains all of the
                    targets.
    :param tokeniser: The tokeniser that defines whether a target
                      is made up of multiple words
    :param lower: Whether to lower case the target words.
    :returns: A list of targets where, if they are made up of
              multiple words, they will now have an `_` between
              them e.g. `tesco supermarket` would be
              `tesco_supermarket`
    '''
    targets = dataset.target_set(lower=lower)
    tokenised_targets = [tokeniser(target) for target in targets]
    multi_word_targets = ['_'.join(target) for target in tokenised_targets]
    return multi_word_targets
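
# A brief usage sketch, assuming the `Target`/`TargetCollection` API from the
# other examples. str.split stands in for the tokeniser (any
# Callable[[str], List[str]] will do); the targets are invented.
dataset = TargetCollection([Target([(0, 17)], '1', 'tesco supermarket',
                                   'tesco supermarket was busy', 'neg'),
                            Target([(0, 6)], '2', 'camera', 'camera is great', 'pos')])
print(multi_word_targets(dataset, str.split))
# e.g. ['tesco_supermarket', 'camera'] (order depends on the underlying set)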
Example #19
def target_sentiments(dataset: TargetCollection,
                      lower: bool = True) -> Dict[str, Set[Any]]:
    '''
    Given a dataset will return a dictionary of targets and the sentiment 
    that has been associated to those targets.

    E.g. within the dataset the target `camera` may have only been seen
    with a positive and a negative label but not neutral, therefore in the
    returned dictionary it would be {`camera`: {`positive`, `negative`}}

    :param dataset: TargetCollection containing samples
    :param lower: Whether to lower case the target words.
    :returns: A dictionary where the keys are target words and the values 
              are the sentiment values that have been associated to those 
              targets.
    '''
    targets_sentiments = defaultdict(set)
    for data in dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        targets_sentiments[target].add(data['sentiment'])
    return targets_sentiments
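
# A short usage sketch of target_sentiments, assuming the `Target` and
# `TargetCollection` API from the other examples; the samples are invented.
train = TargetCollection([Target([(0, 6)], '1', 'camera', 'camera is great', 'positive'),
                          Target([(0, 6)], '2', 'camera', 'camera is awful', 'negative'),
                          Target([(0, 6)], '3', 'screen', 'screen is fine', 'positive')])
print(dict(target_sentiments(train)))
# {'camera': {'positive', 'negative'}, 'screen': {'positive'}}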
Example #20
def similar_sentiment(test_dataset: TargetCollection,
                      train_dataset: TargetCollection,
                      lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset sample
    ids that contain targets that have occurred with more than one sentiment
    in the train or test set, where the test sentiments overlap with, but are
    not identical to, the train sentiments. E.g. the target `camera` could
    occur with `positive` and `negative` sentiment in the test set and only
    `negative` in the train set.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_target_sentiments = target_sentiments(train_dataset, lower)
    test_target_sentiments = target_sentiments(test_dataset, lower)

    similar_sentiments = set()
    for data in test_dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if (target in train_target_sentiments
                and target in test_target_sentiments):
            train_sentiments = train_target_sentiments[target]
            test_sentiments = test_target_sentiments[target]
            if (len(train_sentiments) > 1 or len(test_sentiments) > 1):
                if train_sentiments == test_sentiments:
                    continue
                if test_sentiments.intersection(train_sentiments):
                    similar_sentiments.add(target)
    similar_samples = targets_to_samples(test_dataset, similar_sentiments,
                                         lower)
    similar_ids = [sample['target_id'] for sample in similar_samples]
    return similar_ids
Example #21
    def _predict_iter(self, data: TargetCollection
                      ) -> Generator[Dict[str, Any], None, None]:
        '''
        Iterates over the predictions and yields one prediction at a time.

        This is a useful wrapper as it performs the data pre-processing and 
        assertion checks.

        :param data: Data to predict on
        :yields: A dictionary containing `class_probabilities` and `label`.
        '''
        no_model_error = 'There is no model to make predictions, either fit '\
                         'or load a model.'
        assert self.model, no_model_error
        self.model.eval()

        all_model_params = Params.from_file(self._param_fp)

        reader_params = all_model_params.get("dataset_reader")
        dataset_reader = DatasetReader.from_params(reader_params)
        predictor = TargetPredictor(self.model, dataset_reader)

        batch_size = 64
        if 'iterator' in all_model_params:
            iter_params = all_model_params.get("iterator")
            if 'batch_size' in iter_params:
                batch_size = iter_params['batch_size']
        
        json_data = data.data_dict()
        # Reference
        # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
        for i in range(0, len(json_data), batch_size):
            json_data_batch = json_data[i:i + batch_size]
            predictions = predictor.predict_batch_json(json_data_batch)
            for prediction in predictions:
                yield prediction
Example #22
def targets_to_samples(dataset: TargetCollection,
                       targets: Set[str],
                       lower: bool = True) -> List[Target]:
    '''
    Given a dataset and a set of target words, it will return a subset of the 
    dataset where all samples in the subset have target words that are in the 
    targets set.

    :param dataset: TargetCollection containing samples
    :param targets: A set of target words used to subset the dataset
    :param lower: Whether to lower case the target words. If this is True 
                  it is up to you to ensure all the words in the `targets` set 
                  have been lower cased.
    :returns: A subset of the dataset where all targets in the subset are 
              within the `targets` set.
    '''
    samples = []
    for data in dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if target in targets:
            samples.append(data)
    return samples
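
# A small usage sketch of targets_to_samples, assuming the `Target` and
# `TargetCollection` API from the other examples; the samples are invented.
# With lower=True the capitalised `Camera` target still matches the lower-cased
# `camera` entry in the targets set.
dataset = TargetCollection([Target([(0, 6)], '1', 'Camera', 'Camera is great', 'pos'),
                            Target([(0, 6)], '2', 'screen', 'screen is awful', 'neg')])
camera_samples = targets_to_samples(dataset, {'camera'}, lower=True)
print([sample['target_id'] for sample in camera_samples])  # ['1']

Example #23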
import argparse
import json
from pathlib import Path

from bella.data_types import TargetCollection


def parse_path(path_string: str) -> Path:
    path_string = Path(path_string).resolve()
    return path_string


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("tdsa_dataset_fp",
                        type=parse_path,
                        help="File path to the TDSA dataset")
    parser.add_argument("target_set_fp",
                        type=parse_path,
                        help='File path to save the targets to')
    args = parser.parse_args()
    data = TargetCollection.load_from_json(args.tdsa_dataset_fp)
    all_targets = list(data.target_set())
    with args.target_set_fp.open('w+') as target_set_file:
        json.dump(all_targets, target_set_file)
Example #24
    lr_dir.mkdir(parents=True, exist_ok=True)
    model_config_dir = args.model_config_dir

    # Models
    model_names = ['atae', 'bilinear', 'ian', 'tdlstm', 'tclstm']
    bilstm_names = ['']
    all_model_names = product(model_names, bilstm_names)
    all_models = []
    for model_name, bilstm_name in all_model_names:
        model_name = f'{model_name}{bilstm_name}'.strip()
        model_config_fp = Path(model_config_dir, f'{model_name}.json')
        all_models.append(AllenNLPModel(model_name, model_config_fp))

    # Data
    dataset_name = args.dataset_name
    train = TargetCollection.load_from_json(args.train_fp)

    model_data = product(all_models, [train])
    for model, train_data in model_data:
        logger.info(f'Finding learning rate for {model.name} on {dataset_name} '
                    f'dataset using {args.num_batches} batches')
        
        model_dir = Path(lr_dir, model.name)
        data_model_dir = Path(model_dir, f'{dataset_name}')
        if not data_model_dir.exists():
            data_model_dir.mkdir(parents=True)
        try:
            model.find_learning_rate(train, data_model_dir, 
                                     {'num_batches': args.num_batches})
        except:
            error_msg = f'Finding learning rate for {model.name} on {dataset_name} ' \
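Example #25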
        'ian': 'IAN',
        'tdlstm': 'TDLSTM',
        'tclstm': 'TCLSTM'
    }
    dataset_name = args.dataset_name
    embedding_folder_names = [
        'baseline', 'baseline_lm', 'baseline_lm_embedding', 'ds_embedding',
        'ds_lm', 'ds_lm_embedding', 'ds_lm_ds_embedding'
    ]
    embedding_names = [
        'Glove', 'ELMo T', 'ELMo T + Glove', 'DS Word2Vec', 'DS ELMo T',
        'DS ELMo T + Glove', 'DS ELMo T + DS Glove'
    ]

    test_data_fp = args.test_data_fp
    test_data = TargetCollection.load_from_json(test_data_fp)
    dataset_name_flag = [('Test', test_data, True)]
    if args.val_fp:
        val_data = TargetCollection.load_from_json(args.val_fp)
        dataset_name_flag.append(('Validation', val_data, False))

    y_score = []
    x_model_names = []
    embedding_color = []
    dataset_split_names = []

    embedding_folder_and_names = list(
        zip(embedding_folder_names, embedding_names))
    for dataset_split_name, dataset, test_flag in dataset_name_flag:
        for embedding_folder_name, embedding_name in embedding_folder_and_names:
            for model_name in model_names:
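Example #26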
            similarity_field = np.array(target_data[similarity_field_name])
            if is_language_model and threshold:
                similarity_field = target_data['original_perplexity'] - similarity_field
                above_original_perplexity_index = np.argmax((similarity_field <= 0) + 0) 
                similarity_field = similarity_field[:above_original_perplexity_index]
                if len(similarity_field) < k_similar:
                    temp_k = len(similarity_field)
            elif (not is_language_model) and threshold:
                above_threshold_index = np.argmin((similarity_field >= threshold) + 0)
                similarity_field = similarity_field[:above_threshold_index]
                if len(similarity_field) < k_similar:
                    temp_k = len(similarity_field)
            # For each of the filtered alternative targets it creates a json 
            # like object that will be used to store it in a collection to then 
            # save to a json file
            alternative_targets = target_data['alternative_targets'][:temp_k]
            for index, alternative_target in enumerate(alternative_targets):
                new_target_id = f'{target_id}_{index}'
                new_target_data = exchange_targets(target_data, alternative_target, 
                                                   new_target_id)
                # sanitizing the target dataset.
                new_target_data.pop('alternative_targets')
                new_target_data.pop(similarity_field_name)
                if is_language_model:
                    new_target_data.pop('original_perplexity')
                new_target_dataset.append(Target(**new_target_data))
    print(f'Size of the expanded dataset {len(new_target_dataset)}')
    new_target_dataset = TargetCollection(new_target_dataset)
    new_target_dataset.to_json_file(str(args.dataset_save_fp), cache=False)
Example #27
        'tclstm': 'TCLSTM',
        'atae_ds_lm_embedding': 'ATAE ELMo T'
    }
    dataset_name = args.dataset_name
    values_of_k = [2, 3, 5, 10]
    augmentation_technique = ['embedding', 'lm']
    thresholding = ['no_threshold_', 'threshold_']
    technique_threshold_mapper = {
        'no_threshold_embedding': 'Embedding',
        'threshold_embedding': 'Embedding T',
        'no_threshold_lm': 'LM',
        'threshold_lm': 'LM T'
    }

    test_data_fp = args.test_data_fp
    test_data = TargetCollection.load_from_json(test_data_fp)

    test = True
    if args.val:
        test = False

    folder_names = list(
        itertools.product(thresholding, augmentation_technique, values_of_k))

    y_score = []
    x_k = []
    all_techniques = []
    all_model_names = []

    for threshold, tech, k in folder_names:
        folder_name = f'{threshold}{tech}{str(k)}'
Example #28
        }
    else:
        model_name_mapper = {
            model_name: model_name
            for model_name in model_names
        }
    bilstm_names = ['']
    all_model_names = product(model_names, bilstm_names)
    all_models = []
    for model_name, bilstm_name in all_model_names:
        model_name = f'{model_name}{bilstm_name}'.strip()
        model_config_fp = Path(model_config_dir, f'{model_name}.json')
        all_models.append(
            AllenNLPModel(model_name_mapper[model_name], model_config_fp))
    # Data
    path_dataset = lambda _dir, name: TargetCollection.load_from_json(
        Path(_dir, name))
    train = path_dataset(data_dir, f'{dataset_name} Train')
    logger.info(f'Size of the original dataset {len(train)}')
    if augmented_fp is not None:
        augmented_data = TargetCollection.load_from_json(augmented_fp)
        logger.info(f'Size of the augmented dataset {len(augmented_data)}')
        train = TargetCollection.combine_collections(train, augmented_data)
        logger.info(f'Size of the training augmented dataset {len(train)}')
    val = path_dataset(data_dir, f'{dataset_name} Val')
    logger.info(f'Size of the validation set {len(val)}')
    test = path_dataset(data_dir, f'{dataset_name} Test')
    logger.info(f'Size of the test set {len(test)}')

    model_data = product(all_models, [(train, val, test)])
    for model, data in model_data:
        logger.info(f'Running Model {model.name} on {dataset_name} dataset '
Example #29
    parser.add_argument("dataset_name",
                        type=str,
                        choices=['Laptop', 'Restaurant', 'Election'])
    parser.add_argument("--force_space",
                        action='store_true',
                        help=force_space_help)
    args = parser.parse_args()

    data_dir = args.data_dir
    dataset_name = args.dataset_name
    tokeniser = spacy.blank('en')
    split_names = ['Train', 'Val', 'Test']

    for split_name in split_names:
        dataset_fp = Path(data_dir, f'{dataset_name} {split_name}')
        dataset = TargetCollection.load_from_json(dataset_fp)

        retrieve_target_from_tokens = []
        cannot_retrieve_target_from_tokens = []

        for target in dataset.data_dict():
            text = target['text']
            target_word = target['target'].strip()
            target_start_offset = target['spans'][0][0]
            target_end_offset = target['spans'][0][1]
            if args.force_space:
                before_text = text[:target_start_offset].strip()
                after_text = text[target_end_offset:].strip()
                new_target_word = f' {target_word} '
                target_start_offset = len(before_text) + 1
                target_end_offset = target_start_offset + len(target_word)
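Example #30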
                        help="File path to the embedding",
                        type=parse_path)
    parser.add_argument("additional_targets_fp",
                        type=parse_path,
                        help='File Path to additional targets')
    parser.add_argument("augmented_dataset_fp",
                        type=parse_path,
                        help=augmented_dataset_help)
    parser.add_argument("tokeniser", type=str, choices=tokeniser_choices)
    parser.add_argument("--lower", action="store_true")
    args = parser.parse_args()

    # Load tokeniser
    if args.tokeniser == 'spacy':
        tokeniser = allen_spacy_tokeniser
    else:
        raise ValueError(
            f'Tokeniser has to be one of the following {tokeniser_choices}')

    training_data = TargetCollection.load_from_json(args.train_fp)
    embedding = Word2Vec.load(str(args.embedding_fp))
    target_related_words_sim: Dict[str, List[Tuple[str, float]]]
    target_related_words_sim = word_embedding_augmentation(training_data,
                                                           embedding,
                                                           lower=args.lower,
                                                           k_nearest=-1,
                                                           tokeniser=tokeniser)
    augmented_dataset(target_related_words_sim,
                      training_data,
                      args.augmented_dataset_fp,
                      lower=args.lower)