Example #1
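# Imports assumed by the snippets below (a sketch, not taken verbatim from the
# original file). `TargetCollection`, `Target` and `TargetPredictor` are
# project-specific classes; import them from wherever they live in your copy
# of the code base.
import json
import re
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, Generator, List, Tuple, Union

from allennlp.common import Params        # module paths may differ between
from allennlp.data import DatasetReader   # AllenNLP versions
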
        def split_tests(data_: TargetCollection, train_: TargetCollection, 
                        test_: TargetCollection, test_split: float):
            data_size = len(data_)
            train_size = len(train_)
            test_size = len(test_)
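            # Worked example (illustrative numbers only): with data_size == 100
            # and test_split == 0.2 the test split should hold
            # int(100 * 0.2) == 20 samples and the training split the other 80.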
            assert train_size == (data_size - int(data_size * test_split))
            assert test_size == int(data_size * test_split)
            assert data_size == (train_size + test_size)

            train_ids = []
            test_ids = []
            # Extract the numeric part of each sample's `target_id` so the two
            # splits can be compared on the underlying ids.
            for data in train_.data_dict():
                train_ids.append(re.findall(r'\d+', data['target_id'])[0])
            for data in test_.data_dict():
                test_ids.append(re.findall(r'\d+', data['target_id'])[0])
            # Ids must be unique within each split and the two splits must be
            # disjoint, i.e. no sample occurs in both train and test.
            assert len(train_ids) == len(set(train_ids))
            assert len(test_ids) == len(set(test_ids))
            for train_id in train_ids:
                assert train_id not in test_ids
            for test_id in test_ids:
                assert test_id not in train_ids
            return train_ids, test_ids
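
# A minimal, standalone sketch of the id handling in `split_tests` above. The
# `target_id` strings here are hypothetical; only the numeric part extracted
# by the regular expression is compared between splits.
_example_train_ids = [re.findall(r'\d+', tid)[0]
                      for tid in ('target_12_0', 'target_34_0')]
_example_test_ids = [re.findall(r'\d+', tid)[0]
                     for tid in ('target_56_0', 'target_78_0')]
assert set(_example_train_ids).isdisjoint(_example_test_ids)
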
    def _data_to_json(data: TargetCollection, file_path: Path) -> None:
        '''
        Converts the data into JSON format (one JSON object per line) and 
        saves it to the given file path. The AllenNLP models read the data 
        from these JSON formatted files.

        :param data: data to be saved into json format.
        :param file_path: file location to save the data to.
        '''
        target_data = data.data_dict()
        with file_path.open('w+') as json_file:
            for index, sample in enumerate(target_data):
                # Ensure `epoch_number` is a JSON serialisable list.
                if 'epoch_number' in sample:
                    sample['epoch_number'] = list(sample['epoch_number'])
                json_encoded_data = json.dumps(sample)
                # Newline delimit every record after the first so the file is
                # in JSON Lines format.
                if index != 0:
                    json_encoded_data = f'\n{json_encoded_data}'
                json_file.write(json_encoded_data)
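
# A minimal round-trip sketch of the JSON Lines format `_data_to_json`
# produces: one `json.dumps` record per line, newline separated. The sample
# dictionaries and the file name are hypothetical.
_example_samples = [{'target_id': 'target_1_0', 'sentiment': 'positive'},
                    {'target_id': 'target_2_0', 'sentiment': 'negative'}]
_example_fp = Path('example_train.json')
with _example_fp.open('w+') as json_file:
    json_file.write('\n'.join(json.dumps(sample) for sample in _example_samples))
with _example_fp.open('r') as json_file:
    assert [json.loads(line) for line in json_file] == _example_samples
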
def generate_stats(data_path: Path) -> Dict[str, Union[int, float]]:
    '''
    Reads a JSON Lines formatted dataset of Target samples and returns its 
    size, the fraction of samples whose sentence contains 1 or 2 distinct 
    sentiments, and the fraction of samples per sentiment label.

    :param data_path: file path to a JSON Lines formatted dataset.
    :return: dictionary mapping each statistic name to its value.
    '''
    target_data = []
    with data_path.open('r') as data_lines:
        for line in data_lines:
            line = json.loads(line)
            # Spans are stored as lists in JSON; convert them back to tuples.
            line['spans'] = [tuple(span) for span in line['spans']]
            target_data.append(Target(**line))
    target_data = TargetCollection(target_data)
    target_stats = defaultdict(int)
    data_size = len(target_data)
    target_stats['size'] = data_size
    # Number of samples whose sentence contains `i` distinct sentiments.
    for i in range(1, 3):
        target_stats[f'Distinct sentiment {i}'] = len(
            target_data.subset_by_sentiment(i))
    # Count how many samples carry each sentiment label.
    for data in target_data.data_dict():
        target_stats[data['sentiment']] += 1
    # Normalise everything except the dataset size into fractions.
    for key, value in target_stats.items():
        if key == 'size':
            continue
        target_stats[key] = value / data_size
    return target_stats
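
# Usage sketch for `generate_stats`; the dataset location is hypothetical and
# the call is guarded so the sketch is a no-op when the file does not exist.
_train_fp = Path('data', 'train.json')
if _train_fp.exists():
    _train_stats = generate_stats(_train_fp)
    print(_train_stats['size'], _train_stats['Distinct sentiment 1'])
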
def augmented_dataset(target_related_words_sim: Dict[str, List[Tuple[str,
                                                                     float]]],
                      dataset: TargetCollection, save_fp: Path,
                      lower: bool) -> None:
    '''
    Given a dictionary mapping each target word from the training dataset to 
    a list of its related words and their similarity scores, writes every 
    sample of the TDSA training dataset whose target appears as a key in that 
    dictionary to the save file, together with the related targets and 
    similarity scores (sorted by descending similarity) under the keys 
    `alternative_targets` and `alternative_similarity`.

    :param target_related_words_sim: dictionary mapping a target word to a 
                                     list of (related word, similarity score) 
                                     tuples.
    :param dataset: TDSA training dataset to augment.
    :param save_fp: file path to write the augmented samples to (JSON Lines 
                    format).
    :param lower: whether to lower case a sample's target before looking it 
                  up in the dictionary.
    '''
    training_targets_in_embeddings = set(target_related_words_sim)
    with save_fp.open('w+') as save_file:
        count = 0
        for target_dict in dataset.data_dict():
            original_target = target_dict['target']
            if lower:
                original_target = original_target.lower()
            if original_target in training_targets_in_embeddings:
                alt_targets_similarity = target_related_words_sim[
                    original_target]
                alt_targets_similarity = sorted(alt_targets_similarity,
                                                key=lambda x: x[1],
                                                reverse=True)
                different_targets = [
                    target for target, _ in alt_targets_similarity
                ]
                alternative_similarity = [
                    similarity for _, similarity in alt_targets_similarity
                ]
                target_dict['alternative_targets'] = different_targets
                target_dict['alternative_similarity'] = alternative_similarity
                # Ensure `epoch_number` is a JSON serialisable list.
                target_dict['epoch_number'] = list(target_dict['epoch_number'])
                json_target_dict = json.dumps(target_dict)
                # Newline delimit every record after the first (JSON Lines).
                if count != 0:
                    json_target_dict = f'\n{json_target_dict}'
                count += 1
                save_file.write(json_target_dict)
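
# Each line written by `augmented_dataset` is a normal sample dictionary with
# two extra, aligned lists. A reading-back sketch with a hypothetical save
# path, guarded so it is a no-op when the file does not exist.
_augmented_fp = Path('augmented_train.json')
if _augmented_fp.exists():
    with _augmented_fp.open('r') as aug_file:
        for line in aug_file:
            sample = json.loads(line)
            # Alternatives are sorted by descending similarity, so the first
            # entry in each list is the most similar replacement target.
            best_target = sample['alternative_targets'][0]
            best_score = sample['alternative_similarity'][0]
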
    def _predict_iter(self, data: TargetCollection
                      ) -> Generator[Dict[str, Any], None, None]:
        '''
        Iterates over the predictions and yields one prediction at a time.

        This is a useful wrapper as it performs the data pre-processing and 
        assertion checks.

        :param data: Data to predict on
        :yields: A dictionary containing `class_probabilities` and `label`.
        '''
        no_model_error = 'There is no model to make predictions, either fit '\
                         'or load a model.'
        assert self.model, no_model_error
        self.model.eval()

        all_model_params = Params.from_file(self._param_fp)

        reader_params = all_model_params.get("dataset_reader")
        dataset_reader = DatasetReader.from_params(reader_params)
        predictor = TargetPredictor(self.model, dataset_reader)

        # Use the batch size from the training configuration's iterator if one
        # was given, otherwise fall back to a default of 64.
        batch_size = 64
        if 'iterator' in all_model_params:
            iter_params = all_model_params.get("iterator")
            if 'batch_size' in iter_params:
                batch_size = iter_params['batch_size']
        
        json_data = data.data_dict()
        # Reference
        # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
        for i in range(0, len(json_data), batch_size):
            json_data_batch = json_data[i:i + batch_size]
            predictions = predictor.predict_batch_json(json_data_batch)
            for prediction in predictions:
                yield prediction
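
# A minimal, model-free sketch of the batching idiom used in `_predict_iter`:
# consecutive `batch_size` sized slices cover the data exactly once, with the
# final slice holding whatever remains.
_example_batch_size = 4
_example_data = list(range(10))
_example_batches = [_example_data[i:i + _example_batch_size]
                    for i in range(0, len(_example_data), _example_batch_size)]
assert _example_batches == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]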