def test_targetcoll_get(self):
    '''
    Test the __getitem__ function of TargetCollection
    '''

    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 1)
    target_example_2 = Target([(1, 2)], '2', 'Iphone',
                              'text with Iphone', 1)
    target_col = TargetCollection([target_example_0, target_example_1,
                                   target_example_2])
    # Test normal case
    self.assertEqual(target_example_2, target_col['2'], msg='Cannot access '\
                     'data using keys. key used {} collection {}'\
                     .format('2', target_col))
    self.assertEqual(target_example_1, target_col.get('3'), msg='Cannot '\
                     'access data using the get method.')
    self.assertEqual(None, target_col.get('5'), msg='Default value for '\
                     'get not working should be None not {}'\
                     .format(target_col.get('5')))

    # Test that it raises a KeyError when key does not exist
    with self.assertRaises(KeyError, msg='Should produce a key error when '\
                           'the data does not exist'):
        target_col['5']
def dong(file_path):
    '''
    Given the file path to the `Li Dong
    <https://github.com/bluemonk482/tdparse/tree/master/data/lidong>`_
    sentiment data it will parse the data and return it as a
    TargetCollection of Target instances.

    :param file_path: File Path to the annotated data
    :type file_path: String
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''

    file_path = os.path.abspath(file_path)
    if not os.path.isfile(file_path):
        raise FileNotFoundError('This file does not exist {}'.format(file_path))
    file_name, _ = os.path.splitext(os.path.basename(file_path))
    sentiment_range = [-1, 0, 1]

    sentiment_data = TargetCollection()
    with open(file_path, 'r') as dong_file:
        sent_dict = {}
        for index, line in enumerate(dong_file):
            divisible = index + 1
            line = line.strip()
            if divisible % 3 == 1:
                sent_dict['text'] = line
            elif divisible % 3 == 2:
                sent_dict['target'] = line
            elif divisible % 3 == 0:
                sentiment = int(line)
                if sentiment not in sentiment_range:
                    raise ValueError('The sentiment has to be one of the '\
                                     'following values {} not {}'\
                                     .format(sentiment_range, sentiment))
                sent_dict['sentiment'] = int(line)
                text = sent_dict['text'].lower()
                target = sent_dict['target'].lower()
                offsets = [match.span() for match
                           in re.finditer(target, text)]
                if len(target.split()) > 1:
                    joined_target = ''.join(target.split())
                    offsets.extend([match.span() for match
                                    in re.finditer(joined_target, text)])
                sent_dict['spans'] = offsets
                sent_id = file_name + str(len(sentiment_data))
                # Sentence ID is the same as the target as there is only one
                # target per sentence
                sent_dict['sentence_id'] = sent_id
                sent_dict['target_id'] = sent_id
                sent_target = Target(**sent_dict)
                sentiment_data.add(sent_target)
                sent_dict = {}
            else:
                raise Exception('Line {} could not be parsed: every line '
                                'should belong to a text, target, sentiment '
                                'triple'.format(index))
    return sentiment_data
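# A minimal usage sketch for `dong` (hedged: the file path below is a
# hypothetical placeholder for a Li Dong formatted file, i.e. repeating
# text / target / sentiment line triples):
# train_collection = dong('data/lidong/training/train.txt')
# print(len(train_collection))  # number of parsed Target instances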
def test_targetcoll_add(self):
    '''
    Test the add function of TargetCollection
    '''

    target_col = TargetCollection()
    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 1)

    # Ensure the normal case works
    target_col.add(target_example_0)
    self.assertEqual(target_col['1'], target_example_0, msg='Test that {}'\
                     ' has been added to {}'\
                     .format(target_example_0, target_col))

    with self.assertRaises(TypeError, msg='Should not be able to add a dict'):
        target_col.add({'target_id': '2'})
    with self.assertRaises(ValueError, msg='Should not be able to add a '\
                           'Target that has no `id`'):
        del target_example_1['target_id']
        if 'target_id' in target_example_1:
            raise KeyError('{} should not contain `id` key'\
                           .format(target_example_1))
        target_col.add(target_example_1)
def same_one_sentiment(test_dataset: TargetCollection,
                       train_dataset: TargetCollection,
                       lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset
    sample ids whose targets are associated with exactly one sentiment
    label, and that label is the same in the train and test sets.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_target_sentiments = target_sentiments(train_dataset, lower)
    test_target_sentiments = target_sentiments(test_dataset, lower)

    same_one_sentiments = set()
    for data in test_dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if (target in train_target_sentiments and
                target in test_target_sentiments):
            train_sentiments = train_target_sentiments[target]
            test_sentiments = test_target_sentiments[target]
            if (len(train_sentiments) == 1 and
                    len(test_sentiments) == 1):
                if train_sentiments == test_sentiments:
                    same_one_sentiments.add(target)

    same_one_samples = targets_to_samples(test_dataset, same_one_sentiments,
                                          lower)
    same_one_ids = [sample['target_id'] for sample in same_one_samples]
    return same_one_ids
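# Hedged illustration of `same_one_sentiment`; the two tiny collections below
# are invented for this sketch. 'camera' carries a single, identical sentiment
# label in both splits, so the id of the test sample containing it is returned.
_soc_train = TargetCollection([
    Target([(4, 10)], 'tr1', 'camera', 'the camera is great', 'pos')])
_soc_test = TargetCollection([
    Target([(2, 8)], 'te1', 'camera', 'a camera to recommend', 'pos'),
    Target([(4, 10)], 'te2', 'screen', 'the screen cracked', 'neg')])
print(same_one_sentiment(_soc_test, _soc_train))  # expected: ['te1']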
def different_sentiment(test_dataset: TargetCollection,
                        train_dataset: TargetCollection,
                        lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset
    sample ids that contain targets whose sentiment labels in the test set
    have no overlap with their sentiment labels in the train set.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_target_sentiments = target_sentiments(train_dataset, lower)
    test_target_sentiments = target_sentiments(test_dataset, lower)

    different_sentiments = set()
    for data in test_dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if (target in train_target_sentiments and
                target in test_target_sentiments):
            train_sentiments = train_target_sentiments[target]
            test_sentiments = test_target_sentiments[target]
            if not test_sentiments.intersection(train_sentiments):
                different_sentiments.add(target)

    different_samples = targets_to_samples(test_dataset, different_sentiments,
                                           lower)
    different_ids = [sample['target_id'] for sample in different_samples]
    return different_ids
def unknown_targets(test_dataset: TargetCollection,
                    train_dataset: TargetCollection,
                    lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset
    sample ids that contain targets that did not exist in the training data.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_target_sentiments = target_sentiments(train_dataset, lower)
    test_target_sentiments = target_sentiments(test_dataset, lower)

    unknowns = set()
    for data in test_dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if (target in train_target_sentiments and
                target in test_target_sentiments):
            continue
        else:
            unknowns.add(target)

    unknown_samples = targets_to_samples(test_dataset, unknowns, lower)
    unknown_ids = [sample['target_id'] for sample in unknown_samples]
    return unknown_ids
def semeval_15_16(file_path, sep_16_from_15=False):
    '''
    Parser for the SemEval 2015 and 2016 datasets.

    :param file_path: File path to the SemEval 2015/2016 data
    :param sep_16_from_15: Ensure that the SemEval 2016 test set is
                           completely separate from the SemEval 2015 test set
    :type file_path: String
    :type sep_16_from_15: bool. Default False
    :returns: A TargetCollection containing Target instances.
    :rtype: TargetCollection
    '''
    file_path = os.path.abspath(file_path)
    file_name, _ = os.path.splitext(os.path.basename(file_path))
    tree = ET.parse(file_path)
    reviews = tree.getroot()
    all_aspect_term_data = []
    if reviews.tag != 'Reviews':
        raise ValueError('The root of all semeval 15/16 xml files should '\
                         'be reviews and not {}'\
                         .format(reviews.tag))
    for review in reviews:
        review_id = review.attrib['rid']
        for sentences in review:
            if sep_16_from_15:
                ids_to_skip = ["en_SnoozeanAMEatery_480032670:4"]
                review_targets = _semeval_extract_data(
                    sentences, file_name, sentence_ids_skip=ids_to_skip)
                all_aspect_term_data.extend(review_targets.data())
            else:
                review_targets = _semeval_extract_data(sentences,
                                                       file_name).data()
                all_aspect_term_data.extend(review_targets)
    return TargetCollection(all_aspect_term_data)
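# Hedged usage sketch for `semeval_15_16` (the XML path is a hypothetical
# placeholder for a SemEval 2015/2016 review file):
# restaurant_train = semeval_15_16('data/semeval_2016_restaurant_train.xml')
# print(len(restaurant_train))  # number of aspect term Target instances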
def get_data(id_file, tweets_data, annos_data):
    targets = []
    with open(id_file, 'r') as id_data:
        for tweet_id in id_data:
            tweet_id = tweet_id.strip()
            tweet_data = tweets_data[tweet_id]
            anno_data = annos_data[tweet_id]
            targets.extend(parse_tweet(tweet_data, anno_data, tweet_id))
    return TargetCollection(targets)
def test_targetcoll_stored_sent(self):
    '''
    Test the stored_sentiments function of TargetCollection
    '''

    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 1)
    target_example_2 = Target([(1, 2)], '2', 'Iphone',
                              'text with Iphone', -1)
    target_col = TargetCollection([target_example_0, target_example_1,
                                   target_example_2])
    valid_sentiments = set([1, -1])
    test_sentiments = target_col.stored_sentiments()
    self.assertEqual(valid_sentiments, test_sentiments, msg='The unique '\
                     'sentiments in the TargetCollection should be {} and '\
                     'not {}'.format(valid_sentiments, test_sentiments))
def test_target_coll_subset_by_sent(self):
    '''
    Test the subset_by_sentiment function of TargetCollection
    '''

    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 'pos', sentence_id='4')
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 'neg', sentence_id='4')
    target_example_2 = Target([(1, 2)], '2', 'Iphone',
                              'text with Iphone', 'neg', sentence_id='5')
    target_example_3 = Target([(1, 2)], '4', 'Iphone',
                              'text with Iphone', 'neg', sentence_id='5')
    target_example_4 = Target([(1, 2)], '5', 'Iphone',
                              'text with Iphone', 'pos', sentence_id='6')
    target_example_5 = Target([(1, 2)], '6', 'Iphone',
                              'text with Iphone', 'neu', sentence_id='6')
    target_example_6 = Target([(1, 2)], '7', 'Iphone',
                              'text with Iphone', 'neg', sentence_id='6')
    target_example_7 = Target([(1, 2)], '8', 'Iphone',
                              'text with Iphone', 'neg', sentence_id='7')
    target_example_8 = Target([(1, 2)], '9', 'Iphone',
                              'text with Iphone', 'neg', sentence_id='8')
    target_example_9 = Target([(1, 2)], '10', 'Iphone',
                              'text with Iphone', 'neg', sentence_id='8')
    target_example_10 = Target([(1, 2)], '11', 'Iphone',
                               'text with Iphone', 'pos', sentence_id='8')
    all_targets = [target_example_0, target_example_1, target_example_2,
                   target_example_3, target_example_4, target_example_5,
                   target_example_6, target_example_7, target_example_8,
                   target_example_9, target_example_10]
    target_col = TargetCollection(all_targets)

    # Test for 2 unique sentiments per sentence
    test_col = target_col.subset_by_sentiment(2)
    valid_col = TargetCollection([target_example_0, target_example_1,
                                  target_example_8, target_example_9,
                                  target_example_10])
    self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                     ' but has returned this {}'.format(valid_col, test_col))

    # Test for 1 unique sentiment per sentence
    test_col = target_col.subset_by_sentiment(1)
    valid_col = TargetCollection([target_example_7, target_example_2,
                                  target_example_3])
    self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                     ' but has returned this {}'.format(valid_col, test_col))

    # Test for 3 unique sentiments per sentence
    test_col = target_col.subset_by_sentiment(3)
    valid_col = TargetCollection([target_example_4, target_example_5,
                                  target_example_6])
    self.assertEqual(valid_col, test_col, msg='Should only return these {}'\
                     ' but has returned this {}'.format(valid_col, test_col))
def generate_stats(data_path: Path) -> Dict[str, Union[int, float]]:
    target_data = []
    with data_path.open('r') as data_lines:
        for line in data_lines:
            line = json.loads(line)
            line['spans'] = [tuple(span) for span in line['spans']]
            target_data.append(Target(**line))
    target_data = TargetCollection(target_data)

    target_stats = defaultdict(lambda: 0)
    data_size = len(target_data)
    target_stats['size'] = data_size
    for i in range(1, 3):
        target_stats[f'Distinct sentiment {i}'] = len(
            target_data.subset_by_sentiment(i))
    for data in target_data.data_dict():
        target_stats[data['sentiment']] += 1
    for key, value in target_stats.items():
        if key == 'size':
            continue
        target_stats[key] = value / data_size
    return target_stats
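# Hedged usage sketch for `generate_stats` (the path is a hypothetical
# placeholder for a JSON-lines file where each line is a serialised Target):
# stats = generate_stats(Path('data/Laptop Train'))
# print(stats['size'], stats['Distinct sentiment 1'], stats['Distinct sentiment 2'])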
def test_targetcoll_constructor(self):
    '''
    Tests TargetCollection constructor
    '''

    target_example = Target([(3, 5), (6, 8)], '1', 'Iphone',
                            'text with Iphone', 1)

    # Test that it will not accept anything but Target instances
    with self.assertRaises(TypeError, msg='The constructor should only '\
                           'accept an iterator as an argument'):
        TargetCollection(1)
    with self.assertRaises(TypeError, msg='The constructor should only '\
                           'accept an iterator of Target instances'):
        TargetCollection([1, 2, 3, 4])
    # Should accept the following without any problems
    TargetCollection([])
    TargetCollection([target_example])
    TargetCollection()

    # Testing the case where the list of Targets contains duplicate keys
    another_example = Target([(3, 4)], '2', 'Keys', 'text with Keys', -1)
    dup_example = Target([(3, 10)], '1', 'Pixel', 'text with Pixels', 0)
    with self.assertRaises(KeyError, msg='Should raise an error as two of '\
                           'the target instances have the same key'):
        TargetCollection([target_example, another_example, dup_example])
def split_tests(data_: TargetCollection, train_: TargetCollection,
                test_: TargetCollection, test_split: float):
    data_size = len(data_)
    train_size = len(train_)
    test_size = len(test_)

    assert train_size == (data_size - int(data_size * test_split))
    assert test_size == int(data_size * test_split)
    assert data_size == (train_size + test_size)

    train_ids = []
    test_ids = []
    for data in train_.data_dict():
        train_ids.append(re.findall(r'\d+', data['target_id'])[0])
    for data in test_.data_dict():
        test_ids.append(re.findall(r'\d+', data['target_id'])[0])
    assert len(train_ids) == len(set(train_ids))
    assert len(test_ids) == len(set(test_ids))

    for train_id in train_ids:
        assert train_id not in test_ids
    for test_id in test_ids:
        assert test_id not in train_ids
    return train_ids, test_ids
def _data_to_json(data: TargetCollection, file_path: Path) -> None:
    '''
    Converts the data into json format and saves it to the given file path.
    The AllenNLP models read the data from json formatted files.

    :param data: data to be saved into json format.
    :param file_path: file location to save the data to.
    '''
    target_data = data.data_dict()
    with file_path.open('w+') as json_file:
        for index, data in enumerate(target_data):
            if 'epoch_number' in data:
                data['epoch_number'] = list(data['epoch_number'])
            json_encoded_data = json.dumps(data)
            if index != 0:
                json_encoded_data = f'\n{json_encoded_data}'
            json_file.write(json_encoded_data)
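# Hedged sketch of calling `_data_to_json` (both names below are placeholders
# for any TargetCollection and any output path):
# _data_to_json(train_collection, Path('data/train.json'))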
def augmented_dataset(target_related_words_sim: Dict[str, List[Tuple[str, float]]],
                      dataset: TargetCollection, save_fp: Path,
                      lower: bool) -> None:
    '''
    Given a dictionary mapping each target word from the training dataset to
    its related words and their similarity scores, together with the TDSA
    training dataset, it will check for each training sample whether the
    sample's target exists as a key in the dictionary and, if so, write the
    sample to the save file along with the related targets and similarity
    scores under the following keys: `alternative_targets` and
    `alternative_similarity`.
    '''
    training_targets_in_embeddings = set(list(target_related_words_sim.keys()))
    with save_fp.open('w+') as save_file:
        count = 0
        for target_dict in dataset.data_dict():
            original_target = target_dict['target']
            if lower:
                original_target = original_target.lower()
            if original_target in training_targets_in_embeddings:
                alt_targets_similarity = target_related_words_sim[
                    original_target]
                alt_targets_similarity = sorted(alt_targets_similarity,
                                                key=lambda x: x[1],
                                                reverse=True)
                different_targets = [
                    target for target, _ in alt_targets_similarity
                ]
                alternative_similarity = [
                    similarity for _, similarity in alt_targets_similarity
                ]
                target_dict['alternative_targets'] = different_targets
                target_dict['alternative_similarity'] = alternative_similarity
                target_dict['epoch_number'] = list(target_dict['epoch_number'])
                json_target_dict = json.dumps(target_dict)
                if count != 0:
                    json_target_dict = f'\n{json_target_dict}'
                count += 1
                save_file.write(json_target_dict)
def test_targetcoll_data(self):
    '''
    Test the data function of TargetCollection
    '''

    target_col = TargetCollection()
    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 1)
    target_col.add(target_example_0)
    target_col.add(target_example_1)

    all_data = target_col.data()
    self.assertEqual(target_example_0, all_data[0], msg='First data '\
                     'returned should be the first inserted {} and not '\
                     '{}'.format(target_example_0, all_data[0]))
    self.assertEqual(target_example_1, all_data[1], msg='Second data '\
                     'returned should be the second inserted {} and not '\
                     '{}'.format(target_example_1, all_data[1]))

    target_example_2 = Target([(1, 2)], '2', 'Iphone', 'text with Iphone', 1)
    del target_col['1']
    target_col.add(target_example_2)
    all_data = target_col.data()
    self.assertEqual(target_example_1, all_data[0], msg='First data '\
                     'returned should be the second inserted {} and not '\
                     '{} as the first has been removed'\
                     .format(target_example_1, all_data[0]))
    self.assertEqual(target_example_2, all_data[1], msg='Second data '\
                     'returned should be the third inserted {} and not '\
                     '{} as the first has been removed'\
                     .format(target_example_2, all_data[1]))
    self.assertEqual(2, len(all_data), msg='The length of the data returned '\
                     'should be 2 and not {}'.format(len(all_data)))
def test_targetcoll_set(self):
    '''
    Test the __setitem__ function of TargetCollection
    '''

    target_example_0 = Target([(3, 5), (6, 8)], '1', 'Iphone',
                              'text with Iphone', 1)
    target_example_1 = Target([(1, 5)], '3', 'Iphone',
                              'text with Iphone', 1)
    target_example_2 = Target([(1, 2)], '2', 'Iphone',
                              'text with Iphone', 1)
    target_col = TargetCollection([target_example_0, target_example_1,
                                   target_example_2])
    target_example_3 = Target([(2, 4)], '5', 'new', 'new text', 0)
    target_example_4 = Target([(1, 3)], '6', 'another', 'another text', 1)
    target_example_5 = Target([(1, 3)], '7', 'another', 'another text', 1)
    target_diff_1 = Target([(4, 5)], '3', 'test', 'test text', 0)

    # Normal case adding a new value
    target_col['5'] = target_example_3
    self.assertEqual(target_col['5'], target_example_3, msg='Cannot add '\
                     'new value. store {} value added {}'\
                     .format(target_col, target_example_3))
    # If key already exists it cannot be added
    with self.assertRaises(KeyError, msg='Should not be able to add value '\
                           '{} as its key {} already exists {}'\
                           .format(target_diff_1, '3', target_col)):
        target_col['3'] = target_diff_1
    with self.assertRaises(KeyError, msg='Value with a different `id` to '\
                           'the key should fail. Key {} Value {}'\
                           .format('7', target_example_4)):
        target_col['7'] = target_example_4
    # Should accept Target instance with no `id`
    del target_example_5['target_id']
    if 'target_id' in target_example_5:
        raise KeyError('{} should not contain `id` key'\
                       .format(target_example_5))
    target_col['8'] = target_example_5
def multi_word_targets(dataset: TargetCollection,
                       tokeniser: Callable[[str], List[str]],
                       lower: bool = True) -> List[str]:
    '''
    Given a dataset it will return all of the targets tokenised and then
    re-combined with `_` to show the multiple words that make up the target
    if it contains multiple words.

    :param dataset: The dataset that contains all of the targets.
    :param tokeniser: The tokeniser used to decide whether a target is made
                      up of multiple words
    :param lower: Whether to lower case the target words.
    :returns: A list of targets where any made up of multiple words will now
              have an `_` between them e.g. `tesco supermarket` would become
              `tesco_supermarket`
    '''
    targets = dataset.target_set(lower=lower)
    tokenised_targets = [tokeniser(target) for target in targets]
    multi_word_targets = ['_'.join(target) for target in tokenised_targets]
    return multi_word_targets
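# Hedged example of `multi_word_targets`; `str.split` stands in for a real
# tokeniser and the two-sample collection below is invented for illustration.
_mwt_example = TargetCollection([
    Target([(0, 17)], 't1', 'tesco supermarket',
           'tesco supermarket was busy', 'neg'),
    Target([(4, 10)], 't2', 'camera', 'the camera is great', 'pos')])
# Expected (order may vary): ['tesco_supermarket', 'camera']
print(multi_word_targets(_mwt_example, tokeniser=str.split))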
def target_sentiments(dataset: TargetCollection,
                      lower: bool = True) -> Dict[str, Set[Any]]:
    '''
    Given a dataset will return a dictionary of targets and the sentiments
    that have been associated to those targets. E.g. within the dataset the
    target `camera` may have only been seen with a positive and a negative
    label but not neutral, therefore in the returned dictionary it would be
    {`camera`: [`positive`, `negative`]}

    :param dataset: TargetCollection containing samples
    :param lower: Whether to lower case the target words.
    :returns: A dictionary where the keys are target words and the values are
              the sentiment values that have been associated to those targets.
    '''
    targets_sentiments = defaultdict(set)
    for data in dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        targets_sentiments[target].add(data['sentiment'])
    return targets_sentiments
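# Hedged illustration of `target_sentiments`; the two-sample collection below
# is invented: 'camera' appears once with 'pos' and once with 'neg'.
_ts_example = TargetCollection([
    Target([(4, 10)], 'ts1', 'camera', 'the camera is great', 'pos'),
    Target([(4, 10)], 'ts2', 'camera', 'the camera broke quickly', 'neg')])
print(dict(target_sentiments(_ts_example)))  # expected: {'camera': {'pos', 'neg'}}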
def similar_sentiment(test_dataset: TargetCollection,
                      train_dataset: TargetCollection,
                      lower: bool = True) -> List[str]:
    '''
    Given a test and train dataset will return all of the test dataset
    sample ids that contain targets that occur with more than one sentiment
    label in the train or test set, where the test and train sentiment sets
    overlap but are not identical. E.g. the target `camera` could occur with
    `positive` and `negative` sentiment in the test set and only `negative`
    in the train set.

    :param test_dataset: Test TargetCollection
    :param train_dataset: Train TargetCollection
    :param lower: Whether to lower case the target words
    :returns: A list of sample ids from the test dataset.
    '''
    train_target_sentiments = target_sentiments(train_dataset, lower)
    test_target_sentiments = target_sentiments(test_dataset, lower)

    similar_sentiments = set()
    for data in test_dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if (target in train_target_sentiments and
                target in test_target_sentiments):
            train_sentiments = train_target_sentiments[target]
            test_sentiments = test_target_sentiments[target]
            if (len(train_sentiments) > 1 or
                    len(test_sentiments) > 1):
                if train_sentiments == test_sentiments:
                    continue
                if test_sentiments.intersection(train_sentiments):
                    similar_sentiments.add(target)

    similar_samples = targets_to_samples(test_dataset, similar_sentiments,
                                         lower)
    similar_ids = [sample['target_id'] for sample in similar_samples]
    return similar_ids
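# Hedged illustration of `similar_sentiment`, mirroring the docstring's
# `camera` example with invented collections: the test sentiments for
# 'camera' ({'pos', 'neg'}) overlap with but do not equal the train
# sentiments ({'neg'}), so both test sample ids are returned.
_sim_train = TargetCollection([
    Target([(4, 10)], 'str1', 'camera', 'the camera broke quickly', 'neg')])
_sim_test = TargetCollection([
    Target([(4, 10)], 'ste1', 'camera', 'the camera is great', 'pos'),
    Target([(2, 8)], 'ste2', 'camera', 'a camera best avoided', 'neg')])
print(similar_sentiment(_sim_test, _sim_train))  # expected: ['ste1', 'ste2']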
def _predict_iter(self, data: TargetCollection
                  ) -> Generator[Dict[str, Any], None, None]:
    '''
    Iterates over the predictions and yields one prediction at a time.
    This is a useful wrapper as it performs the data pre-processing and
    assertion checks.

    :param data: Data to predict on
    :yields: A dictionary containing `class_probabilities` and `label`.
    '''
    no_model_error = 'There is no model to make predictions, either fit '\
                     'or load a model.'
    assert self.model, no_model_error
    self.model.eval()

    all_model_params = Params.from_file(self._param_fp)
    reader_params = all_model_params.get("dataset_reader")
    dataset_reader = DatasetReader.from_params(reader_params)
    predictor = TargetPredictor(self.model, dataset_reader)

    batch_size = 64
    if 'iterator' in all_model_params:
        iter_params = all_model_params.get("iterator")
        if 'batch_size' in iter_params:
            batch_size = iter_params['batch_size']

    json_data = data.data_dict()
    # Reference:
    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    for i in range(0, len(json_data), batch_size):
        json_data_batch = json_data[i:i + batch_size]
        predictions = predictor.predict_batch_json(json_data_batch)
        for prediction in predictions:
            yield prediction
def targets_to_samples(dataset: TargetCollection, targets: Set[str],
                       lower: bool = True) -> List[Target]:
    '''
    Given a dataset and a set of target words, it will return a subset of the
    dataset where all samples in the subset have target words that are in the
    targets set.

    :param dataset: TargetCollection containing samples
    :param targets: A set of target words used to subset the dataset
    :param lower: Whether to lower case the target words. If this is True it
                  is up to you to ensure all the words in the `targets` set
                  have been lower cased.
    :returns: A subset of the dataset where all targets in the subset are
              within the `targets` set.
    '''
    samples = []
    for data in dataset.data():
        target = data['target']
        if lower:
            target = target.lower()
        if target in targets:
            samples.append(data)
    return samples
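# Hedged example of `targets_to_samples`; the collection and target set below
# are invented. Only the sample whose target is in the given set is returned.
_sub_example = TargetCollection([
    Target([(4, 10)], 's1', 'camera', 'the camera is great', 'pos'),
    Target([(4, 10)], 's2', 'screen', 'the screen cracked', 'neg')])
print(targets_to_samples(_sub_example, {'camera'}))  # one Target: id 's1'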
import argparse
import json
from pathlib import Path

from bella.data_types import TargetCollection


def parse_path(path_string: str) -> Path:
    path_string = Path(path_string).resolve()
    return path_string


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("tdsa_dataset_fp", type=parse_path,
                        help="File path to the TDSA dataset")
    parser.add_argument("target_set_fp", type=parse_path,
                        help='File path to save the targets to')
    args = parser.parse_args()

    data = TargetCollection.load_from_json(args.tdsa_dataset_fp)
    all_targets = list(data.target_set())
    with args.target_set_fp.open('w+') as target_set_file:
        json.dump(all_targets, target_set_file)
lr_dir.mkdir(parents=True, exist_ok=True)
model_config_dir = args.model_config_dir
# Models
model_names = ['atae', 'bilinear', 'ian', 'tdlstm', 'tclstm']
bilstm_names = ['']
all_model_names = product(model_names, bilstm_names)
all_models = []
for model_name, bilstm_name in all_model_names:
    model_name = f'{model_name}{bilstm_name}'.strip()
    model_config_fp = Path(model_config_dir, f'{model_name}.json')
    all_models.append(AllenNLPModel(model_name, model_config_fp))
# Data
dataset_name = args.dataset_name
train = TargetCollection.load_from_json(args.train_fp)

model_data = product(all_models, [train])
for model, train_data in model_data:
    logger.info(f'Finding learning rate for {model.name} on {dataset_name} '
                f'dataset using {args.num_batches} batches')
    model_dir = Path(lr_dir, model.name)
    data_model_dir = Path(model_dir, f'{dataset_name}')
    if not data_model_dir.exists():
        data_model_dir.mkdir(parents=True)
    try:
        model.find_learning_rate(train, data_model_dir,
                                 {'num_batches': args.num_batches})
    except:
        error_msg = f'Finding learning rate for {model.name} on {dataset_name} ' \
    'ian': 'IAN', 'tdlstm': 'TDLSTM', 'tclstm': 'TCLSTM'
}
dataset_name = args.dataset_name
embedding_folder_names = [
    'baseline', 'baseline_lm', 'baseline_lm_embedding', 'ds_embedding',
    'ds_lm', 'ds_lm_embedding', 'ds_lm_ds_embedding'
]
embedding_names = [
    'Glove', 'ELMo T', 'ELMo T + Glove', 'DS Word2Vec', 'DS ELMo T',
    'DS ELMo T + Glove', 'DS ELMo T + DS Glove'
]
test_data_fp = args.test_data_fp
test_data = TargetCollection.load_from_json(test_data_fp)
dataset_name_flag = [('Test', test_data, True)]
if args.val_fp:
    val_data = TargetCollection.load_from_json(args.val_fp)
    dataset_name_flag.append(('Validation', val_data, False))

y_score = []
x_model_names = []
embedding_color = []
dataset_split_names = []
embedding_folder_and_names = list(
    zip(embedding_folder_names, embedding_names))
for dataset_split_name, dataset, test_flag in dataset_name_flag:
    for embedding_folder_name, embedding_name in embedding_folder_and_names:
        for model_name in model_names:
    similarity_field = np.array(target_data[similarity_field_name])
    if is_language_model and threshold:
        similarity_field = target_data['original_perplexity'] - similarity_field
        above_original_perplexity_index = np.argmax((similarity_field <= 0) + 0)
        similarity_field = similarity_field[:above_original_perplexity_index]
        if len(similarity_field) < k_similar:
            temp_k = len(similarity_field)
    elif (not is_language_model) and threshold:
        above_threshold_index = np.argmin((similarity_field >= threshold) + 0)
        similarity_field = similarity_field[:above_threshold_index]
        if len(similarity_field) < k_similar:
            temp_k = len(similarity_field)
    # For each of the filtered alternative targets it creates a json
    # like object that will be used to store it in a collection to then
    # save to a json file
    alternative_targets = target_data['alternative_targets'][:temp_k]
    for index, alternative_target in enumerate(alternative_targets):
        new_target_id = f'{target_id}_{index}'
        new_target_data = exchange_targets(target_data, alternative_target,
                                           new_target_id)
        # sanitizing the target dataset.
        new_target_data.pop('alternative_targets')
        new_target_data.pop(similarity_field_name)
        if is_language_model:
            new_target_data.pop('original_perplexity')
        new_target_dataset.append(Target(**new_target_data))
print(f'Size of the expanded dataset {len(new_target_dataset)}')
new_target_dataset = TargetCollection(new_target_dataset)
new_target_dataset.to_json_file(str(args.dataset_save_fp), cache=False)
    'tclstm': 'TCLSTM', 'atae_ds_lm_embedding': 'ATAE ELMo T'
}
dataset_name = args.dataset_name
values_of_k = [2, 3, 5, 10]
augmentation_technique = ['embedding', 'lm']
thresholding = ['no_threshold_', 'threshold_']
technique_threshold_mapper = {
    'no_threshold_embedding': 'Embedding',
    'threshold_embedding': 'Embedding T',
    'no_threshold_lm': 'LM',
    'threshold_lm': 'LM T'
}
test_data_fp = args.test_data_fp
test_data = TargetCollection.load_from_json(test_data_fp)
test = True
if args.val:
    test = False

folder_names = list(
    itertools.product(thresholding, augmentation_technique, values_of_k))
y_score = []
x_k = []
all_techniques = []
all_model_names = []
for threshold, tech, k in folder_names:
    folder_name = f'{threshold}{tech}{str(k)}'
    }
else:
    model_name_mapper = {
        model_name: model_name for model_name in model_names
    }
bilstm_names = ['']
all_model_names = product(model_names, bilstm_names)
all_models = []
for model_name, bilstm_name in all_model_names:
    model_name = f'{model_name}{bilstm_name}'.strip()
    model_config_fp = Path(model_config_dir, f'{model_name}.json')
    all_models.append(
        AllenNLPModel(model_name_mapper[model_name], model_config_fp))
# Data
path_dataset = lambda _dir, name: TargetCollection.load_from_json(
    Path(_dir, name))
train = path_dataset(data_dir, f'{dataset_name} Train')
logger.info(f'Size of the original dataset {len(train)}')
if augmented_fp is not None:
    augmented_data = TargetCollection.load_from_json(augmented_fp)
    logger.info(f'Size of the augmented dataset {len(augmented_data)}')
    train = TargetCollection.combine_collections(train, augmented_data)
    logger.info(f'Size of the training augmented dataset {len(train)}')
val = path_dataset(data_dir, f'{dataset_name} Val')
logger.info(f'Size of the validation set {len(val)}')
test = path_dataset(data_dir, f'{dataset_name} Test')
logger.info(f'Size of the test set {len(test)}')

model_data = product(all_models, [(train, val, test)])
for model, data in model_data:
    logger.info(f'Running Model {model.name} on {dataset_name} dataset '
parser.add_argument("dataset_name", type=str,
                    choices=['Laptop', 'Restaurant', 'Election'])
parser.add_argument("--force_space", action='store_true',
                    help=force_space_help)
args = parser.parse_args()

data_dir = args.data_dir
dataset_name = args.dataset_name
tokeniser = spacy.blank('en')

split_names = ['Train', 'Val', 'Test']
for split_name in split_names:
    dataset_fp = Path(data_dir, f'{dataset_name} {split_name}')
    dataset = TargetCollection.load_from_json(dataset_fp)
    retrieve_target_from_tokens = []
    cannot_retrieve_target_from_tokens = []
    for target in dataset.data_dict():
        text = target['text']
        target_word = target['target'].strip()
        target_start_offset = target['spans'][0][0]
        target_end_offset = target['spans'][0][1]
        if args.force_space:
            before_text = text[:target_start_offset].strip()
            after_text = text[target_end_offset:].strip()
            new_target_word = f' {target_word} '
            target_start_offset = len(before_text) + 1
            target_end_offset = target_start_offset + len(target_word)
                    help="File path to the embedding", type=parse_path)
parser.add_argument("additional_targets_fp", type=parse_path,
                    help='File Path to additional targets')
parser.add_argument("augmented_dataset_fp", type=parse_path,
                    help=augmented_dataset_help)
parser.add_argument("tokeniser", type=str, choices=tokeniser_choices)
parser.add_argument("--lower", action="store_true")
args = parser.parse_args()

# Load tokeniser
if args.tokeniser == 'spacy':
    tokeniser = allen_spacy_tokeniser
else:
    raise ValueError(
        f'Tokeniser has to be one of the following {tokeniser_choices}')

training_data = TargetCollection.load_from_json(args.train_fp)
embedding = Word2Vec.load(str(args.embedding_fp))
target_related_words_sim: Dict[str, List[Tuple[str, float]]]
target_related_words_sim = word_embedding_augmentation(training_data,
                                                       embedding,
                                                       lower=args.lower,
                                                       k_nearest=-1,
                                                       tokeniser=tokeniser)
augmented_dataset(target_related_words_sim, training_data,
                  args.augmented_dataset_fp, lower=args.lower)