# Imports assumed by the snippets below; the module paths follow AllenNLP 0.x
# and pytorch-pretrained-bert, and the *_data_reader modules are
# project-local -- adjust to the actual package layout.
import os

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.token_indexers.wordpiece_indexer import WordpieceIndexer
from pytorch_pretrained_bert import BertTokenizer


def _get_data_reader(self):
    """Build the ACD + SC dataset reader with constituency features."""
    token_indexer = SingleIdTokenIndexer(namespace="tokens")
    bert_tokenizer = BertTokenizer.from_pretrained(
        self.bert_vocab_file_path, do_lower_case=True)
    # WordPiece indexer over the BERT vocabulary; pieces go into a separate
    # "bert" namespace, and long sequences are truncated to self.max_len pieces.
    bert_token_indexer = WordpieceIndexer(
        vocab=bert_tokenizer.vocab,
        wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
        namespace="bert",
        use_starting_offsets=False,
        max_pieces=self.max_len,
        do_lowercase=True,
        never_lowercase=None,
        start_tokens=None,
        end_tokens=None,
        separator_token="[SEP]",
        truncate_long_sequences=True)
    position_indexer = SingleIdTokenIndexer(namespace='position')
    sentence_constituency_indexer = SingleIdTokenIndexer(
        namespace='sentence_constituency')
    reader = acd_and_sc_data_reader.AcdAndScDatasetReaderConstituencyBertSingle(
        self.distinct_categories,
        self.distinct_polarities,
        tokenizer=self._get_word_segmenter(),
        token_indexers={"tokens": token_indexer},
        position_indexers={'position': position_indexer},
        configuration=self.configuration,
        bert_tokenizer=bert_tokenizer,
        bert_token_indexers={"bert": bert_token_indexer},
        sentence_constituency_indexer={
            'sentence_constituency': sentence_constituency_indexer
        })
    return reader
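# How the multiple indexers interact -- a minimal sketch assuming the
# AllenNLP 0.x API the readers above are written against: a TextField indexed
# by several TokenIndexers yields one tensor per indexer name, each drawing on
# its own vocabulary namespace. Illustration only, not project code.
from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField


def _demo_multi_namespace_indexing():
    field = TextField(
        [Token("good"), Token("food")],
        {"tokens": SingleIdTokenIndexer(namespace="tokens"),
         "position": SingleIdTokenIndexer(namespace="position")})
    instance = Instance({"sentence": field})
    vocab = Vocabulary.from_instances([instance])
    batch = Batch([instance])
    batch.index_instances(vocab)
    # -> {'sentence': {'tokens': tensor(...), 'position': tensor(...)}}
    print(batch.as_tensor_dict())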
def _get_data_reader(self):
    token_indexer = SingleIdTokenIndexer(namespace="tokens")
    position_indexer = SingleIdTokenIndexer(namespace='position')
    bert_tokenizer = BertTokenizer.from_pretrained(
        self.bert_vocab_file_path, do_lower_case=True)
    bert_token_indexer = WordpieceIndexer(
        vocab=bert_tokenizer.vocab,
        wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
        namespace="bert",
        use_starting_offsets=False,
        max_pieces=self.max_len,
        do_lowercase=True,
        never_lowercase=None,
        start_tokens=None,
        end_tokens=None,
        separator_token="[SEP]",
        truncate_long_sequences=True)
    reader = relation_classification_data_reader.MultiRelationClassificationBertDatasetReader(
        self.distinct_polarities,
        tokenizer=self._get_word_segmenter(),
        token_indexers={"tokens": token_indexer},
        position_indexers={'position': position_indexer},
        configuration=self.configuration,
        bert_tokenizer=bert_tokenizer,
        bert_token_indexers={"bert": bert_token_indexer})
    return reader
def _get_data_reader(self):
    bert_tokenizer = BertTokenizer.from_pretrained(
        self.bert_vocab_file_path, do_lower_case=True)
    bert_token_indexer = WordpieceIndexer(
        vocab=bert_tokenizer.vocab,
        wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
        namespace="bert",
        use_starting_offsets=False,
        max_pieces=self.max_len,
        do_lowercase=True,
        never_lowercase=None,
        start_tokens=None,
        end_tokens=None,
        separator_token="[SEP]",
        truncate_long_sequences=True)
    # Both branches used identical indexers and arguments; only the reader
    # class depends on whether the target is taken into account.
    token_indexer = SingleIdTokenIndexer(namespace="tokens")
    position_indexer = SingleIdTokenIndexer(namespace='position')
    if self.configuration['consider_target']:
        reader_class = tosc_data_reader.TextAspectInSentimentOutConsideringTargetBertForTOSC
    else:
        reader_class = tosc_data_reader.TextAspectInSentimentOutBertForTOSC
    reader = reader_class(
        self.distinct_polarities,
        tokenizer=self._get_word_segmenter(),
        token_indexers={"tokens": token_indexer},
        position_indexers={'position': position_indexer},
        configuration=self.configuration,
        bert_tokenizer=bert_tokenizer,
        bert_token_indexers={"bert": bert_token_indexer})
    return reader
def _get_data_reader(self):
    # Unlike the variants above, the word segmenter here wraps the BERT
    # tokenizer itself, and the wordpieces are indexed under the "tokens"
    # namespace rather than a separate "bert" namespace.
    bert_tokenizer = self._get_word_segmenter()
    bert_token_indexer = WordpieceIndexer(
        vocab=bert_tokenizer.bert_tokenizer.vocab,
        wordpiece_tokenizer=bert_tokenizer.bert_tokenizer.wordpiece_tokenizer.tokenize,
        namespace="tokens",
        use_starting_offsets=False,
        max_pieces=self.max_len,
        do_lowercase=True,
        never_lowercase=None,
        start_tokens=None,
        end_tokens=None,
        separator_token="[SEP]",
        truncate_long_sequences=True)
    position_indexer = SingleIdTokenIndexer(namespace='position')
    reader = atsa_data_reader.TextAspectInSentimentOutForSyntaxAwareBert(
        self.distinct_polarities,
        tokenizer=bert_tokenizer,  # reuse the segmenter built above
        token_indexers={"tokens": bert_token_indexer},
        position_indexers={'position': position_indexer},
        configuration=self.configuration)
    return reader
def _load_data(self):
    data_filepath = self.base_data_dir + 'data'
    if os.path.exists(data_filepath):
        # Reuse the cached, already-indexed datasets.
        self.train_data, self.dev_data, self.test_data, self.distinct_categories, \
            self.distinct_polarities, self.hard_test_data = \
            super()._load_object(data_filepath)
    else:
        train_dev_test_data, distinct_categories, distinct_polarities = \
            self.dataset.generate_acd_and_sc_data()
        train_dev_test_data['hard_test'] = None
        self.hard_test_data = None  # ensure the attribute exists even without a hard test set
        if self.hard_dataset:
            train_dev_test_data_hard, _, _ = \
                self.hard_dataset.generate_acd_and_sc_data()
            train_dev_test_data['hard_test'] = train_dev_test_data_hard['test']
        # Drop 'conflict' from the polarity label set.
        distinct_polarities_new = [
            polarity for polarity in distinct_polarities
            if polarity != 'conflict'
        ]
        self.distinct_categories = distinct_categories
        self.distinct_polarities = distinct_polarities_new

        token_indexer = SingleIdTokenIndexer(namespace="tokens")
        bert_tokenizer = BertTokenizer.from_pretrained(
            self.bert_vocab_file_path, do_lower_case=True)
        bert_token_indexer = WordpieceIndexer(
            vocab=bert_tokenizer.vocab,
            wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
            namespace="bert",
            use_starting_offsets=False,
            max_pieces=self.max_len,
            do_lowercase=True,
            never_lowercase=None,
            start_tokens=None,
            end_tokens=None,
            separator_token="[SEP]",
            truncate_long_sequences=True)
        position_indexer = SingleIdTokenIndexer(namespace='position')
        reader = acd_and_sc_data_reader.AcdAndScDatasetReaderMilSimultaneouslyBertSingle(
            self.distinct_categories,
            self.distinct_polarities,
            tokenizer=self._get_word_segmenter(),
            token_indexers={"tokens": token_indexer},
            position_indexers={'position': position_indexer},
            configuration=self.configuration,
            bert_tokenizer=bert_tokenizer,
            bert_token_indexers={"bert": bert_token_indexer})
        self.data_reader = reader

        # Convert (aspect, polarity) string labels to index pairs; 'conflict'
        # polarities are kept but marked with -100 so they can be ignored by
        # the loss later on.
        train_dev_test_data_label_indexed = {}
        for data_type, data in train_dev_test_data.items():
            if data is None:
                continue
            data_new = []
            for sample in data:
                sample_new = [sample[0]]
                labels_new = []
                for label in sample[1]:
                    aspect = label[0]
                    polarity = label[1]
                    aspect_index = distinct_categories.index(aspect)
                    if polarity == 'conflict':
                        polarity_index = -100
                    else:
                        polarity_index = distinct_polarities_new.index(polarity)
                    labels_new.append((aspect_index, polarity_index))
                if len(labels_new) != 0:
                    sample_new.append(labels_new)
                data_new.append(sample_new)
            train_dev_test_data_label_indexed[data_type] = data_new

        self.train_data = reader.read(train_dev_test_data_label_indexed['train'])
        self.dev_data = reader.read(train_dev_test_data_label_indexed['dev'])
        self.test_data = reader.read(train_dev_test_data_label_indexed['test'])
        if self.hard_dataset:
            self.hard_test_data = reader.read(
                train_dev_test_data_label_indexed['hard_test'])
        # Cache everything so later runs take the fast path above.
        data = [
            self.train_data, self.dev_data, self.test_data,
            self.distinct_categories, self.distinct_polarities,
            self.hard_test_data
        ]
        super()._save_object(data_filepath, data)
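# Why -100: torch.nn.functional.cross_entropy ignores any target equal to its
# ignore_index, which defaults to -100, so 'conflict' labels contribute no
# loss or gradient -- assuming the downstream model uses a cross-entropy loss
# with that default. A self-contained illustration, not project code:
import torch
import torch.nn.functional as F


def _demo_ignore_index():
    logits = torch.tensor([[2.0, 0.5],     # aspect with a real polarity label
                           [0.1, 1.5]])    # aspect labeled 'conflict'
    targets = torch.tensor([0, -100])      # -100 marks the masked label
    loss = F.cross_entropy(logits, targets)
    # Equal to the loss on the first row alone; the -100 row is skipped.
    print(loss.item())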