# Imports assumed by the snippets below (AllenNLP 0.x token indexers and the
# pytorch-pretrained-bert tokenizer; adjust to the repo's actual dependencies):
import os

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.token_indexers.wordpiece_indexer import WordpieceIndexer
from pytorch_pretrained_bert import BertTokenizer


def _get_data_reader(self):
     token_indexer = SingleIdTokenIndexer(namespace="tokens")
     bert_tokenizer = BertTokenizer.from_pretrained(
         self.bert_vocab_file_path, do_lower_case=True)
     bert_token_indexer = WordpieceIndexer(
         vocab=bert_tokenizer.vocab,
         wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
         namespace="bert",
         use_starting_offsets=False,
         max_pieces=self.max_len,
         do_lowercase=True,
         never_lowercase=None,
         start_tokens=None,
         end_tokens=None,
         separator_token="[SEP]",
         truncate_long_sequences=True)
     position_indexer = SingleIdTokenIndexer(namespace='position')
     sentence_constituency_indexer = SingleIdTokenIndexer(
         namespace='sentence_constituency')
     reader = acd_and_sc_data_reader.AcdAndScDatasetReaderConstituencyBertSingle(
         self.distinct_categories,
         self.distinct_polarities,
         tokenizer=self._get_word_segmenter(),
         token_indexers={"tokens": token_indexer},
         position_indexers={'position': position_indexer},
         configuration=self.configuration,
         # bert_tokenizer=self._get_bert_word_segmenter(),
         bert_tokenizer=bert_tokenizer,
         bert_token_indexers={"bert": bert_token_indexer},
         sentence_constituency_indexer={
             'sentence_constituency': sentence_constituency_indexer
         })
     return reader
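
# --- Usage sketch (not from the repo): how a WordpieceIndexer configured as
# above turns pre-tokenized text into wordpiece ids. Assumes AllenNLP 0.x,
# where tokens_to_indices returns the ids plus per-token offsets and a mask.
from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token

tokens = [Token(w) for w in ["the", "food", "was", "great"]]
indexed = bert_token_indexer.tokens_to_indices(tokens, Vocabulary(), "bert")
# indexed["bert"]         -> wordpiece ids for the whole sequence
# indexed["bert-offsets"] -> last-piece index per original token
#                            (because use_starting_offsets=False above)
# indexed["mask"]         -> mask over the original tokens
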
 def _get_data_reader(self):
     token_indexer = SingleIdTokenIndexer(namespace="tokens")
     position_indexer = SingleIdTokenIndexer(namespace='position')
     bert_tokenizer = BertTokenizer.from_pretrained(self.bert_vocab_file_path, do_lower_case=True)
     bert_token_indexer = WordpieceIndexer(vocab=bert_tokenizer.vocab,
                                           wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
                                           namespace="bert",
                                           use_starting_offsets=False,
                                           max_pieces=self.max_len,
                                           do_lowercase=True,
                                           never_lowercase=None,
                                           start_tokens=None,
                                           end_tokens=None,
                                           separator_token="[SEP]",
                                           truncate_long_sequences=True)
     reader = relation_classification_data_reader.MultiRelationClassificationBertDatasetReader(
         self.distinct_polarities,
         tokenizer=self._get_word_segmenter(),
         token_indexers={"tokens": token_indexer},
         position_indexers={'position': position_indexer},
         configuration=self.configuration,
         bert_tokenizer=bert_tokenizer,
         bert_token_indexers={"bert": bert_token_indexer}
     )
     return reader
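
# --- Usage sketch (not from the repo): the wordpiece_tokenizer callable
# passed to WordpieceIndexer above splits a single word into sub-word pieces.
tokenizer = BertTokenizer.from_pretrained('bert_vocab.txt',  # hypothetical path
                                          do_lower_case=True)
pieces = tokenizer.wordpiece_tokenizer.tokenize("unaffable")
# e.g. ['un', '##aff', '##able'] with an uncased English BERT vocab
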
 def _get_data_reader(self):
     bert_tokenizer = BertTokenizer.from_pretrained(
         self.bert_vocab_file_path, do_lower_case=True)
     bert_token_indexer = WordpieceIndexer(
         vocab=bert_tokenizer.vocab,
         wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
         namespace="bert",
         use_starting_offsets=False,
         max_pieces=self.max_len,
         do_lowercase=True,
         never_lowercase=None,
         start_tokens=None,
         end_tokens=None,
         separator_token="[SEP]",
         truncate_long_sequences=True)
     if self.configuration['consider_target']:
         token_indexer = SingleIdTokenIndexer(namespace="tokens")
         position_indexer = SingleIdTokenIndexer(namespace='position')
         reader = tosc_data_reader.TextAspectInSentimentOutConsideringTargetBertForTOSC(
             self.distinct_polarities,
             tokenizer=self._get_word_segmenter(),
             token_indexers={"tokens": token_indexer},
             position_indexers={'position': position_indexer},
             configuration=self.configuration,
             bert_tokenizer=bert_tokenizer,
             bert_token_indexers={"bert": bert_token_indexer})
     else:
         token_indexer = SingleIdTokenIndexer(namespace="tokens")
         position_indexer = SingleIdTokenIndexer(namespace='position')
         reader = tosc_data_reader.TextAspectInSentimentOutBertForTOSC(
             self.distinct_polarities,
             tokenizer=self._get_word_segmenter(),
             token_indexers={"tokens": token_indexer},
             position_indexers={'position': position_indexer},
             configuration=self.configuration,
             bert_tokenizer=bert_tokenizer,
             bert_token_indexers={"bert": bert_token_indexer})
     return reader
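
# --- Note (sketch, not from the repo): the branch above is driven by one
# configuration flag; a hypothetical configuration selecting between the two
# readers would look like this.
configuration = {'consider_target': True}
# True  -> TextAspectInSentimentOutConsideringTargetBertForTOSC
# False -> TextAspectInSentimentOutBertForTOSC
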
    def _get_data_reader(self):
        # The word segmenter here appears to wrap a BertTokenizer, so the
        # vocab and the wordpiece tokenizer are reached through its
        # .bert_tokenizer attribute rather than directly.
        bert_tokenizer = self._get_word_segmenter()
        bert_token_indexer = WordpieceIndexer(vocab=bert_tokenizer.bert_tokenizer.vocab,
                                              wordpiece_tokenizer=bert_tokenizer.bert_tokenizer.wordpiece_tokenizer.tokenize,
                                              namespace="tokens",
                                              use_starting_offsets=False,
                                              max_pieces=self.max_len,
                                              do_lowercase=True,
                                              never_lowercase=None,
                                              start_tokens=None,
                                              end_tokens=None,
                                              separator_token="[SEP]",
                                              truncate_long_sequences=True)
        position_indexer = SingleIdTokenIndexer(namespace='position')
        reader = atsa_data_reader.TextAspectInSentimentOutForSyntaxAwareBert(
            self.distinct_polarities,
            tokenizer=self._get_word_segmenter(),
            token_indexers={"tokens": bert_token_indexer},
            position_indexers={'position': position_indexer},
            configuration=self.configuration
        )

        return reader
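
# --- Sketch (hypothetical, inferred from the attribute access above): the
# word segmenter seems to wrap a BertTokenizer and expose it as
# .bert_tokenizer, which is why vocab and wordpiece_tokenizer are reached
# via bert_tokenizer.bert_tokenizer.
class BertWordSegmenter:  # hypothetical name, not the repo's class
    def __init__(self, vocab_file_path: str):
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            vocab_file_path, do_lower_case=True)
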
    def _load_data(self):
        data_filepath = self.base_data_dir + 'data'
        if os.path.exists(data_filepath):
            self.train_data, self.dev_data, self.test_data, self.distinct_categories, self.distinct_polarities, \
                self.hard_test_data = super()._load_object(data_filepath)
        else:
            train_dev_test_data, distinct_categories, distinct_polarities = \
                self.dataset.generate_acd_and_sc_data()

            train_dev_test_data['hard_test'] = None
            if self.hard_dataset:
                train_dev_test_data_hard, _, _ = \
                    self.hard_dataset.generate_acd_and_sc_data()
                train_dev_test_data['hard_test'] = \
                    train_dev_test_data_hard['test']

            # Drop the 'conflict' polarity; conflict labels are indexed as
            # -100 further below.
            distinct_polarities_new = []
            for polarity in distinct_polarities:
                if polarity != 'conflict':
                    distinct_polarities_new.append(polarity)
            self.distinct_categories = distinct_categories
            self.distinct_polarities = distinct_polarities_new

            token_indexer = SingleIdTokenIndexer(namespace="tokens")
            bert_tokenizer = BertTokenizer.from_pretrained(
                self.bert_vocab_file_path, do_lower_case=True)
            bert_token_indexer = WordpieceIndexer(
                vocab=bert_tokenizer.vocab,
                wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
                namespace="bert",
                use_starting_offsets=False,
                max_pieces=self.max_len,
                do_lowercase=True,
                never_lowercase=None,
                start_tokens=None,
                end_tokens=None,
                separator_token="[SEP]",
                truncate_long_sequences=True)
            position_indexer = SingleIdTokenIndexer(namespace='position')
            reader = acd_and_sc_data_reader.AcdAndScDatasetReaderMilSimultaneouslyBertSingle(
                self.distinct_categories,
                self.distinct_polarities,
                tokenizer=self._get_word_segmenter(),
                token_indexers={"tokens": token_indexer},
                position_indexers={'position': position_indexer},
                configuration=self.configuration,
                # bert_tokenizer=self._get_bert_word_segmenter(),
                bert_tokenizer=bert_tokenizer,
                bert_token_indexers={"bert": bert_token_indexer})
            self.data_reader = reader

            train_dev_test_data_label_indexed = {}
            for data_type, data in train_dev_test_data.items():
                if data is None:
                    continue
                data_new = []
                for sample in data:
                    sample_new = [sample[0]]
                    labels_new = []
                    for label in sample[1]:
                        aspect = label[0]
                        polarity = label[1]
                        aspect_index = distinct_categories.index(aspect)
                        if polarity == 'conflict':
                            # -100 matches PyTorch's default ignore_index, so
                            # conflict labels can be skipped by the loss.
                            polarity_index = -100
                        else:
                            polarity_index = distinct_polarities_new.index(
                                polarity)
                        labels_new.append((aspect_index, polarity_index))
                    if len(labels_new) != 0:
                        sample_new.append(labels_new)
                        data_new.append(sample_new)
                train_dev_test_data_label_indexed[data_type] = data_new
            self.train_data = reader.read(
                train_dev_test_data_label_indexed['train'])
            self.dev_data = reader.read(
                train_dev_test_data_label_indexed['dev'])
            self.test_data = reader.read(
                train_dev_test_data_label_indexed['test'])
            if self.hard_dataset:
                self.hard_test_data = reader.read(
                    train_dev_test_data_label_indexed['hard_test'])
            else:
                # Guard: make sure the attribute exists even when no hard
                # dataset is configured, so the save below cannot fail on it.
                self.hard_test_data = None
            data = [
                self.train_data, self.dev_data, self.test_data,
                self.distinct_categories, self.distinct_polarities,
                self.hard_test_data
            ]
            super()._save_object(data_filepath, data)
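
# --- Illustrative sketch (hypothetical sample, mirroring the label-indexing
# loop above): each sample pairs a text with (aspect, polarity) labels, which
# are converted to integer indices before reader.read() is called.
distinct_categories = ['food', 'service']
distinct_polarities_new = ['negative', 'neutral', 'positive']
sample = ('the food was great', [('food', 'positive')])
indexed_sample = ['the food was great', [(0, 2)]]  # (aspect_index, polarity_index)
# A ('food', 'conflict') label would become (0, -100) instead.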