class QueryCreator(MultiPackProcessor):
    r"""This processor is used to search for relevant documents for a query."""

    # pylint: disable=useless-super-delegation
    def __init__(self) -> None:
        super().__init__()

    def initialize(self, resources: Resources, configs: HParams):
        self.resource = resources
        vocab_file = configs.vocab_file
        self.tokenizer = BERTTokenizer.load(vocab_file)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        self.encoder.to(self.device)

    @torch.no_grad()
    def get_embeddings(self, input_ids, segment_ids):
        return self.encoder(inputs=input_ids, segment_ids=segment_ids)

    def _process(self, input_pack: MultiPack):
        input_ids = []
        segment_ids = []

        query_pack = input_pack.get_pack("pack")
        context = [query_pack.text]

        # Use the conversation context to build the query.
        if "user_utterance" in input_pack.pack_names:
            user_pack = input_pack.get_pack("user_utterance")
            context.append(user_pack.text)

        if "bot_utterance" in input_pack.pack_names:
            bot_pack = input_pack.get_pack("bot_utterance")
            context.append(bot_pack.text)

        for text in context:
            t = self.tokenizer.encode_text(text)
            input_ids.append(t[0])
            segment_ids.append(t[1])

        input_ids = torch.LongTensor(input_ids).to(self.device)
        segment_ids = torch.LongTensor(segment_ids).to(self.device)
        _, query_vector = self.get_embeddings(input_ids, segment_ids)
        query_vector = torch.mean(query_vector, dim=0, keepdim=True)
        query_vector = query_vector.cpu().numpy()
        query = Query(pack=query_pack, value=query_vector)
        query_pack.add_or_get_entry(query)

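# A minimal standalone sketch of the encode-and-pool pattern QueryCreator
# relies on, assuming texar-pytorch's BERTTokenizer and BERTEncoder.
# encode_text pads every sequence to max_seq_length, which is why the
# per-utterance id lists above can be stacked into one LongTensor.
import torch
from texar.torch.data import BERTTokenizer
from texar.torch.modules import BERTEncoder

tokenizer = BERTTokenizer(pretrained_model_name="bert-base-uncased")
encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")

input_ids, segment_ids, _ = tokenizer.encode_text(
    text_a="Where can I find a good restaurant?", max_seq_length=128)
input_ids = torch.LongTensor(input_ids).unsqueeze(0)
segment_ids = torch.LongTensor(segment_ids).unsqueeze(0)

with torch.no_grad():
    # BERTEncoder returns (sequence_output, pooled_output).
    _, pooled = encoder(inputs=input_ids, segment_ids=segment_ids)
print(pooled.shape)  # torch.Size([1, 768])
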
def _load_rep_embedding():
    device = 'cuda:1'
    max_rep_length = 28
    train_original_rep, validation_original_rep, test_original_rep = \
        PreprocessTool._load_rep()
    train_original_rep_embed_dict = dict()
    validation_original_rep_embed_dict = dict()
    test_original_rep_embed_dict = dict()

    tokenizer = tx.data.BERTTokenizer(
        pretrained_model_name='bert-base-uncased')
    bert_encoder = BERTEncoder(
        pretrained_model_name='bert-base-uncased').to(device)

    for mode in ['train', 'validation', 'test']:
        if mode == 'train':
            original_rep = train_original_rep
        elif mode == 'validation':
            original_rep = validation_original_rep
        elif mode == 'test':
            original_rep = test_original_rep

        original_rep_embed_dict = dict()
        for _, item in tqdm(
                enumerate(original_rep),
                desc="loading {}'s rep embedding".format(mode)):
            item_compat = tx.utils.compat_as_text(item)
            input_ids, segment_ids, input_mask = tokenizer.encode_text(
                text_a=item_compat, max_seq_length=max_rep_length)
            input_ids = torch.Tensor(input_ids).to(device).unsqueeze(0).long()
            segment_ids = torch.Tensor(segment_ids).to(device) \
                .unsqueeze(0).long()
            input_mask = torch.Tensor(input_mask).to(device) \
                .unsqueeze(0).long()
            # Count non-padding tokens (padding id is 0).
            input_length = (1 - (input_ids == 0).int()).sum(dim=1)
            # Skip the leading [CLS] token when encoding.
            _, item_bert_embs = bert_encoder(
                inputs=input_ids[:, 1:],
                sequence_length=input_length - 1,
            )
            original_rep_embed_dict[item] = np.array(
                item_bert_embs.squeeze(0).data.cpu()).tolist()

        if mode == 'train':
            train_original_rep_embed_dict = original_rep_embed_dict
            save_file = '../tools/train_original_rep_embed.txt'
        elif mode == 'validation':
            validation_original_rep_embed_dict = original_rep_embed_dict
            save_file = '../tools/validation_original_rep_embed.txt'
        elif mode == 'test':
            test_original_rep_embed_dict = original_rep_embed_dict
            save_file = '../tools/test_original_rep_embed.txt'

        PreprocessTool._save_rep_embed(original_rep_embed_dict, save_file)
        PreprocessTool.print_save_file(save_file)

    return (train_original_rep_embed_dict,
            validation_original_rep_embed_dict,
            test_original_rep_embed_dict)

def __init__(self, args, model_config, data_config, embedding_init_value,
             device):
    super().__init__()
    self.config_model = model_config
    self.config_data = data_config

    self.vocab = tx.data.Vocab(self.config_data.vocab_file)

    self.bert_encoder = BERTEncoder(hparams=self.config_model.bert_encoder)

    self.linear0_1 = MLPTransformConnector(
        linear_layer_dim=300 * (16 + 2), output_size=300)
    self.linear0_2 = MLPTransformConnector(
        linear_layer_dim=300 * (16 + 2), output_size=300)
    self.linear0_3 = MLPTransformConnector(
        linear_layer_dim=300 * (16 + 2), output_size=300)
    self.linear1 = MLPTransformConnector(
        linear_layer_dim=300, output_size=512)
    self.linear2_1 = MLPTransformConnector(
        linear_layer_dim=768, output_size=512)
    self.linear2_2 = MLPTransformConnector(
        linear_layer_dim=768, output_size=512)
    self.linear3 = MLPTransformConnector(
        linear_layer_dim=1024, output_size=512)
    self.linear4_1 = MLPTransformConnector(
        linear_layer_dim=512, output_size=128)
    self.linear4_2 = MLPTransformConnector(
        linear_layer_dim=512, output_size=128)
    self.linear5 = MLPTransformConnector(
        linear_layer_dim=128, output_size=1)

    self.word_embedder = WordEmbedder(
        vocab_size=self.vocab.size,
        init_value=embedding_init_value(1).word_vecs,
        hparams=self.config_model.word_embedder_300)

    self.gat_1 = GATLayer(in_features=self.config_model.dim_c_300,
                          out_features=self.config_model.dim_c_300,
                          alpha=0.2, nheads=4, activation=False,
                          device=device)
    self.gat_2 = GATLayer(in_features=self.config_model.dim_c_300,
                          out_features=self.config_model.dim_c_300,
                          alpha=0.2, nheads=4, activation=False,
                          device=device)
    self.gat_3 = GATLayer(in_features=self.config_model.dim_c_300,
                          out_features=self.config_model.dim_c_300,
                          alpha=0.2, nheads=4, activation=False,
                          device=device)

    self.hinge = torch.nn.MarginRankingLoss(reduction='none', margin=0.1)

class BertBasedQueryCreator(QueryProcessor):
    r"""This processor searches for relevant documents for a query."""

    # pylint: disable=useless-super-delegation
    def __init__(self) -> None:
        super().__init__()

    def initialize(self, resources: Resources, configs: Config):
        self.resource = resources
        self.config = configs
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        if "name" in self.config.tokenizer:
            self.tokenizer = BERTTokenizer(
                pretrained_model_name=self.config.tokenizer.name)

        if "name" in self.config.model:
            self.encoder = BERTEncoder(
                pretrained_model_name=self.config.model.name)
        else:
            # Build an uninitialized encoder and load fine-tuned weights
            # from a pickled state dict.
            self.encoder = BERTEncoder(
                pretrained_model_name=None,
                hparams={"pretrained_model_name": None})
            with open(self.config.model.path, "rb") as f:
                state_dict = pickle.load(f)
            self.encoder.load_state_dict(state_dict["bert"])

        self.encoder.to(self.device)

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        config = super().default_configs()
        config.update({
            "model": {
                "path": None,
                "name": "bert-base-uncased",
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        })
        return config

    @torch.no_grad()
    def get_embeddings(self, inputs, sequence_length, segment_ids):
        output, _ = self.encoder(inputs=inputs,
                                 sequence_length=sequence_length,
                                 segment_ids=segment_ids)
        # Use the hidden state of the [CLS] token as the sentence embedding.
        cls_token = output[:, 0, :]
        return cls_token

    def _build_query(self, text: str) -> np.ndarray:
        input_ids, segment_ids, input_mask = self.tokenizer.encode_text(
            text_a=text, max_seq_length=self.config.max_seq_length)
        input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(self.device)
        segment_ids = torch.LongTensor(segment_ids).unsqueeze(0) \
            .to(self.device)
        input_mask = torch.LongTensor(input_mask).unsqueeze(0).to(self.device)
        # Sequence length is the number of non-padding tokens.
        sequence_length = (~(input_mask == 0)).sum(dim=1)
        query_vector = self.get_embeddings(inputs=input_ids,
                                           sequence_length=sequence_length,
                                           segment_ids=segment_ids)
        query_vector = torch.mean(query_vector, dim=0, keepdim=True)
        query_vector = query_vector.cpu().numpy()
        return query_vector

    def _process_query(self, input_pack: MultiPack) \
            -> Tuple[DataPack, Dict[str, Any]]:
        query_pack: DataPack = input_pack.get_pack(self.config.query_pack_name)
        context = [query_pack.text]

        # Use the conversation context to build the query.
        if "user_utterance" in input_pack.pack_names:
            user_pack = input_pack.get_pack("user_utterance")
            context.append(user_pack.text)

        if "bot_utterance" in input_pack.pack_names:
            bot_pack = input_pack.get_pack("bot_utterance")
            context.append(bot_pack.text)

        text = ' '.join(context)
        query_vector = self._build_query(text=text)
        return query_pack, query_vector

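# Hedged usage sketch for BertBasedQueryCreator in a Forte pipeline; the
# reader and config values here are assumptions, not part of the original.
# With the default config the encoder loads "bert-base-uncased"; to use a
# fine-tuned checkpoint instead, omit "name" from the "model" section and
# point "path" at a pickled state dict, as initialize() above expects.
pipeline = Pipeline(resource=Resources())
pipeline.set_reader(MultiPackTerminalReader())
pipeline.add_processor(
    processor=BertBasedQueryCreator(),
    config={"query_pack_name": "query", "max_seq_length": 128})
pipeline.initialize()
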
processor_class = get_processor_class("IMDB")
imdb_processor = processor_class("data/IMDB")
train_examples = imdb_processor.get_train_examples()
dev_examples = imdb_processor.get_dev_examples()
reviews = [
    example.text_a
    for dataset in [train_examples, dev_examples]
    for example in dataset
]

# Create a BERT tokenizer.
vocab_file = "data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt"
tokenizer = BERTTokenizer.load(vocab_file)

# BERT encoder.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
encoder.to(device)

print("Encoding the text using the BERT tokenizer...")
max_seq_length = 128  # assumed; not defined in the original snippet
feature_original_types = {
    "id": ["int64", "FixedLenFeature"],
    "input_ids": ["int64", "FixedLenFeature", max_seq_length],
    "segment_ids": ["int64", "FixedLenFeature", max_seq_length],
    "text": ["str", "FixedLenFeature"]
}

with RecordData.writer("data/imdb.pkl", feature_original_types) as writer:
    for idx, review in enumerate(reviews):
        # Truncate the raw text (by characters) before tokenizing.
        review = review[:tokenizer.max_len]
        input_ids, segment_ids, _ = tokenizer.encode_text(text_a=review)
        feature = {
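            # The original snippet is truncated here; a plausible completion,
            # with field values inferred from feature_original_types above
            # (an assumption, not verbatim source).
            "id": idx,
            "input_ids": input_ids,
            "segment_ids": segment_ids,
            "text": review,
        }
        writer.write(feature)
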
def main():
    config = yaml.safe_load(open("config.yml", "r"))
    config = HParams(config, default_hparams=None)
    # `device` is not defined in the original snippet; assumed here.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if not os.path.exists(config.indexer.model_dir):
        print("Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print("Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)

        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx + 1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #     processor=CoNLLNERPredictor(), config=config.NER,
    #     selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(),
        config=config.back_translator)
    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():
        # Update the resource so it can be used in the next conversation turn.
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")
        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")

        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")
            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

        input(colored("Press ENTER to continue...\n", 'green'))
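
# main() above calls get_embeddings(), which this snippet never defines.
# A minimal sketch consistent with how it is used (an assumption, not the
# original implementation):
@torch.no_grad()
def get_embeddings(encoder, input_ids, segment_ids):
    # BERTEncoder returns (sequence_output, pooled_output); the caller
    # unpacks the pair and keeps the pooled output for indexing.
    return encoder(inputs=input_ids, segment_ids=segment_ids)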