Example #1
    def initialize(self, resources: Resources, configs: Config):
        self.resource = resources
        self.config = configs

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        if "name" in self.config.tokenizer:
            self.tokenizer = BERTTokenizer(
                pretrained_model_name=self.config.tokenizer.name)

        if "name" in self.config.model:
            self.encoder = BERTEncoder(
                pretrained_model_name=self.config.model.name)

        else:
            self.encoder = BERTEncoder(
                pretrained_model_name=None,
                hparams={"pretrained_model_name": None},
            )
            with open(self.config.model.path, "rb") as f:
                state_dict = pickle.load(f)
            self.encoder.load_state_dict(state_dict["bert"])

        self.encoder.to(self.device)
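
The `configs` consumed above mirror the `default_configs()` shown in
Example #6. A minimal sketch of the two config shapes this `initialize`
accepts (the pickle path below is a hypothetical placeholder):

# Load a pretrained encoder and tokenizer by name:
config_pretrained = {
    "tokenizer": {"name": "bert-base-uncased"},
    "model": {"name": "bert-base-uncased"},
}

# Or restore fine-tuned weights from a pickled state dict; omitting
# "name" routes initialize() into the else branch above:
config_finetuned = {
    "tokenizer": {"name": "bert-base-uncased"},
    "model": {"path": "models/bert_finetuned.pkl"},
}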
Example #2
    def initialize(self, resources: Resources, configs: HParams):
        self.resource = resources
        vocab_file = configs.vocab_file
        self.tokenizer = BERTTokenizer.load(vocab_file)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        self.encoder.to(self.device)
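
A minimal standalone sketch of the tokenize-then-encode round trip these
snippets rely on, written against the texar-pytorch constructors used in
Examples #4 and #6 (the sample sentence and max_seq_length are arbitrary):

import torch
import texar.torch as tx

tokenizer = tx.data.BERTTokenizer(pretrained_model_name="bert-base-uncased")
encoder = tx.modules.BERTEncoder(pretrained_model_name="bert-base-uncased")

# encode_text returns (input_ids, segment_ids, input_mask) as Python lists
input_ids, segment_ids, input_mask = tokenizer.encode_text(
    text_a="Where can I find a good pizza place?", max_seq_length=16)

# add a batch dimension; sequence_length counts the non-padding tokens
input_ids = torch.LongTensor([input_ids])
segment_ids = torch.LongTensor([segment_ids])
sequence_length = torch.LongTensor([sum(input_mask)])

with torch.no_grad():
    # output: per-token hidden states; pooled: the [CLS]-based summary vector
    output, pooled = encoder(inputs=input_ids,
                             sequence_length=sequence_length,
                             segment_ids=segment_ids)
print(output.shape, pooled.shape)  # (1, 16, 768) and (1, 768)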
Example #3
class QueryCreator(MultiPackProcessor):
    r"""This processor is used to search for relevant documents for a query
    """

    # pylint: disable=useless-super-delegation
    def __init__(self) -> None:
        super().__init__()

    def initialize(self, resources: Resources, configs: HParams):
        self.resource = resources
        vocab_file = configs.vocab_file
        self.tokenizer = BERTTokenizer.load(vocab_file)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        self.encoder.to(self.device)

    @torch.no_grad()
    def get_embeddings(self, input_ids, segment_ids):
        return self.encoder(inputs=input_ids, segment_ids=segment_ids)

    def _process(self, input_pack: MultiPack):
        input_ids = []
        segment_ids = []

        query_pack = input_pack.get_pack("pack")
        context = [query_pack.text]

        # use context to build the query
        if "user_utterance" in input_pack.pack_names:
            user_pack = input_pack.get_pack("user_utterance")
            context.append(user_pack.text)

        if "bot_utterance" in input_pack.pack_names:
            bot_pack = input_pack.get_pack("bot_utterance")
            context.append(bot_pack.text)

        for text in context:
            t = self.tokenizer.encode_text(text)
            input_ids.append(t[0])
            segment_ids.append(t[1])

        # stacking assumes every encoded context has the same padded length
        input_ids = torch.LongTensor(input_ids).to(self.device)
        segment_ids = torch.LongTensor(segment_ids).to(self.device)
        _, query_vector = self.get_embeddings(input_ids, segment_ids)
        query_vector = torch.mean(query_vector, dim=0, keepdim=True)
        query_vector = query_vector.cpu().numpy()
        query = Query(pack=query_pack, value=query_vector)
        query_pack.add_or_get_entry(query)
Example #4
    @staticmethod
    def _load_rep_embedding():
        device = 'cuda:1'
        max_rep_length = 28
        train_original_rep, validation_original_rep, test_original_rep = \
            PreprocessTool._load_rep()
        train_original_rep_embed_dict = dict()
        validation_original_rep_embed_dict = dict()
        test_original_rep_embed_dict = dict()

        tokenizer = tx.data.BERTTokenizer(
            pretrained_model_name='bert-base-uncased')
        bert_encoder = BERTEncoder(
            pretrained_model_name='bert-base-uncased').to(device)

        for mode in ['train', 'validation', 'test']:
            if mode == 'train':
                original_rep = train_original_rep
            elif mode == 'validation':
                original_rep = validation_original_rep
            elif mode == 'test':
                original_rep = test_original_rep
            original_rep_embed_dict = dict()

            for _, item in tqdm(
                    enumerate(original_rep),
                    desc="loading {}'s rep embedding".format(mode)):
                item_compat = tx.utils.compat_as_text(item)
                input_ids, segment_ids, input_mask = tokenizer.encode_text(
                    text_a=item_compat, max_seq_length=max_rep_length)
                input_ids = torch.LongTensor([input_ids]).to(device)
                segment_ids = torch.LongTensor([segment_ids]).to(device)
                input_mask = torch.LongTensor([input_mask]).to(device)

                # count the non-padding tokens (the pad id is 0)
                input_length = (1 - (input_ids == 0).int()).sum(dim=1)

                # the leading [CLS] token is dropped before encoding, and
                # the sequence length is shortened to match
                _, item_bert_embs = bert_encoder(
                    inputs=input_ids[:, 1:],
                    sequence_length=input_length - 1,
                )

                original_rep_embed_dict[item] = \
                    item_bert_embs.squeeze(0).cpu().numpy().tolist()

            if mode == 'train':
                train_original_rep_embed_dict = original_rep_embed_dict
                save_file = '../tools/train_original_rep_embed.txt'
            elif mode == 'validation':
                validation_original_rep_embed_dict = original_rep_embed_dict
                save_file = '../tools/validation_original_rep_embed.txt'
            elif mode == 'test':
                test_original_rep_embed_dict = original_rep_embed_dict
                save_file = '../tools/test_original_rep_embed.txt'

            PreprocessTool._save_rep_embed(original_rep_embed_dict, save_file)
            PreprocessTool.print_save_file(save_file)

        return (train_original_rep_embed_dict,
                validation_original_rep_embed_dict,
                test_original_rep_embed_dict)
Example #5
    def __init__(self, args, model_config, data_config, embedding_init_value,
                 device):
        super().__init__()

        self.config_model = model_config
        self.config_data = data_config
        self.vocab = tx.data.Vocab(self.config_data.vocab_file)

        self.bert_encoder = BERTEncoder(hparams=self.config_model.bert_encoder)

        self.linear0_1 = MLPTransformConnector(linear_layer_dim=300 * (16 + 2),
                                               output_size=300)
        self.linear0_2 = MLPTransformConnector(linear_layer_dim=300 * (16 + 2),
                                               output_size=300)
        self.linear0_3 = MLPTransformConnector(linear_layer_dim=300 * (16 + 2),
                                               output_size=300)

        self.linear1 = MLPTransformConnector(linear_layer_dim=300,
                                             output_size=512)
        self.linear2_1 = MLPTransformConnector(linear_layer_dim=768,
                                               output_size=512)
        self.linear2_2 = MLPTransformConnector(linear_layer_dim=768,
                                               output_size=512)
        self.linear3 = MLPTransformConnector(linear_layer_dim=1024,
                                             output_size=512)
        self.linear4_1 = MLPTransformConnector(linear_layer_dim=512,
                                               output_size=128)
        self.linear4_2 = MLPTransformConnector(linear_layer_dim=512,
                                               output_size=128)
        self.linear5 = MLPTransformConnector(linear_layer_dim=128,
                                             output_size=1)

        self.word_embedder = WordEmbedder(
            vocab_size=self.vocab.size,
            init_value=embedding_init_value(1).word_vecs,
            hparams=self.config_model.word_embedder_300)

        self.gat_1 = GATLayer(in_features=self.config_model.dim_c_300,
                              out_features=self.config_model.dim_c_300,
                              alpha=0.2,
                              nheads=4,
                              activation=False,
                              device=device)
        self.gat_2 = GATLayer(in_features=self.config_model.dim_c_300,
                              out_features=self.config_model.dim_c_300,
                              alpha=0.2,
                              nheads=4,
                              activation=False,
                              device=device)
        self.gat_3 = GATLayer(in_features=self.config_model.dim_c_300,
                              out_features=self.config_model.dim_c_300,
                              alpha=0.2,
                              nheads=4,
                              activation=False,
                              device=device)

        self.hinge = torch.nn.MarginRankingLoss(reduction='none', margin=0.1)
Example #6
class BertBasedQueryCreator(QueryProcessor):
    r"""This processor searches relevant documents for a query"""

    # pylint: disable=useless-super-delegation
    def __init__(self) -> None:
        super().__init__()

    def initialize(self, resources: Resources, configs: Config):
        self.resource = resources
        self.config = configs

        self.device = torch.device("cuda" if torch.cuda.is_available()
                                   else "cpu")

        if "name" in self.config.tokenizer:
            self.tokenizer = \
                BERTTokenizer(pretrained_model_name=self.config.tokenizer.name)

        if "name" in self.config.model:
            self.encoder = BERTEncoder(
                pretrained_model_name=self.config.model.name)

        else:
            self.encoder = BERTEncoder(pretrained_model_name=None,
                                       hparams={"pretrained_model_name": None})
            with open(self.config.model.path, "rb") as f:
                state_dict = pickle.load(f)
            self.encoder.load_state_dict(state_dict["bert"])

        self.encoder.to(self.device)

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        config = super().default_configs()
        config.update({
            "model": {
                "path": None,
                "name": "bert-base-uncased",
            },
            "tokenizer": {
                "name": "bert-base-uncased"
            },
            "max_seq_length": 128,
            "query_pack_name": "query"
        })
        return config

    @torch.no_grad()
    def get_embeddings(self, inputs, sequence_length, segment_ids):
        output, _ = self.encoder(inputs=inputs,
                                 sequence_length=sequence_length,
                                 segment_ids=segment_ids)
        cls_token = output[:, 0, :]

        return cls_token

    def _build_query(self, text: str) -> np.ndarray:
        input_ids, segment_ids, input_mask = \
            self.tokenizer.encode_text(
                text_a=text, max_seq_length=self.config.max_seq_length)
        input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(self.device)
        segment_ids = torch.LongTensor(segment_ids).unsqueeze(0).to(self.device)
        input_mask = torch.LongTensor(input_mask).unsqueeze(0).to(self.device)
        # count the non-padding positions to get the true sequence length
        sequence_length = (~(input_mask == 0)).sum(dim=1)
        query_vector = self.get_embeddings(inputs=input_ids,
                                           sequence_length=sequence_length,
                                           segment_ids=segment_ids)
        query_vector = torch.mean(query_vector, dim=0, keepdim=True)
        query_vector = query_vector.cpu().numpy()
        return query_vector

    def _process_query(self, input_pack: MultiPack) \
            -> Tuple[DataPack, Dict[str, Any]]:
        query_pack: DataPack = input_pack.get_pack(self.config.query_pack_name)
        context = [query_pack.text]

        # use context to build the query
        if "user_utterance" in input_pack.pack_names:
            user_pack = input_pack.get_pack("user_utterance")
            context.append(user_pack.text)

        if "bot_utterance" in input_pack.pack_names:
            bot_pack = input_pack.get_pack("bot_utterance")
            context.append(bot_pack.text)

        text = ' '.join(context)

        query_vector = self._build_query(text=text)

        return query_pack, query_vector
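
A hedged sketch of wiring this processor into a Forte pipeline, following
the same calls used in Example #8 (the forte import paths are assumptions
about the Forte release these examples target):

from forte.common.resources import Resources
from forte.data.readers import MultiPackTerminalReader
from forte.pipeline import Pipeline

pipeline = Pipeline(resource=Resources())
pipeline.set_reader(MultiPackTerminalReader())
# with no config override, default_configs() above supplies the
# "bert-base-uncased" model/tokenizer and query_pack_name="query"
pipeline.add_processor(processor=BertBasedQueryCreator())
pipeline.initialize()

for m_pack in pipeline.process_dataset():
    query_pack = m_pack.get_pack("query")
    print(query_pack.text)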
Example #7
processor_class = get_processor_class("IMDB")
imdb_processor = processor_class("data/IMDB")
train_examples = imdb_processor.get_train_examples()
dev_examples = imdb_processor.get_dev_examples()
reviews = [
    example.text_a for dataset in [train_examples, dev_examples]
    for example in dataset
]

# create a BERT tokenizer
vocab_file = "data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt"
tokenizer = BERTTokenizer.load(vocab_file)

# BERT encoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
encoder.to(device)

print(f"Encoding the text using BERT Tokenizer...")
# `max_seq_length` is defined earlier in the script (not shown in this snippet)
feature_original_types = {
    "id": ["int64", "FixedLenFeature"],
    "input_ids": ["int64", "FixedLenFeature", max_seq_length],
    "segment_ids": ["int64", "FixedLenFeature", max_seq_length],
    "text": ["str", "FixedLenFeature"]
}

with RecordData.writer("data/imdb.pkl", feature_original_types) as writer:
    for idx, review in enumerate(reviews):
        review = review[:tokenizer.max_len]
        input_ids, segment_ids, _ = tokenizer.encode_text(text_a=review)
        # (assumed continuation) build one record per review, with fields
        # matching feature_original_types, then write it out
        feature = {
            "id": idx,
            "input_ids": input_ids,
            "segment_ids": segment_ids,
            "text": review,
        }
        writer.write(feature)
Example #8
def main():
    # select the device used by the encoder and the data iterator below
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    with open("config.yml", "r") as f:
        config = HParams(yaml.safe_load(f), default_hparams=None)

    if not os.path.exists(config.indexer.model_dir):
        print(f"Creating a new index...")
        encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        encoder.to(device)

        feature_original_types = {
            "id": ["int64", "FixedLenFeature"],
            "input_ids": ["int64", "FixedLenFeature",
                          config.indexer.max_seq_length],
            "segment_ids": ["int64", "FixedLenFeature",
                            config.indexer.max_seq_length],
            "text": ["str", "FixedLenFeature"]
        }

        hparam = {
            "allow_smaller_final_batch": True,
            "batch_size": config.indexer.batch_size,
            "dataset": {
                "data_name": "data",
                "feature_original_types": feature_original_types,
                "files": config.indexer.pickle_data_dir
            },
            "shuffle": False
        }

        print(f"Embedding the text using BERTEncoder...")
        record_data = RecordData(hparams=hparam, device=device)
        data_iterator = DataIterator(record_data)
        index = EmbeddingBasedIndexer(hparams={
            "index_type": "GpuIndexFlatIP",
            "dim": 768,
            "device": "gpu0"
        })

        for idx, batch in enumerate(data_iterator):
            ids = batch["id"]
            input_ids = batch["input_ids"]
            segment_ids = batch["segment_ids"]
            text = batch["text"]
            _, pooled_output = get_embeddings(encoder, input_ids, segment_ids)
            index.add(vectors=pooled_output,
                      meta_data={k.item(): v for k, v in zip(ids, text)})

            if (idx + 1) % 50 == 0:
                print(f"Completed {idx+1} batches of size "
                      f"{config.indexer.batch_size}")

        index.save(path=config.indexer.model_dir)

    resource = Resources()
    query_pipeline = Pipeline(resource=resource)
    query_pipeline.set_reader(MultiPackTerminalReader())

    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.translator)
    query_pipeline.add_processor(
        processor=QueryCreator(), config=config.query_creator)
    query_pipeline.add_processor(
        processor=SearchProcessor(), config=config.indexer)
    query_pipeline.add_processor(
        processor=NLTKSentenceSegmenter(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKWordTokenizer(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=NLTKPOSTagger(),
        selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=SRLPredictor(), config=config.SRL,
        selector=NameMatchSelector(select_name="doc_0"))
    # query_pipeline.add_processor(
    #    processor=CoNLLNERPredictor(), config=config.NER,
    #    selector=NameMatchSelector(select_name="doc_0"))
    query_pipeline.add_processor(
        processor=MachineTranslationProcessor(), config=config.back_translator)

    query_pipeline.initialize()

    for m_pack in query_pipeline.process_dataset():

        # update resource to be used in the next conversation
        query_pack = m_pack.get_pack("query")
        if resource.get("user_utterance"):
            resource.get("user_utterance").append(query_pack)
        else:
            resource.update(user_utterance=[query_pack])

        response_pack = m_pack.get_pack("response")

        if resource.get("bot_utterance"):
            resource.get("bot_utterance").append(response_pack)
        else:
            resource.update(bot_utterance=[response_pack])

        english_pack = m_pack.get_pack("pack")
        print(colored("English Translation of the query: ", "green"),
              english_pack.text, "\n")
        pack = m_pack.get_pack("doc_0")
        print(colored("Retrieved Document", "green"), pack.text, "\n")
        print(colored("German Translation", "green"),
              m_pack.get_pack("response").text, "\n")
        for sentence in pack.get(Sentence):
            sent_text = sentence.text
            print(colored("Sentence:", 'red'), sent_text, "\n")

            print(colored("Semantic role labels:", 'red'))
            for link in pack.get(PredicateLink, sentence):
                parent = link.get_parent()
                child = link.get_child()
                print(f"  - \"{child.text}\" is role {link.arg_type} of "
                      f"predicate \"{parent.text}\"")
            print()

            input(colored("Press ENTER to continue...\n", 'green'))