Example #1
    def rank(self, args, question, generated_queries):
        if len(generated_queries) == 0:
            return []
        # Load the trained similarity model and its vocabulary.
        checkpoint_filename = '%s.pt' % os.path.join(args.save, args.expname)
        dataset_vocab_file = os.path.join(args.data, 'dataset.vocab')
        vocab = Vocab(filename=dataset_vocab_file,
                      data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD])
        similarity = DASimilarity(args.mem_dim, args.hidden_dim, args.num_classes)
        model = SimilarityTreeLSTM(
            vocab.size(),
            args.input_dim,
            args.mem_dim,
            similarity,
            args.sparse)
        criterion = nn.KLDivLoss()
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd)

        # Copy the pre-trained word embeddings into the model, then restore the checkpoint weights.
        emb_file = os.path.join(args.data, 'dataset_embed.pth')
        if not os.path.isfile(emb_file):
            raise FileNotFoundError('Embedding file not found: %s' % emb_file)
        emb = torch.load(emb_file)
        model.emb.weight.data.copy_(emb)
        checkpoint = torch.load(checkpoint_filename, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['model'])
        trainer = Trainer(args, model, criterion, optimizer)

        # Prepare the dataset: pair the question with every candidate query.
        json_data = [{"id": "test", "question": question,
                      "generated_queries": [{"query": " .".join(query["where"]), "correct": False}
                                            for query in generated_queries]}]
        output_dir = "./output/tmp"
        preprocess_lcquad.save_split(output_dir, *preprocess_lcquad.split(json_data, self.parser))

        if question in self.dep_tree_cache:
            # The question was parsed before: skip the dependency parse and reuse the cached result.
            preprocess_lcquad.parse(output_dir, dep_parse=False)

            cache_item = self.dep_tree_cache[question]
            with open(os.path.join(output_dir, 'a.parents'), 'w') as f_parent, open(
                    os.path.join(output_dir, 'a.toks'), 'w') as f_token:
                for i in range(len(generated_queries)):
                    f_token.write(cache_item[0])
                    f_parent.write(cache_item[1])
        else:
            # First time seeing this question: parse it, then cache its tokens and parent pointers.
            preprocess_lcquad.parse(output_dir)
            with open(os.path.join(output_dir, 'a.parents')) as f:
                parents = f.readline()
            with open(os.path.join(output_dir, 'a.toks')) as f:
                tokens = f.readline()
            self.dep_tree_cache[question] = [tokens, parents]

            with open(self.dep_tree_cache_file_path, 'w') as f:
                ujson.dump(self.dep_tree_cache, f)

        test_dataset = QGDataset(output_dir, vocab, args.num_classes)

        test_loss, test_pred = trainer.test(test_dataset)
        return test_pred
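
This method relies on a self.dep_tree_cache dict and a self.dep_tree_cache_file_path attribute that the example does not show being set up. Below is a minimal sketch of how that cache might be initialized; the Ranker class name, constructor signature and cache path are assumptions for illustration, not taken from the example.

import os
import ujson

class Ranker:  # hypothetical class name; the examples only show its rank() method
    def __init__(self, parser, dep_tree_cache_file_path='./caches/dep_tree_cache.json'):
        self.parser = parser
        self.dep_tree_cache_file_path = dep_tree_cache_file_path
        # Reload previously parsed question dependency trees so that repeated
        # questions can skip the Stanford parser run.
        if os.path.isfile(self.dep_tree_cache_file_path):
            with open(self.dep_tree_cache_file_path) as f:
                self.dep_tree_cache = ujson.load(f)
        else:
            self.dep_tree_cache = {}
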
Example #2
    def rank(self, args, question, generated_queries):
        if len(generated_queries) == 0:
            return []
        # Load the model
        checkpoint_filename = '%s.pt' % os.path.join(args.save, args.expname)
        dataset_vocab_file = os.path.join(args.data, 'dataset.vocab')
        vocab = Vocab(filename=dataset_vocab_file,
                      data=[
                          Constants.PAD_WORD, Constants.UNK_WORD,
                          Constants.BOS_WORD, Constants.EOS_WORD
                      ])
        similarity = DASimilarity(args.mem_dim, args.hidden_dim,
                                  args.num_classes)
        model = SimilarityTreeLSTM(vocab.size(), args.input_dim, args.mem_dim,
                                   similarity, args.sparse)
        criterion = nn.KLDivLoss()
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
        # Copy the pre-trained word embeddings into the model before restoring the checkpoint.
        emb_file = os.path.join(args.data, 'dataset_embed.pth')
        if not os.path.isfile(emb_file):
            raise FileNotFoundError('Embedding file not found: %s' % emb_file)
        emb = torch.load(emb_file)
        model.emb.weight.data.copy_(emb)
        checkpoint = torch.load(checkpoint_filename,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['model'])
        trainer = Trainer(args, model, criterion, optimizer)

        # Prepare the dataset
        json_data = [{
            "id": "test",
            "question": question,
            "generated_queries": [{
                "query": " .".join(query["where"]),
                "correct": False
            } for query in generated_queries]
        }]
        output_dir = "./output/tmp"
        preprocess_lcquad.save_split(
            output_dir, *preprocess_lcquad.split(json_data, self.parser))

        lib_dir = './learning/treelstm/lib/'
        classpath = ':'.join([
            lib_dir,
            os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
            os.path.join(lib_dir,
                         'stanford-parser/stanford-parser-3.5.1-models.jar')
        ])

        preprocess_lcquad.parse(output_dir, cp=classpath)
        test_dataset = QGDataset(output_dir, vocab, args.num_classes)

        test_loss, test_pred = trainer.test(test_dataset)
        return test_pred
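
For context, here is a hedged sketch of how rank() might be invoked, assuming a ranker instance like the hypothetical Ranker sketched after Example #1. Every concrete value below (paths, hyperparameters, the sample question and candidate queries) is illustrative and must be replaced by the configuration actually used for training.

from argparse import Namespace

# Placeholder configuration; field names mirror the args attributes read by rank().
args = Namespace(save='./checkpoints', expname='lc_quad',
                 data='./data/lc_quad/', input_dim=300, mem_dim=150,
                 hidden_dim=50, num_classes=2, sparse=False,
                 lr=0.01, wd=1e-4)

question = 'Who is the founder of <ent>?'
generated_queries = [{'where': ['?uri <dbo:founder> <ent>']},
                     {'where': ['<ent> <dbo:founder> ?uri']}]

# `ranker` is an instance of the ranking class whose rank() method is shown above.
scores = ranker.rank(args, question, generated_queries)  # one score per candidate query
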
Example #3
    def rank(self, args, question, generated_queries):
        print('rank function:')
        print('args indim:', args.input_dim)
        print('question:', question)
        print('generated_queries', generated_queries)
        if len(generated_queries) == 0:
            return []
        # Load the trained similarity model and its vocabulary.
        checkpoint_filename = '%s.pt' % os.path.join(args.save, args.expname)
        dataset_vocab_file = os.path.join(args.data, 'dataset.vocab')
        vocab = Vocab(filename=dataset_vocab_file,
                      data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD])
        print('created vocab')
        similarity = DASimilarity(args.mem_dim, args.hidden_dim, args.num_classes)
        model = SimilarityTreeLSTM(
            vocab.size(),
            args.input_dim,
            args.mem_dim,
            similarity,
            args.sparse)
        print(model.emb)
        criterion = nn.KLDivLoss()
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd)
        print('created network')

        # Copy the pre-trained word embeddings into the model, then restore the checkpoint weights.
        emb_file = os.path.join(args.data, 'dataset_embed.pth')
        if not os.path.isfile(emb_file):
            raise FileNotFoundError('Embedding file not found: %s' % emb_file)
        emb = torch.load(emb_file)
        print(emb.shape)
        model.emb.weight.data.copy_(emb)
        print('loaded embedding')
        checkpoint = torch.load(checkpoint_filename, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['model'])
        print('loaded checkpoint')
        trainer = Trainer(args, model, criterion, optimizer)

        # Prepare the dataset. This step generalizes the question/query pairs by
        # replacing entities with #ent placeholders, and writes a.txt and b.txt.
        json_data = [{"id": "test", "question": question,
                      "generated_queries": [{"query": " .".join(query["where"]), "correct": False}
                                            for query in generated_queries]}]
        print('json data:', json_data)
        output_dir = "./output/tmp"
        preprocess_lcquad.save_split(output_dir, *preprocess_lcquad.split(json_data, self.parser))
        print('save split')

        # Parse both the question and the queries with the Stanford parser,
        # generating the toks, rels and parents files.
        lib_dir = '/home/mateus/TCC/SQG/learning/treelstm/lib/'
        classpath = ':'.join([
            lib_dir,
            os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
            os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar')
        ])

        if question in self.dep_tree_cache:
            # The question was parsed before: skip the dependency parse and reuse the cached result.
            preprocess_lcquad.parse(output_dir, cp=classpath, dep_parse=False)

            cache_item = self.dep_tree_cache[question]
            with open(os.path.join(output_dir, 'a.parents'), 'w') as f_parent, open(
                    os.path.join(output_dir, 'a.toks'), 'w') as f_token:
                for i in range(len(generated_queries)):
                    f_token.write(cache_item[0])
                    f_parent.write(cache_item[1])
        else:
            # First time seeing this question: parse it, then cache its tokens and parent pointers.
            print('dep_tree')
            print('classpath', classpath)
            preprocess_lcquad.parse(output_dir, cp=classpath)
            with open(os.path.join(output_dir, 'a.parents')) as f:
                parents = f.readline()
            with open(os.path.join(output_dir, 'a.toks')) as f:
                tokens = f.readline()
            self.dep_tree_cache[question] = [tokens, parents]

            with open(self.dep_tree_cache_file_path, 'w') as f:
                ujson.dump(self.dep_tree_cache, f)

        test_dataset = QGDataset(output_dir, vocab, args.num_classes)

        test_loss, test_pred = trainer.test(test_dataset)
        return test_pred
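
In every example, rank() returns the predictions produced by Trainer.test, i.e. one similarity score per candidate query. A minimal sketch of how a caller might use that result to pick the best query, assuming scores can be iterated in the same order as generated_queries (the variable names continue the call sketch after Example #2):

# Rank the candidate queries by their predicted similarity to the question.
scores = ranker.rank(args, question, generated_queries)
ranked = sorted(zip(generated_queries, scores),
                key=lambda pair: float(pair[1]), reverse=True)
best_query, best_score = ranked[0]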