Exemple #1
0
def run(model_path, test_path, config_path, output_path, batch_size):
    params_path = config_path or os.path.join(model_path, "config.json")

    params = Params.from_file(params_path)
    is_subwords = "tokenizer" in params["reader"] and params["reader"][
        "tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(params.pop("reader"))

    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.training = False

    predictor = Seq2SeqPredictor(model, reader)
    with open(output_path, "wt", encoding="utf-8") as w:
        for batch_number, batch in enumerate(get_batches(
                test_path, batch_size)):
            outputs = predictor.predict_batch_json(batch)
            assert len(outputs) == len(batch)
            for output in outputs:
                decoded_words = output["predicted_tokens"]
                if not decoded_words:
                    decoded_words = ["заявил"]
                if not is_subwords:
                    hyp = " ".join(decoded_words)
                else:
                    hyp = "".join(decoded_words).replace("▁", " ").replace(
                        "\n", "").strip()
                if len(hyp) <= 3:
                    hyp = "заявил"
                w.write(hyp + "\n")
Exemple #2
0
def main(config: str, model_th: str, dataset: str, hypo_file: str, ref_file: str,
         batch_size: int, no_gpu: bool):
    logger = logging.getLogger(__name__)

    logger.info("Loading configuration parameters")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset

    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=batch_size)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    if not no_gpu:
        model.cuda(0)

    with open(model_th, 'rb') as f:
        if no_gpu:
            state_dict = torch.load(f, map_location=torch.device('cpu'))
        else:
            state_dict = torch.load(f)

    model.load_state_dict(state_dict)

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    with open(hypo_file, 'w') as hf, open(ref_file, 'w') as rf:
        logger.info("Generating predictions")
        for sample in tqdm(batches):
            s = list(sample)
            pred = predictor.predict_batch_instance(s)

            for inst, p in zip(s, pred):
                print(
                    " ".join(p["predicted_tokens"][0]),
                    file=hf
                )
                print(
                    " ".join(t.text for t in inst["target_tokens"][1:-1]),
                    file=rf
                )
Exemple #3
0
def get_prediction(model: Model,
                   reader: DatasetReader,
                   data_path: str,
                   batch_size: int = 1024):
    model.eval()
    data = reader.read(data_path)
    predictor = Seq2SeqPredictor(model, reader)

    for ins in batch(data, batch_size):
        yield from predictor.predict_batch_instance(ins)
def main(config: str, model_th: str, dataset: str, out_file):
    logger = logging.getLogger(__name__)

    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = Params({
        "source_token_indexers": {
            "tokens": {
                "type": "single_id",
                "namespace": "tokens"
            }
        },
        "target_namespace": "tokens"
    })
    # reader_name = reader_params.pop("type")
    # reader_params["lazy"] = True  # make sure we do not load the entire dataset

    reader = UnsupervisedBTReader.from_params(reader_params)

    logger.info("Reading data from {}".format(dataset))
    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=32)
    iterator.index_with(vocab)
    batches = iterator._create_batches(data, shuffle=False)

    logger.info("Loading model")
    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    model.cuda(0)

    with open(model_th, 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    line_id = 0
    writer = csv.writer(out_file, delimiter="\t")
    logger.info("Generating predictions")
    for sample in tqdm(batches):
        s = list(sample)
        pred = predictor.predict_batch_instance(s)

        for inst, p in zip(s, pred):
            writer.writerow((line_id, " ".join(
                (t.text for t in inst["source_tokens"][1:-1])),
                             " ".join(p["predicted_tokens"][0])))
            line_id += 1
Exemple #5
0
def main(config: str, model_th: str, dataset: str, seed: int):
    logger = logging.getLogger(__name__)

    logger.info("Loading model and data")
    params = Params.from_file(config)

    vocab_params = params.pop("vocabulary")
    vocab = Vocabulary.from_params(vocab_params)

    reader_params = params.pop("dataset_reader")
    reader_name = reader_params.pop("type")
    reader_params["lazy"] = True  # make sure we do not load the entire dataset
    reader = DatasetReader.by_name(reader_name).from_params(reader_params)

    data = reader.read(dataset)

    iterator = BasicIterator(batch_size=10)
    iterator.index_with(vocab)

    batches = iterator._create_batches(data, shuffle=False)

    model_params = params.pop("model")
    model_name = model_params.pop("type")
    model = Model.by_name(model_name).from_params(model_params, vocab=vocab)
    # model.cuda(cuda_device)

    with open(model_th, 'rb') as f:
        model.load_state_dict(torch.load(f))

    predictor = Seq2SeqPredictor(model, reader)
    model.eval()

    logger.info("Generating predictions")

    random.seed(seed)
    samples = []
    for b in batches:
        samples.append(b)
        if random.random() > 0.6:
            break

    sample = list(random.choice(samples))
    pred = predictor.predict_batch_instance(sample)

    for inst, p in zip(sample, pred):
        print()
        print("SOURCE:", " ".join([t.text for t in inst["source_tokens"]]))
        print("GOLD:", " ".join([t.text for t in inst["target_tokens"]]))
        print("GEN:", p["predicted_tokens"])
    def _test_model(self, file_name):
        params = self.params[file_name].duplicate()
        reader_params = params.duplicate().pop("reader", default=Params({}))
        if reader_params["type"] == "cnn_dailymail":
            reader_params["cnn_tokenized_dir"] = TEST_STORIES_DIR
            dataset_file = TEST_URLS_FILE
        elif reader_params["type"] == "ria":
            dataset_file = RIA_EXAMPLE_FILE
        else:
            assert False

        reader = DatasetReader.from_params(reader_params)
        tokenizer = reader._tokenizer
        dataset = reader.read(dataset_file)
        vocabulary_params = params.pop("vocabulary", default=Params({}))
        vocabulary = Vocabulary.from_params(vocabulary_params,
                                            instances=dataset)

        model_params = params.pop("model")
        model = Model.from_params(model_params, vocab=vocabulary)
        print(model)
        print("Trainable params count: ",
              sum(p.numel() for p in model.parameters() if p.requires_grad))

        iterator = DataIterator.from_params(params.pop('iterator'))
        iterator.index_with(vocabulary)
        trainer = Trainer.from_params(model, None, iterator, dataset, None,
                                      params.pop('trainer'))
        trainer.train()

        model.eval()
        predictor = Seq2SeqPredictor(model, reader)
        for article, reference_sents in reader.parse_set(dataset_file):
            ref_words = [
                token.text for token in tokenizer.tokenize(reference_sents)
            ]
            decoded_words = predictor.predict(article)["predicted_tokens"]
            self.assertGreaterEqual(len(decoded_words), len(ref_words))
            unk_count = 0
            while DEFAULT_OOV_TOKEN in decoded_words:
                unk_index = decoded_words.index(DEFAULT_OOV_TOKEN)
                decoded_words.pop(unk_index)
                unk_count += 1
                if unk_index < len(ref_words):
                    ref_words.pop(unk_index)
            self.assertLess(unk_count, 5)
            self.assertListEqual(decoded_words[:len(ref_words)], ref_words)
Exemple #7
0
def get_model_runner(model_path, reader, model_config_path=None):
    config_path = model_config_path or os.path.join(model_path, "config.json")
    params = Params.from_file(config_path)
    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.eval()
    predictor = Seq2SeqPredictor(model, reader)
    print(model)
    print("Trainable params count: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

    def run_model(batch):
        outputs = predictor.predict_batch_json(batch)
        targets = [b.get('target') for b in batch]
        hyps = []
        for output in outputs:
            decoded_words = output["predicted_tokens"]
            hyp = " ".join(decoded_words).strip()
            hyps.append(hyp)
        return targets, hyps
    return run_model
Exemple #8
0
def evaluate(model_path, test_path, config_path, metric, is_multiple_ref,
             max_count, report_every, batch_size):
    params_path = config_path or os.path.join(model_path, "config.json")

    params = Params.from_file(params_path)
    is_subwords = "tokenizer" in params["reader"] and params["reader"][
        "tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(params.pop("reader"))

    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.training = False
    print(model)
    print("Trainable params count: ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    hyps = []
    refs = []
    predictor = Seq2SeqPredictor(model, reader)
    for batch in get_batches(reader, test_path, batch_size):
        outputs = predictor.predict_batch_json(batch)
        targets = [b.get('target') for b in batch]
        for output, target in zip(outputs, targets):
            decoded_words = output["predicted_tokens"]
            if not is_multiple_ref:
                hyp = detokenize(
                    " ".join(decoded_words)) if not is_subwords else "".join(
                        decoded_words).replace("▁", " ")
                if len(hyp.strip()) <= 1:
                    hyp = "empty"
                    print("Empty hyp")
                if len(target.strip()) <= 1:
                    target = "empty"
                    print("Empty target")
                ref = [target]
            else:
                if isinstance(target, list):
                    reference_sents = target
                elif isinstance(target, str):
                    reference_sents = target.split(" s_s ")
                else:
                    assert False
                decoded_sents = (" ".join(decoded_words)).split("s_s")
                hyp = [
                    w.replace("<", "&lt;").replace(">", "&gt;").strip()
                    for w in decoded_sents
                ]
                ref = [
                    w.replace("<", "&lt;").replace(">", "&gt;").strip()
                    for w in reference_sents
                ]
                hyp = " ".join(hyp)
                ref = [" ".join(ref)]

            hyps.append(hyp)
            refs.append(ref)

            if len(hyps) % report_every == 0:
                print("Count: ", len(hyps))
                print("Ref: ", ref)
                print("Hyp: ", hyp)

                if metric in ("bleu", "all"):
                    from nltk.translate.bleu_score import corpus_bleu
                    print("BLEU: ", corpus_bleu(refs, hyps))

                if metric in ("rouge", "all"):
                    rouge = Rouge()
                    scores = rouge.get_scores(hyps, [r[0] for r in refs],
                                              avg=True)
                    print("ROUGE: ", scores)

            if max_count and len(hyps) >= max_count:
                break