def _load_reader_and_predictor(basedir):
    """Reconstruct a dataset reader and predictor from a serialized model dir.

    ``basedir`` is expected to hold ``config.json`` plus the serialized
    model weights produced during training.

    Returns:
        A ``(reader, predictor)`` pair built from the stored configuration.
    """
    config = Params.from_file(os.path.join(basedir, 'config.json'))
    reader = DatasetReader.from_params(config.get('dataset_reader'))
    model = Model.load(config=config, serialization_dir=basedir)
    return reader, Predictor(model=model, dataset_reader=reader)
def load_predictor(serialization_dir, device):
    """Load a trained NLI model archive and wrap it in a ``Predictor``.

    Args:
        serialization_dir: directory containing ``model.tar.gz``.
        device: CUDA device index; a negative value keeps the model on CPU.

    Returns:
        A ``Predictor`` built from the archived model and its dataset reader.

    Raises:
        ValueError: if the archived model type is neither ``gnli`` nor
            ``bertnli``.
    """
    archive = load_archive(join(serialization_dir, 'model.tar.gz'))
    model = archive.model.eval()
    if device >= 0:
        # BUG FIX: was hard-coded ``model.to(0)``, which always targeted GPU 0
        # regardless of the requested device index.
        model.to(device)

    dataset_reader_params = archive.config.pop('dataset_reader')
    model_name = archive.config.pop('model')['type']

    # Turn off truncation of the inputs where the reader supports it.
    # ``gnli`` readers are left untouched.
    if model_name == 'gnli':
        pass
    elif model_name == 'bertnli':
        dataset_reader_params.params['max_seq_length'] = None
    else:
        raise ValueError('Unrecognized model type: {}'.format(model_name))

    reader = DatasetReader.by_name(dataset_reader_params.pop('type')).from_params(dataset_reader_params)
    predictor = Predictor(model, reader)
    return predictor
def main():
    """Evaluate a trained document classifier on the test split.

    Loads the archived model named by ``--save``, predicts a rating for every
    test document, prints accuracy and macro-F1, and writes the scores plus a
    per-document TSV report to ``<save>/results.txt``.
    """
    parser = ArgumentParser()
    parser.add_argument('--lang', action='store')
    parser.add_argument('--config', action='store', default='configs/HAN.jsonnet')
    parser.add_argument('--save', action='store', default='experiments/models/HAN')
    parser.add_argument('--dataset', default="conllu.tar.gz")
    args = parser.parse_args()

    import_submodules("loader")
    import_submodules("models")

    test_path = "data/{0}/{1}/test".format(args.lang, args.dataset)
    model = load_archive(os.path.join(args.save, "model.tar.gz")).model

    # Pick the reader matching the architecture named in the config path.
    if "HAN" in args.config or "hier" in args.config:
        reader = NorecReaderHierarchical()
    else:
        reader = NorecReader_Flat()
    p = Predictor(model, reader)

    header = "doc_id\tgold\tpred\tnum_sents\tnum_tokens\n"
    predictions = []
    gold_labels = []
    # Collect rows in a list and join once — the original built the report
    # with repeated string concatenation, which is quadratic.
    rows = [header]
    for instance in reader.read(test_path):
        metadata = instance.fields['meta'].metadata
        try:
            pred = p.predict_instance(instance)['prediction']
        except Exception:
            # Deliberate best-effort fallback: on any prediction failure,
            # default to class 1. (Narrowed from a bare ``except:``, which
            # also swallowed KeyboardInterrupt/SystemExit.)
            pred = 1
        predictions.append(pred)
        gold_label = instance["rating"].label
        gold_labels.append(gold_label)
        rows.append("{}\t{}\t{}\t{}\t{}\n".format(
            metadata["doc_id"], gold_label, pred,
            metadata["sentences"], metadata["tokens"]))

    acc = accuracy_score(gold_labels, predictions)
    f1 = f1_score(gold_labels, predictions, average="macro")
    print("Acc score: {0:.3f}\nF1 score: {1:.3f}\n".format(acc, f1))

    final_output = "Acc score: {0:.3f}\nF1 score: {1:.3f}\n\n".format(acc, f1)
    final_output += "".join(rows)
    with open(os.path.join(args.save, "results.txt"), "w") as outfile:
        outfile.write(final_output)
def main(input_file, archive_file, batch_size, cuda_device):
    """Run batched prediction over ``input_file`` and report metrics.

    Each prediction is printed to stdout as one JSON object per line while
    an ``Evaluator`` accumulates metrics; the aggregate metrics go to stderr
    once decoding finishes.
    """
    model, config = load_archive(archive_file=archive_file, cuda_device=cuda_device)
    model.eval()

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    dataset = dataset_reader.read(input_file)
    predictor = Predictor(model, dataset_reader)
    evaluator = Evaluator()

    with tqdm(desc="Decoding...") as progress:
        for instances in batch(dataset, batch_size):
            for result in predictor.predict_batch_instance(instances):
                print(json.dumps(result))
                evaluator(result)
                progress.update()
    print(evaluator.get_metrics(reset=True), file=sys.stderr)
'''
The language model was trained with GloVe embeddings; here we only build an
embedder of the matching shape so the trained parameters can be loaded into it.
'''
token_embedding = Embedding(
    num_embeddings=vocabulary.get_vocab_size(namespace='tokens'),
    embedding_dim=combination.word_embedding_size,
    padding_index=0)
token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
    {'tokens': token_embedding})

# Encoder wrapping a unidirectional LSTM feature extractor.
contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=combination.word_embedding_size,
                  # NOTE(review): ``ed_ncoder_size`` looks like a typo of
                  # "encoder_size" — confirm against the ``combination`` object
                  # before renaming, since the attribute must exist there.
                  hidden_size=combination.ed_ncoder_size,
                  bidirectional=False,
                  batch_first=True))

model = LanguageModel(
    vocab=vocabulary,
    text_field_embedder=token_embedder,
    contextualizer=contextualizer,
    dropout=combination.dropout,
    regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=combination.l2))]),
).cuda(device)

# BUG FIX: the handle from ``open(language_model_path, 'rb')`` was never
# closed; a context manager guarantees it is released after loading.
with open(language_model_path, 'rb') as weights_file:
    model.load_state_dict(torch.load(weights_file), strict=True)

dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)
language_model_predictor = Predictor(model=model, dataset_reader=dataset_reader)

val_data_path = os.path.join('.', 'data_seg_val_toytoy')
instances = dataset_reader.read(val_data_path)
predictions = [
    language_model_predictor.predict_instance(instance)
    for instance in instances
]