Example 1
import os
from allennlp.common import Params
from allennlp.data import DatasetReader
from allennlp.models import Model
from allennlp.predictors import Predictor

def _load_reader_and_predictor(basedir):
    # Rebuild the model and reader from an AllenNLP serialization directory.
    config_path = os.path.join(basedir, 'config.json')
    config = Params.from_file(config_path)
    model = Model.load(config=config, serialization_dir=basedir)
    reader = DatasetReader.from_params(config.get('dataset_reader'))
    predictor = Predictor(model=model, dataset_reader=reader)
    return reader, predictor
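A minimal usage sketch; the serialization directory and data paths below are hypothetical:

reader, predictor = _load_reader_and_predictor('experiments/my_model')  # hypothetical path
for instance in reader.read('data/test.jsonl'):  # hypothetical path
    print(predictor.predict_instance(instance))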
Example 2
from os.path import join

from allennlp.data import DatasetReader
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

def load_predictor(serialization_dir, device):
    # Load the trained model from its archive.
    archive = load_archive(join(serialization_dir, 'model.tar.gz'))
    model = archive.model.eval()
    if device >= 0:
        model.to(device)

    # Load the dataset reader.
    dataset_reader_params = archive.config.pop('dataset_reader')
    model_name = archive.config.pop('model')['type']

    # Turn off truncation of the inputs.
    if model_name == 'gnli':
        pass
        # dataset_reader_params.params['max_premise_length'] = None
        # dataset_reader_params.params['max_hypothesis_length'] = None
    elif model_name == 'bertnli':
        dataset_reader_params.params['max_seq_length'] = None
    else:
        raise ValueError('Unexpected model type: {}'.format(model_name))

    reader = DatasetReader.by_name(dataset_reader_params.pop('type')).from_params(dataset_reader_params)

    predictor = Predictor(model, reader)
    return predictor
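A minimal usage sketch (the directory, data path, and device index are hypothetical); `_dataset_reader` is the attribute where the base Predictor stores its reader:

predictor = load_predictor('experiments/bertnli', device=0)  # hypothetical directory
for instance in predictor._dataset_reader.read('data/snli_test.jsonl'):  # hypothetical path
    print(predictor.predict_instance(instance))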
Example 3
import os
from argparse import ArgumentParser

from allennlp.common.util import import_submodules
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
from sklearn.metrics import accuracy_score, f1_score

# Project-specific dataset readers; this import path is assumed.
from loader import NorecReaderHierarchical, NorecReader_Flat

def main():
    parser = ArgumentParser()
    parser.add_argument('--lang', action='store')
    parser.add_argument('--config', action='store', default='configs/HAN.jsonnet')
    parser.add_argument('--save', action='store', default='experiments/models/HAN')
    parser.add_argument('--dataset', default="conllu.tar.gz")
    args = parser.parse_args()

    import_submodules("loader")
    import_submodules("models")


    test_path = "data/{0}/{1}/test".format(args.lang, args.dataset)

    model = load_archive(os.path.join(args.save, "model.tar.gz")).model
    if "HAN" in args.config or "hier" in args.config:
        reader = NorecReaderHierarchical()
    else:
        reader = NorecReader_Flat()

    p = Predictor(model, reader)

    header = "doc_id\tgold\tpred\tnum_sents\tnum_tokens\n"

    predictions = []
    gold_labels = []
    output_text = header

    for i in reader.read(test_path):
        metadata = i.fields['meta'].metadata

        try:
            pred = p.predict_instance(i)['prediction']
        except Exception:
            # If prediction fails, fall back to a default label of 1.
            pred = 1
        predictions.append(pred)

        gold_label = i["rating"].label
        gold_labels.append(gold_label)

        output_text += "{}\t{}\t{}\t{}\t{}\n".format(metadata["doc_id"],
                                                     gold_label,
                                                     pred,
                                                     metadata["sentences"],
                                                     metadata["tokens"])


    acc = accuracy_score(gold_labels, predictions)
    f1 = f1_score(gold_labels, predictions, average="macro")

    print("Acc score: {0:.3f}\nF1 score: {1:.3f}\n".format(acc, f1))


    final_output = "Acc score: {0:.3f}\nF1 score: {1:.3f}\n\n".format(acc, f1)
    final_output += output_text

    with open(os.path.join(args.save, "results.txt"), "w") as outfile:
        outfile.write(final_output)

if __name__ == "__main__":
    main()
Example 4
import json
import sys

from tqdm import tqdm

from allennlp.data import DatasetReader
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

def main(input_file, archive_file, batch_size, cuda_device):
    # In older AllenNLP releases, load_archive returns a (model, config) named tuple.
    model, config = load_archive(archive_file=archive_file,
                                 cuda_device=cuda_device)
    model.eval()

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    dataset = dataset_reader.read(input_file)
    predictor = Predictor(model, dataset_reader)
    evaluator = Evaluator()  # project-specific metrics accumulator

    with tqdm(desc="Decoding...") as p:
        for ins in batch(dataset, batch_size):  # `batch` chunks instances; see the sketch below
            for result in predictor.predict_batch_instance(ins):
                print(json.dumps(result))
                evaluator(result)
                p.update()
    print(evaluator.get_metrics(reset=True), file=sys.stderr)
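The `batch` helper is not defined in the snippet above; a minimal sketch of a chunking generator compatible with that loop:

from itertools import islice

def batch(iterable, batch_size):
    # Yield successive lists of at most `batch_size` items.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, batch_size))
        if not chunk:
            return
        yield chunk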
Example 5
import os

import torch

from allennlp.models import LanguageModel
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper, Seq2SeqEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder, TextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.regularizers import L2Regularizer, RegularizerApplicator
from allennlp.predictors import Predictor

# The language model used GloVe, but we just build an embedder to load the trained parameters.
token_embedding = Embedding(
    num_embeddings=vocabulary.get_vocab_size(namespace='tokens'),
    embedding_dim=combination.word_embedding_size,
    padding_index=0)
token_embedder: TextFieldEmbedder = BasicTextFieldEmbedder(
    {'tokens': token_embedding})
# Define an encoder that wraps an LSTM feature extractor.
contextualizer: Seq2SeqEncoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(input_size=combination.word_embedding_size,
                  hidden_size=combination.encoder_size,
                  bidirectional=False,
                  batch_first=True))
model = LanguageModel(vocab=vocabulary,
                      text_field_embedder=token_embedder,
                      contextualizer=contextualizer,
                      dropout=combination.dropout,
                      regularizer=RegularizerApplicator([('l2', L2Regularizer(alpha=combination.l2))]),
                      ).cuda(device)
model.load_state_dict(torch.load(language_model_path), strict=True)
dataset_reader = LanguageModelSegmentReader(global_constants=GLOBAL_CONSTANTS)  # project-specific reader
language_model_predictor = Predictor(model=model,
                                     dataset_reader=dataset_reader)
val_data_path = os.path.join('.', 'data_seg_val_toytoy')
instances = dataset_reader.read(val_data_path)
predictions = [
    language_model_predictor.predict_instance(instance)
    for instance in instances
]
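A follow-up sketch: assuming each prediction dict exposes a 'loss' entry (AllenNLP's LanguageModel reports one when target tokens are available), a per-instance perplexity can be derived from it:

import math

perplexities = [math.exp(p['loss']) for p in predictions]
print('mean perplexity: {:.2f}'.format(sum(perplexities) / len(perplexities)))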