def create_seq_tag_dataset(samples, save_path, config, padding=0):
    dataset = SeqTaggingDataset(
        samples, padding=padding,
        max_text_len=config.get('max_text_len') or 300,
    )
    with open(save_path, 'wb') as f:
        pickle.dump(dataset, f)
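
# Example call (illustrative path and config dict; not taken from the original repo):
#   create_seq_tag_dataset(samples, 'seq_tag/train.pkl', {'max_text_len': 300})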
    if len(sys.argv) != 4:
        print(
            'usage: python3 plotingRelative.py model.pt embedding.pkl TestingData.pkl'
        )
        exit(0)

    modelName = sys.argv[1]
    embeddingName = sys.argv[2]
    testDataName = sys.argv[3]
    # sys.argv[1] trained model weights (model.pt)
    # sys.argv[2] embedding (embedding.pkl)
    # sys.argv[3] test data (TestingData.pkl)

    with open(testDataName, "rb") as FileTesting:
        testingData = pickle.load(FileTesting)

    testingData = SeqTaggingDataset(testingData)

    BATCH_SIZE = 40
    loader = Data.DataLoader(
        dataset=testingData,    # SeqTaggingDataset
        batch_size=BATCH_SIZE,  # mini-batch size
        shuffle=False,          # keep test order fixed so predictions align with ids
        num_workers=2,          # load batches with worker subprocesses
        collate_fn=testingData.collate_fn)

    with open(embeddingName, 'rb') as f:
        embedding = pickle.load(f)
    model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2,
                       len(embedding.vocab), 1, embedding.vectors)  # one output unit: a yes/no score per token
    model.load_state_dict(torch.load(modelName))
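
    # A minimal inference sketch (an assumption, not the original script's code):
    # it presumes collate_fn yields dict batches with a 'text' tensor of padded
    # token ids, and that the model emits one logit per token.
    model.eval()
    predict = []
    with torch.no_grad():
        for batch in loader:
            logits = model(batch['text'])              # (batch, seq_len, 1)
            probs = torch.sigmoid(logits).squeeze(-1)  # per-token probability
            predict.extend(probs.tolist())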
    trainingName = sys.argv[1]
    validName = sys.argv[2]
    embeddingName = sys.argv[3]
    modelName = sys.argv[4]

    with open(trainingName, "rb") as FileTraining:
        trainingData = pickle.load(FileTraining)

    with open(validName, "rb") as FileValidating:
        validData = pickle.load(FileValidating)

    with open("data/valid.jsonl", "r") as f:
        answers = [json.loads(line) for line in f]
        answers = {a['id']: a for a in answers}

    trainingData = SeqTaggingDataset(trainingData)
    validData = SeqTaggingDataset(validData)
    BATCH_SIZE = 32   # training batch size
    BATCH_Valid = 20  # validation batch size
    EPOCH = 20        # last epoch index
    stEPOCH = 1       # first epoch index

    loader = Data.DataLoader(
        dataset=trainingData,   # SeqTaggingDataset
        batch_size=BATCH_SIZE,  # mini-batch size
        shuffle=True,           # reshuffle training data each epoch
        num_workers=2,          # load batches with worker subprocesses
        collate_fn=trainingData.collate_fn)
    loader_valid = Data.DataLoader(
        dataset=validData,       # SeqTaggingDataset
        batch_size=BATCH_Valid,  # mini-batch size
        shuffle=False,           # keep validation order fixed
        num_workers=2,
        collate_fn=validData.collate_fn)
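
    # A minimal training-loop sketch over the loaders above (an assumption, not
    # the original script: the loss, optimizer, and batch keys are illustrative,
    # and padding positions would need masking in a real run).
    with open(embeddingName, 'rb') as f:
        embedding = pickle.load(f)
    model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2,
                       len(embedding.vocab), 1, embedding.vectors)
    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(stEPOCH, EPOCH + 1):
        model.train()
        for batch in loader:
            optimizer.zero_grad()
            logits = model(batch['text']).squeeze(-1)  # (batch, seq_len)
            loss = criterion(logits, batch['label'].float())
            loss.backward()
            optimizer.step()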
# load_from_checkpoint is a classmethod that returns a new, restored model,
# so its result must be assigned; constructing SeqTagger(hparams) first is redundant.
seq_tagger = SeqTagger.load_from_checkpoint("./seq_tag/seq_tag.ckpt")
trainer = pl.Trainer()
trainer.test(seq_tagger)

import json
import math
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np



with open("./seq_tag/test.pkl", "rb") as f:
    test = pickle.load(f)


test_dataset = SeqTaggingDataset(
    test, padding=0,
    max_text_len=300,
)


# Each record keeps a 'sent_range' list of (start, end) token offsets, one pair
# per sentence, so per-token probabilities can be pooled into sentence scores.
sent_range = [d['sent_range'] for d in test_dataset.data]
distribution = []
final_ans = []
all_probs = []
for p, i, r in zip(predict, id, sent_range):
    # average the per-token probabilities over each sentence's token span
    sent_probs = np.array([np.array(p[start:end]).mean() for start, end in r])
    top2 = sent_probs.argsort()[::-1][:2]  # indices of the two highest-scoring sentences
    final_ans.append({'id': i, 'predict_sentence_index': [int(x) for x in top2]})
    all_probs.append([sent_probs[x] for x in top2])
    distribution.append([float(x / len(r)) for x in top2])  # relative position of each pick
Path(args.output_path).write_text('\n'.join([json.dumps(ans) for ans in final_ans])+'\n')
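# Each output line is one JSON record; illustrative example (not real data):
#   {"id": "doc-001", "predict_sentence_index": [3, 0]}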
distribution = np.array(distribution)
List = []