Example #1
    args = parser.parse_args()
    print(args)
    globals().update(args.__dict__)
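    # seed Python, NumPy, and TensorFlow RNGs so fine-tuning runs are reproducible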
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3,
     trY), (vaX1, vaX2, vaX3,
            vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir),
                                                      encoder=text_encoder)
    n_y = 2
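    # grow the BPE vocabulary with three special tokens that frame each example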
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
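    # cap each text segment at half the context, leaving room for the 3 special tokens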
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(trX1, trX2, trX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(teX1, teX2, teX3)
        ]) + 3, n_ctx)
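
Each of these snippets prepares ROCStories inputs for a fine-tuned transformer classifier. The three special tokens appended to the BPE vocabulary frame every (context, ending) pair as one sequence. A minimal sketch of that framing follows; the token IDs below are toy values, and the layout mirrors the _start_/_delimiter_/_classify_ pattern used by transform_roc in the reference code:

start, delimiter, classify = 100, 101, 102  # in practice, len(encoder) at insertion time
context_ids = [11, 12, 13]  # hypothetical BPE-encoded story context
ending_ids = [21, 22]       # hypothetical BPE-encoded candidate ending
sequence = [start] + context_ids + [delimiter] + ending_ids + [classify]
print(sequence)  # [100, 11, 12, 13, 101, 21, 22, 102]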
Example #2
    logger = ResultLogger(path=os.path.join(
        log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2

    if dataset == 'rocstories':
        (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(
            rocstories(data_dir, n_valid=n_valid), encoder=text_encoder)
        n_y = 2
        n_ctx = min(
            max([len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                 for x1, x2, x3 in zip(trX1, trX2, trX3)]
                + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                   for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
                + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                   for x1, x2, x3 in zip(teX1, teX2, teX3)]) + 3,
            n_ctx)
        vocab = n_vocab + n_special + n_ctx
        trX, trM = transform_roc(trX1, trX2, trX3)
        vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
        if submit:
            teX, teM = transform_roc(teX1, teX2, teX3)
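
transform_roc itself is not shown in these snippets. In the reference implementation it packs each framed pair into a fixed-length array of token IDs plus a float mask marking the real tokens. A minimal sketch of that idea, with illustrative names and shapes rather than the exact reference code:

import numpy as np

def transform_pair_sketch(x1, x2, start, delimiter, clf_token, n_ctx):
    """Pack one (context, ending) pair into padded tokens and a mask."""
    seq = [start] + x1 + [delimiter] + x2 + [clf_token]
    tokens = np.zeros(n_ctx, dtype=np.int32)
    mask = np.zeros(n_ctx, dtype=np.float32)
    tokens[:len(seq)] = seq
    mask[:len(seq)] = 1.0  # 1 marks real tokens, 0 marks padding
    return tokens, mask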
Example #3
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2

    if dataset == 'rocstories':
        (trX1, trX2, trX3,
         trY), (vaX1, vaX2, vaX3,
                vaY), (teX1, teX2,
                       teX3) = encode_dataset(rocstories(data_dir,
                                                         n_valid=n_valid),
                                              encoder=text_encoder)
        n_y = 2
        n_ctx = min(
            max([
                len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                for x1, x2, x3 in zip(trX1, trX2, trX3)
            ] + [
                len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
            ] + [
                len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
                for x1, x2, x3 in zip(teX1, teX2, teX3)
            ]) + 3, n_ctx)
        vocab = n_vocab + n_special + n_ctx
        trX, trM = transform_roc(trX1, trX2, trX3)
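
The vocab = n_vocab + n_special + n_ctx line comes from how the reference implementation handles positions: learned positional embeddings share one embedding table with the BPE and special tokens, so each position index is offset past all token IDs. A short sketch with illustrative sizes:

import numpy as np

n_vocab, n_special, n_ctx = 40478, 3, 77  # illustrative sizes
vocab = n_vocab + n_special + n_ctx       # one table: tokens, specials, positions

# Position IDs start after every token ID, so tokens and positions can
# index the same [vocab, n_embd] embedding matrix and be summed.
position_ids = np.arange(n_vocab + n_special, n_vocab + n_special + n_ctx)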
Example #4
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device {} n_gpu {}".format(device, n_gpu))

    res_logger = ResultLogger(path=os.path.join(log_dir,
                                                '{}.jsonl'.format(desc)),
                              **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    logger.info("Encoding dataset...")
    ((trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY),
     (teX1, teX2, teX3)) = encode_dataset(*rocstories(data_dir,
                                                      n_valid=args.n_valid),
                                          encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(trX1, trX2, trX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(teX1, teX2, teX3)
        ]) + 3, n_ctx)
Example #5
    parser.add_argument('--e', type=float, default=1e-8)

    args = parser.parse_args()
    print(args)
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    # TF 2.x equivalent: tf.random.set_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
             for x1, x2, x3 in zip(trX1, trX2, trX3)]
            + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
               for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
            + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
               for x1, x2, x3 in zip(teX1, teX2, teX3)]) + 3,
        n_ctx)
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
    n_valid = len(vaY)
Example #6
    n_ctx = args.n_ctx
    save_dir = args.save_dir
    desc = args.desc
    data_dir = args.data_dir
    log_dir = args.log_dir

    # torch.device object used throughout this script (TODO: add GPU setting)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(
        rocstories(data_dir, n_valid=args.n_valid), encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
             for x1, x2, x3 in zip(trX1, trX2, trX3)]
            + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
               for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
            + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
               for x1, x2, x3 in zip(teX1, teX2, teX3)]) + 3,
        n_ctx)
Example #7
    [references]
    globals().update: https://stackoverflow.com/questions/1589968/python-difference-between-global-globals-updatevar
    __dict__: http://coolpythontips.blogspot.com/2015/12/dict.html
    """
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, "{}.json".format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(
                                                                            rocstories(data_dir),
                                                                            encoder=text_encoder)
    n_y = 2
    encoder["_start_"] = len(encoder)
    encoder["_delimiter_"] = len(encoder)
    encoder["_classify_"] = len(encoder)
    clf_token = encoder["_classify_"]
    n_special = 3
    max_len = n_ctx // 2 - 2
    """
    set the context length from the longest sequence from train, validation
    and test datasets + 3(special tokens used in finetuning)
    or the context length which is originally set
    """
    n_ctx = min(max([len(x1[:max_len]) + max(len(x2[:max_len]),
                                             len(x3[:max_len]))
                     for x1, x2, x3 in zip(trX1, trX2, trX3)]
                    + [len(x1[:max_len]) + max(len(x2[:max_len]),
                                               len(x3[:max_len]))
                       for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
                    + [len(x1[:max_len]) + max(len(x2[:max_len]),
                                               len(x3[:max_len]))
                       for x1, x2, x3 in zip(teX1, teX2, teX3)]) + 3,
                n_ctx)
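
To make the clamp concrete, here is a toy walkthrough with hypothetical lengths, not real data:

# Suppose the configured n_ctx is 512, so max_len = 512 // 2 - 2 = 254.
# If the longest truncated context is 60 tokens and its longer candidate
# ending is 25 tokens, the longest packed sequence needs 60 + 25 + 3 = 88
# slots (counting _start_, _delimiter_, and _classify_), so n_ctx shrinks
# from 512 to 88 and no compute is wasted on padding.
n_ctx = min(60 + 25 + 3, 512)
assert n_ctx == 88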
Example #8
    args = parser.parse_args()
    print(args)
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(
        log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(
        rocstories(data_dir), encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
             for x1, x2, x3 in zip(trX1, trX2, trX3)]
            + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
               for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
            + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
               for x1, x2, x3 in zip(teX1, teX2, teX3)]) + 3,
        n_ctx)
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
Example #9
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    ((trX1, trX2, trX3, trY, trELMo), (vaX1, vaX2, vaX3, vaY, vaELMo),
     (teX1, teX2, teX3,
      teELMo)) = encode_dataset(*rocstories(data_dir, n_valid=args.n_valid),
                                encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = min(
        max([
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(trX1, trX2, trX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(vaX1, vaX2, vaX3)
        ] + [
            len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
            for x1, x2, x3 in zip(teX1, teX2, teX3)
        ]) + 3, n_ctx)