import json
import os
import sys

# NOTE: project-level helpers used throughout these snippets (Dataset, load_json,
# make_dir, InputError, MetaConfig, run_all_experiments, merge_dict_write_tfrecord,
# merge_pretrain_write_tfrecord, NUM_ARGS) are assumed to be importable from the
# surrounding package; their import lines are omitted here.


def main():
    if len(sys.argv) != 5:
        raise InputError(
            "Usage: python write_tfrecords_test.py args_test_json_path "
            "test_json_dir tfrecord_dir vocab_dir")

    # TODO REFACTOR!!!

    args_test_path = sys.argv[1]
    json_dir = sys.argv[2]
    tfrecord_dir = sys.argv[3]
    vocab_dir = sys.argv[4]

    # locate the args.json saved with the training vocabulary: it sits either
    # directly in vocab_dir or in vocab_dir's single subdirectory
    if os.path.exists(os.path.join(os.path.abspath(vocab_dir), 'args.json')):
        args_path = os.path.join(os.path.abspath(vocab_dir), 'args.json')
    else:
        args_path = os.path.join(vocab_dir,
                                 os.listdir(vocab_dir)[0], 'args.json')

    with open(args_path) as file:
        args_used = json.load(file)

    args = load_json(args_test_path)

    dataset = Dataset(

        # keep consistent with the training datasets
        max_document_length=args_used['max_document_length'],
        max_vocab_size=args_used['max_vocab_size_allowed'],
        min_frequency=args_used['min_frequency'],
        max_frequency=args_used['max_frequency'],
        padding=args_used.get('padding', args['padding']),
        write_bow=args_used.get('write_bow', args['write_bow']),
        write_tfidf=args_used.get('write_tfidf', args['write_tfidf']),
        tokenizer_=args_used.get('tokenizer', args['tokenizer']),
        stemmer=args_used.get('stemmer', args['stemmer']),
        stopwords=args_used.get('stopwords', args['stopwords']),
        preproc=args_used.get('preproc', args.get('preproc', True)),
        vocab_all=args_used.get('vocab_all', args.get('vocab_all', False)),

        # may be different
        text_field_names=args['text_field_names'],
        label_field_name=args['label_field_name'],
        label_type=args.get('label_type', 'int'),

        # test split only
        train_ratio=0.0,
        valid_ratio=0.0,

        # default in test mode
        json_dir=json_dir,
        tfrecord_dir=tfrecord_dir,
        vocab_dir=vocab_dir,
        generate_basic_vocab=False,
        vocab_given=True,
        vocab_name='vocab_v2i.json',
        generate_tf_record=True)
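
    # Invoked from the command line per the usage string above, e.g. (the paths
    # below are placeholders, not from the original source):
    #   python write_tfrecords_test.py args_test.json data/json/DATASET \
    #       data/tf/DATASET data/vocab/DATASET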
def main():
    if len(sys.argv) < NUM_ARGS + 1:
        print('Usage: python qsub_stl_jobs.py '
              '<hyperparameter-config-file> <encoder-config-file>')
        sys.exit(1)

    config_file = sys.argv[1]
    with open(config_file) as f:
        # allow whole-line '//' comments in the hyperparameter config
        lines = [line for line in f if not line.lstrip().startswith('/')]
        config = json.loads(' '.join(lines))
    encoder_config_file = sys.argv[2]
    with open(encoder_config_file) as f:
        # allow whole-line '//' comments here as well
        lines = [line for line in f if not line.lstrip().startswith('/')]
        encoder_config = json.loads(' '.join(lines))
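
    # Illustrative only (hypothetical values beyond 'mode' and 'architecture',
    # which are read below): a config file in this format could look like
    #   // experiment sweep
    #   {"mode": "train", "architecture": "bilstm"}
    # since only whole-line '//' comments are stripped before json.loads.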

    # remove unnecessary arguments

    finetune_specific_fields = ['checkpoint_dir_init']

    if config['mode'] != 'finetune':
        for i in finetune_specific_fields:
            if i in config:
                config.pop(i)

    # The encoder config file generated by get_qsub_encod.py contains no
    # comments, so the dict parsed above can be indexed by architecture directly.
    encoder_config = encoder_config[config['architecture']]

    meta_config = MetaConfig(config)
    # config now contains only experimental hyperparameters
    # (i.e., no "meta" parameters [venv, slots_per_job, etc.])
    run_all_experiments(meta_config, config, encoder_config)
def main():
    if len(sys.argv) != 5:
        raise InputError(
            "Usage: python write_tfrecords_finetune.py dataset_name "
            "args_finetune_json_path finetune_json_dir vocab_dir")

    dataset_name = sys.argv[1]
    args_finetune_path = sys.argv[2]
    json_dir = sys.argv[3]
    vocab_dir = sys.argv[4]

    # find the used arguments
    if os.path.exists(os.path.join(os.path.abspath(vocab_dir), 'args.json')):
        args_path = os.path.join(os.path.abspath(vocab_dir), 'args.json')
    else:
        args_path = os.path.join(vocab_dir, os.listdir(vocab_dir)[0],
                                 'args.json')

    with open(args_path) as file:
        args_used = json.load(file)

    args = load_json(args_finetune_path)

    tfrecord_dir = os.path.join("data/tf/single/", dataset_name)
    # tfrecord_dir_name = \
    #   "min_" + str(args['min_frequency']) + \
    #   "_max_" + str(args['max_frequency']) + \
    #   "_vocab_" + str(args['max_vocab_size']) + \
    #   "_doc_" + str(args['max_document_length']) + \
    #   "_tok_" + args['tokenizer'].replace('_tokenizer', '')
    # tfrecord_dir = os.path.join(tfrecord_dir, tfrecord_dir_name)
    tfrecord_dir_name = os.path.basename(vocab_dir)
    tfrecord_dir = os.path.join(tfrecord_dir, tfrecord_dir_name)
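    # The finetune TFRecords therefore land under
    # data/tf/single/<dataset_name>/<basename of vocab_dir>; with a hypothetical
    # vocab_dir of 'vocabs/min_2_max_-1_vocab_10000_doc_400_tok_lower' this is
    # 'data/tf/single/<dataset_name>/min_2_max_-1_vocab_10000_doc_400_tok_lower'.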

    dataset = Dataset(

        # TODO keep consistent with the training datasets?
        max_document_length=args_used['max_document_length'],
        max_vocab_size=args_used['max_vocab_size_allowed'],
        min_frequency=args_used['min_frequency'],
        max_frequency=args_used['max_frequency'],
        # padding=args_used.get('padding', args['padding']),
        # write_bow=args_used.get('write_bow', args['write_bow']),
        # write_tfidf=args_used.get('write_tfidf', args['write_tfidf']),
        # tokenizer_=args_used.get('tokenizer', args['tokenizer']),
        # preproc=args_used.get('preproc', args.get('preproc', True)),
        # vocab_all=args_used.get('vocab_all', args.get('vocab_all', False)),
        padding=args_used['padding'],
        write_bow=args_used['write_bow'],
        write_tfidf=args_used['write_tfidf'],
        tokenizer_=args_used['tokenizer_'],
        stemmer=args_used['stemmer'],
        stopwords=args_used['stopwords'],
        preproc=args_used['preproc'],
        vocab_all=args_used['vocab_all'],

        # may be different
        text_field_names=args['text_field_names'],
        label_field_name=args['label_field_name'],
        label_type=args.get('label_type', 'int'),

        train_ratio=args['train_ratio'],
        valid_ratio=args['valid_ratio'],

        # default in finetune mode
        json_dir=json_dir,
        tfrecord_dir=tfrecord_dir,
        vocab_dir=vocab_dir,
        generate_basic_vocab=False,
        vocab_given=True,
        vocab_name='vocab_v2i.json',
        generate_tf_record=True
    )
def main():
    if len(sys.argv) != 5:
        raise InputError(
            "Usage: python write_tfrecords_predict.py dataset_args_path "
            "predict_json_path predict_tf_path vocab_dir")

    dataset_args_path = sys.argv[1]
    predict_json_path = sys.argv[2]
    predict_tf_path = sys.argv[3]
    vocab_dir = sys.argv[4]

    # find the used arguments
    if os.path.exists(os.path.join(os.path.abspath(vocab_dir), 'args.json')):
        args_path = os.path.join(os.path.abspath(vocab_dir), 'args.json')
    else:
        args_path = os.path.join(
            vocab_dir, os.listdir(vocab_dir)[0], 'args.json')

    with open(args_path) as file:
        args_used = json.load(file)

    if not os.path.exists(os.path.dirname(predict_tf_path)):
        make_dir(os.path.dirname(predict_tf_path))

    # dataset_args_path points at an args_DATASET.json or args_merged.json file
    # holding the min_frequency, max_frequency, max_document_length, etc.
    # settings that were used to build the vocabulary
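    # Hypothetical sketch of such a file (keys taken from how `args` is read
    # below; the values are placeholders, not from the original source):
    #   {"text_field_names": ["text"], "label_field_name": "label",
    #    "label_type": "int", "padding": false, "write_bow": false}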

    args = load_json(dataset_args_path)
    print(args)

    dataset = Dataset(

        # keep consistent with the training datasets
        max_document_length=args_used['max_document_length'],
        max_vocab_size=args_used['max_vocab_size_allowed'],
        min_frequency=args_used['min_frequency'],
        max_frequency=args_used['max_frequency'],
        # padding=args_used.get('padding', args.get('padding', False)),
        # write_bow=args_used.get('write_bow', args.get('write_bow', False)),
        # write_tfidf=args_used.get('write_tfidf', args.get('write_tfidf', False)),
        # tokenizer_=args_used.get('tokenizer', args['tokenizer']),
        # preproc=args_used.get('preproc', args.get('preproc', True)),
        # vocab_all=args_used.get('vocab_all', args.get('vocab_all', False)),

        # use new arguments
        padding=args.get('padding', args_used.get('padding', False)),
        write_bow=args.get('write_bow', args_used.get('write_bow', False)),
        write_tfidf=args.get('write_tfidf',
                             args_used.get('write_tfidf', False)),
        tokenizer_=args.get('tokenizer', args_used.get('tokenizer_',
                                                       'lower_tokenizer')),
        stemmer=args.get('stemmer', args_used.get('stemmer', 'porter_stemmer')),
        stopwords=args.get('stopwords', args_used.get('stopwords', 'nltk')),
        preproc=args.get('preproc', args_used.get('preproc', True)),
        vocab_all=args.get('vocab_all', args_used.get('vocab_all', False)),

        # may be different
        text_field_names=args['text_field_names'],
        label_field_name=args['label_field_name'],
        label_type=args.get('label_type', 'int'),

        # default in predict mode
        json_dir=None,
        tfrecord_dir=None,
        vocab_dir=vocab_dir,
        generate_basic_vocab=False,
        vocab_given=True,
        vocab_name='vocab_v2i.json',
        generate_tf_record=True,
        predict_mode=True,
        predict_json_path=predict_json_path,
        predict_tf_path=predict_tf_path
    )
def main():
    if len(sys.argv) == 2:
        args_name = 'args_' + sys.argv[1] + '.json'
    else:
        args_name = sys.argv[2]

    args = load_json(args_name)

    json_dir = "data/json/" + sys.argv[1]

    tfrecord_dir = os.path.join("data/tf/single/", sys.argv[1])

    preproc = args.get('preproc', True)
    vocab_all = args.get('vocab_all', False)

    tfrecord_dir_name = \
        "min_" + str(args['min_frequency']) + \
        "_max_" + str(args['max_frequency']) + \
        "_vocab_" + str(args['max_vocab_size']) + \
        "_doc_" + str(args['max_document_length']) + \
        "_tok_" + args['tokenizer'].replace('_tokenizer', '')

    print(tfrecord_dir_name)

    if 'pretrained_file' not in args or not args['pretrained_file']:
        tfrecord_dir = os.path.join(tfrecord_dir, tfrecord_dir_name)
        dataset = Dataset(json_dir=json_dir,
                          tfrecord_dir=tfrecord_dir,
                          vocab_dir=tfrecord_dir,
                          text_field_names=args['text_field_names'],
                          label_field_name=args['label_field_name'],
                          label_type=args.get('label_type', 'int'),
                          max_document_length=args['max_document_length'],
                          max_vocab_size=args['max_vocab_size'],
                          min_frequency=args['min_frequency'],
                          max_frequency=args['max_frequency'],
                          train_ratio=args['train_ratio'],
                          valid_ratio=args['valid_ratio'],
                          subsample_ratio=args['subsample_ratio'],
                          padding=args['padding'],
                          write_bow=args['write_bow'],
                          write_tfidf=args['write_tfidf'],
                          tokenizer_=args['tokenizer'],
                          stemmer=args.get('stemmer', None),
                          stopwords=args.get('stopwords', None),
                          generate_basic_vocab=False,
                          vocab_given=False,
                          generate_tf_record=True,
                          preproc=preproc,
                          vocab_all=vocab_all)
    else:
        vocab_path = args['pretrained_file']
        vocab_dir = os.path.dirname(vocab_path)
        vocab_name = os.path.basename(vocab_path)

        pretrained_only = args.get('pretrained_only', False)
        expand_vocab = args.get('expand_vocab', False)

        if pretrained_only:
            suffix = '_only'
        elif expand_vocab:
            suffix = '_expand'
        else:
            suffix = '_init'

        tfrecord_dir = os.path.join(
            tfrecord_dir,
            tfrecord_dir_name + '_' +
            vocab_name[:max(vocab_name.find('.txt'),
                            vocab_name.find('.bin.gz'),
                            vocab_name.find('.vec.zip'))] + suffix)
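
        # Illustration with hypothetical inputs: pretrained_file 'glove.6B.100d.txt'
        # with expand_vocab=True gives a directory name ending in
        # '<tfrecord_dir_name>_glove.6B.100d_expand'; the slice strips the
        # recognized extension ('.txt', '.bin.gz' or '.vec.zip') from the file name.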

        dataset = Dataset(json_dir=json_dir,
                          tfrecord_dir=tfrecord_dir,
                          vocab_given=True,
                          vocab_dir=vocab_dir,
                          vocab_name=vocab_name,
                          text_field_names=args['text_field_names'],
                          label_field_name=args['label_field_name'],
                          label_type=args.get('label_type', 'int'),
                          max_document_length=args['max_document_length'],
                          max_vocab_size=args['max_vocab_size'],
                          min_frequency=args['min_frequency'],
                          max_frequency=args['max_frequency'],
                          train_ratio=args['train_ratio'],
                          valid_ratio=args['valid_ratio'],
                          subsample_ratio=args['subsample_ratio'],
                          padding=args['padding'],
                          write_bow=args['write_bow'],
                          write_tfidf=args['write_tfidf'],
                          tokenizer_=args['tokenizer'],
                          stemmer=args['stemmer'],
                          stopwords=args['stopwords'],
                          generate_basic_vocab=False,
                          generate_tf_record=True,
                          expand_vocab=expand_vocab,
                          pretrained_only=pretrained_only,
                          preproc=preproc,
                          vocab_all=vocab_all)

    with open(os.path.join(tfrecord_dir, 'vocab_size.txt'), 'w') as f:
        f.write(str(dataset.vocab_size))

    return tfrecord_dir
def main(argv):
    if argv[-1].endswith('.json'):
        args_name = argv[-1]
        argv = argv[:-1]
    else:
        args_name = 'args_merged.json'
    args = load_json(args_name)

    tfrecord_dir = "data/tf/merged/"
    datasets = sorted(argv[1:])
    for dataset in datasets[:-1]:
        tfrecord_dir += dataset + "_"
    tfrecord_dir += datasets[-1] + '/'

    json_dirs = [os.path.join('data/json/', dataset) for dataset in datasets]

    preproc = args.get('preproc', True)
    vocab_all = args.get('vocab_all', False)

    tfrecord_dir_name = \
        "min_" + str(args['min_frequency']) + \
        "_max_" + str(args['max_frequency']) + \
        "_vocab_" + str(args['max_vocab_size']) + \
        "_doc_" + str(args['max_document_length']) + \
        "_tok_" + args['tokenizer'].replace('_tokenizer', '')

    if 'pretrained_file' not in args or not args['pretrained_file']:
        tfrecord_dir = os.path.join(tfrecord_dir, tfrecord_dir_name)
        tfrecord_dirs = [
            os.path.join(tfrecord_dir, dataset) for dataset in datasets
        ]
        assert ([os.path.basename(tf_dir) for tf_dir in tfrecord_dirs] ==
                [os.path.basename(json_dir) for json_dir in json_dirs])
        for i in tfrecord_dirs:
            make_dir(i)
        merge_dict_write_tfrecord(
            json_dirs=json_dirs,
            tfrecord_dirs=tfrecord_dirs,
            merged_dir=tfrecord_dir,
            max_document_length=args['max_document_length'],
            max_vocab_size=args['max_vocab_size'],
            min_frequency=args['min_frequency'],
            max_frequency=args['max_frequency'],
            text_field_names=args['text_field_names'],
            label_field_name=args['label_field_name'],
            label_type=args.get('label_type', 'int'),
            train_ratio=args['train_ratio'],
            valid_ratio=args['valid_ratio'],
            tokenizer_=args['tokenizer'],
            stemmer=args['stemmer'],
            stopwords=args['stopwords'],
            subsample_ratio=args['subsample_ratio'],
            padding=args['padding'],
            write_bow=args['write_bow'],
            write_tfidf=args['write_tfidf'],
            preproc=preproc,
            vocab_all=vocab_all)
    else:
        vocab_path = args['pretrained_file']
        vocab_dir = os.path.dirname(vocab_path)
        vocab_name = os.path.basename(vocab_path)
        pretrained_only = args.get('pretrained_only', False)
        expand_vocab = args.get('expand_vocab', False)

        if pretrained_only:
            suffix = '_only'
        elif expand_vocab:
            suffix = '_expand'
        else:
            suffix = '_init'

        tfrecord_dir = os.path.join(
            tfrecord_dir,
            tfrecord_dir_name + '_' +
            vocab_name[:max(vocab_name.find('.txt'),
                            vocab_name.find('.bin.gz'),
                            vocab_name.find('.vec.zip'))] + suffix)

        tfrecord_dirs = [
            os.path.join(tfrecord_dir, dataset) for dataset in datasets
        ]
        for i in tfrecord_dirs:
            make_dir(i)
        assert ([os.path.basename(tf_dir) for tf_dir in tfrecord_dirs] ==
                [os.path.basename(json_dir) for json_dir in json_dirs])

        merge_pretrain_write_tfrecord(
            json_dirs=json_dirs,
            tfrecord_dirs=tfrecord_dirs,
            merged_dir=tfrecord_dir,
            vocab_dir=vocab_dir,
            vocab_name=vocab_name,
            text_field_names=args['text_field_names'],
            label_field_name=args['label_field_name'],
            label_type=args.get('label_type', 'int'),
            max_document_length=args['max_document_length'],
            max_vocab_size=args['max_vocab_size'],
            min_frequency=args['min_frequency'],
            max_frequency=args['max_frequency'],
            train_ratio=args['train_ratio'],
            valid_ratio=args['valid_ratio'],
            subsample_ratio=args['subsample_ratio'],
            padding=args['padding'],
            write_bow=args['write_bow'],
            write_tfidf=args['write_tfidf'],
            tokenizer_=args['tokenizer'],
            stemmer=args['stemmer'],
            stopwords=args['stopwords'],
            expand_vocab=expand_vocab,
            pretrained_only=pretrained_only,
            preproc=preproc,
            vocab_all=vocab_all)

    return tfrecord_dir
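

# Minimal sketch of an entry point (not part of the original snippet; the
# script name below is hypothetical). This last main() expects the full argv
# list, e.g.:
#   python write_tfrecords_merged.py dataset_a dataset_b [args_merged.json]
if __name__ == '__main__':
    main(sys.argv)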