def main():
    if len(sys.argv) != 5:
        raise InputError(
            "Usage: python write_tfrecords_test.py args_test_json_path "
            "test_json_dir tfrecord_dir vocab_dir")

    # TODO REFACTOR!!!
    args_test_path = sys.argv[1]
    json_dir = sys.argv[2]
    tfrecord_dir = sys.argv[3]
    vocab_dir = sys.argv[4]

    # find the arguments used to build the training vocabulary
    if os.path.exists(os.path.join(os.path.abspath(vocab_dir), 'args.json')):
        args_path = os.path.join(os.path.abspath(vocab_dir), 'args.json')
    else:
        args_path = os.path.join(vocab_dir, os.listdir(vocab_dir)[0],
                                 'args.json')
    with open(args_path) as file:
        args_used = json.load(file)

    # arguments for the test data (may differ from the training args)
    args = load_json(args_test_path)

    dataset = Dataset(
        # keep consistent with the training datasets
        max_document_length=args_used['max_document_length'],
        max_vocab_size=args_used['max_vocab_size_allowed'],
        min_frequency=args_used['min_frequency'],
        max_frequency=args_used['max_frequency'],
        padding=args_used.get('padding', args['padding']),
        write_bow=args_used.get('write_bow', args['write_bow']),
        write_tfidf=args_used.get('write_tfidf', args['write_tfidf']),
        tokenizer_=args_used.get('tokenizer', args['tokenizer']),
        stemmer=args_used.get('stemmer', args['stemmer']),
        stopwords=args_used.get('stopwords', args['stopwords']),
        preproc=args_used.get('preproc', args.get('preproc', True)),
        vocab_all=args_used.get('vocab_all', args.get('vocab_all', False)),
        # may be different
        text_field_names=args['text_field_names'],
        label_field_name=args['label_field_name'],
        label_type=args.get('label_type', 'int'),
        # test split only
        train_ratio=0.0,
        valid_ratio=0.0,
        # default in test mode
        json_dir=json_dir,
        tfrecord_dir=tfrecord_dir,
        vocab_dir=vocab_dir,
        generate_basic_vocab=False,
        vocab_given=True,
        vocab_name='vocab_v2i.json',
        generate_tf_record=True)
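# The test, finetune, and predict scripts all repeat the same lookup for the
# args.json written alongside the training vocabulary. A minimal sketch of
# that lookup as a standalone helper; find_args_path is a hypothetical name,
# not a function defined in this repo.

import os


def find_args_path(vocab_dir):
    # prefer args.json directly under vocab_dir
    direct = os.path.join(os.path.abspath(vocab_dir), 'args.json')
    if os.path.exists(direct):
        return direct
    # otherwise fall back to the first subdirectory's args.json
    return os.path.join(vocab_dir, os.listdir(vocab_dir)[0], 'args.json')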
def main():
    if len(sys.argv) < NUM_ARGS + 1:
        print('Usage: python qsub_stl_jobs.py '
              '<hyperparameter-config-file> <encoder-config-file>')
        sys.exit(1)

    # hyperparameter config; allow for '//' comments
    config_file = sys.argv[1]
    with open(config_file) as f:
        lines = [line for line in f if not line.lstrip().startswith('//')]
        config = json.loads(' '.join(lines))

    # remove unnecessary arguments
    finetune_specific_fields = ['checkpoint_dir_init']
    if config['mode'] != 'finetune':
        for field in finetune_specific_fields:
            if field in config:
                config.pop(field)

    # no comments in the encoder config file generated by get_qsub_encod.py
    encoder_config_file = sys.argv[2]
    with open(encoder_config_file) as f:
        encoder_config = json.load(f)[config['architecture']]

    meta_config = MetaConfig(config)

    # config now contains only experimental hyperparameters
    # (i.e., no "meta" parameters [venv, slots_per_job, etc.])
    run_all_experiments(meta_config, config, encoder_config)
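# A self-contained sketch of the comment-tolerant JSON loading used above:
# lines whose first non-whitespace characters are '//' are dropped before
# parsing. load_commented_json is a hypothetical helper for illustration.

import json


def load_commented_json(path):
    # drop '//' comment lines, then parse the remainder as ordinary JSON
    with open(path) as f:
        lines = [line for line in f if not line.lstrip().startswith('//')]
    return json.loads(' '.join(lines))

# Example: a file containing
#   // swept hyperparameters
#   {"mode": "train", "architecture": "bilstm"}
# parses to {'mode': 'train', 'architecture': 'bilstm'}.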
def main():
    if len(sys.argv) != 5:
        raise InputError(
            "Usage: python write_tfrecords_finetune.py dataset_name "
            "args_finetune_json_path finetune_json_dir vocab_dir")

    dataset_name = sys.argv[1]
    args_finetune_path = sys.argv[2]
    json_dir = sys.argv[3]
    vocab_dir = sys.argv[4]

    # find the arguments used to build the training vocabulary
    if os.path.exists(os.path.join(os.path.abspath(vocab_dir), 'args.json')):
        args_path = os.path.join(os.path.abspath(vocab_dir), 'args.json')
    else:
        args_path = os.path.join(vocab_dir, os.listdir(vocab_dir)[0],
                                 'args.json')
    with open(args_path) as file:
        args_used = json.load(file)

    args = load_json(args_finetune_path)

    tfrecord_dir = os.path.join("data/tf/single/", dataset_name)
    # alternative naming scheme (from the vocabulary hyperparameters):
    # tfrecord_dir_name = \
    #     "min_" + str(args['min_frequency']) + \
    #     "_max_" + str(args['max_frequency']) + \
    #     "_vocab_" + str(args['max_vocab_size']) + \
    #     "_doc_" + str(args['max_document_length']) + \
    #     "_tok_" + args['tokenizer'].replace('_tokenizer', '')
    # tfrecord_dir = os.path.join(tfrecord_dir, tfrecord_dir_name)

    # mirror the training vocabulary's directory name instead
    tfrecord_dir_name = os.path.basename(vocab_dir)
    tfrecord_dir = os.path.join(tfrecord_dir, tfrecord_dir_name)

    dataset = Dataset(
        # TODO keep consistent with the training datasets?
        max_document_length=args_used['max_document_length'],
        max_vocab_size=args_used['max_vocab_size_allowed'],
        min_frequency=args_used['min_frequency'],
        max_frequency=args_used['max_frequency'],
        # padding=args_used.get('padding', args['padding']),
        # write_bow=args_used.get('write_bow', args['write_bow']),
        # write_tfidf=args_used.get('write_tfidf', args['write_tfidf']),
        # tokenizer_=args_used.get('tokenizer', args['tokenizer']),
        # preproc=args_used.get('preproc', args.get('preproc', True)),
        # vocab_all=args_used.get('vocab_all', args.get('vocab_all', False)),
        padding=args_used['padding'],
        write_bow=args_used['write_bow'],
        write_tfidf=args_used['write_tfidf'],
        tokenizer_=args_used['tokenizer_'],
        stemmer=args_used['stemmer'],
        stopwords=args_used['stopwords'],
        preproc=args_used['preproc'],
        vocab_all=args_used['vocab_all'],
        # may be different
        text_field_names=args['text_field_names'],
        label_field_name=args['label_field_name'],
        label_type=args.get('label_type', 'int'),
        train_ratio=args['train_ratio'],
        valid_ratio=args['valid_ratio'],
        # default in finetune mode
        json_dir=json_dir,
        tfrecord_dir=tfrecord_dir,
        vocab_dir=vocab_dir,
        generate_basic_vocab=False,
        vocab_given=True,
        vocab_name='vocab_v2i.json',
        generate_tf_record=True)
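# For reference, the path layout the finetune script builds: the basename of
# the training vocabulary directory becomes the tfrecord subdirectory of the
# finetune dataset. The directory and dataset names below are hypothetical.

import os

vocab_dir_example = "data/tf/merged/books_dvd/min_1_max_-1_vocab_-1_doc_-1_tok_lower"
dataset_name_example = "electronics"

print(os.path.join("data/tf/single/", dataset_name_example,
                   os.path.basename(vocab_dir_example)))
# data/tf/single/electronics/min_1_max_-1_vocab_-1_doc_-1_tok_lower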
def main():
    if len(sys.argv) != 5:
        raise InputError(
            "Usage: python write_tfrecords_predict.py dataset_args_path "
            "predict_json_path predict_tf_path vocab_dir")

    dataset_args_path = sys.argv[1]
    predict_json_path = sys.argv[2]
    predict_tf_path = sys.argv[3]
    vocab_dir = sys.argv[4]

    # find the arguments used to build the training vocabulary
    if os.path.exists(os.path.join(os.path.abspath(vocab_dir), 'args.json')):
        args_path = os.path.join(os.path.abspath(vocab_dir), 'args.json')
    else:
        args_path = os.path.join(vocab_dir, os.listdir(vocab_dir)[0],
                                 'args.json')
    with open(args_path) as file:
        args_used = json.load(file)

    if not os.path.exists(os.path.dirname(predict_tf_path)):
        make_dir(os.path.dirname(predict_tf_path))

    # args_DATASET.json or args_merged.json, which holds the min_freq,
    # max_freq, max_document_length, etc. used to build the vocabulary
    args = load_json(dataset_args_path)
    print(args)

    dataset = Dataset(
        # keep consistent with the training datasets
        max_document_length=args_used['max_document_length'],
        max_vocab_size=args_used['max_vocab_size_allowed'],
        min_frequency=args_used['min_frequency'],
        max_frequency=args_used['max_frequency'],
        # padding=args_used.get('padding', args.get('padding', False)),
        # write_bow=args_used.get('write_bow', args.get('write_bow', False)),
        # write_tfidf=args_used.get('write_tfidf',
        #                           args.get('write_tfidf', False)),
        # tokenizer_=args_used.get('tokenizer', args['tokenizer']),
        # preproc=args_used.get('preproc', args.get('preproc', True)),
        # vocab_all=args_used.get('vocab_all', args.get('vocab_all', False)),
        # use new arguments, falling back to the training-time values
        padding=args.get('padding', args_used.get('padding', False)),
        write_bow=args.get('write_bow', args_used.get('write_bow', False)),
        write_tfidf=args.get('write_tfidf',
                             args_used.get('write_tfidf', False)),
        tokenizer_=args.get('tokenizer',
                            args_used.get('tokenizer_', 'lower_tokenizer')),
        stemmer=args.get('stemmer',
                         args_used.get('stemmer', 'porter_stemmer')),
        stopwords=args.get('stopwords', args_used.get('stopwords', 'nltk')),
        preproc=args.get('preproc', args_used.get('preproc', True)),
        vocab_all=args.get('vocab_all', args_used.get('vocab_all', False)),
        # may be different
        text_field_names=args['text_field_names'],
        label_field_name=args['label_field_name'],
        label_type=args.get('label_type', 'int'),
        # default in predict mode
        json_dir=None,
        tfrecord_dir=None,
        vocab_dir=vocab_dir,
        generate_basic_vocab=False,
        vocab_given=True,
        vocab_name='vocab_v2i.json',
        generate_tf_record=True,
        predict_mode=True,
        predict_json_path=predict_json_path,
        predict_tf_path=predict_tf_path)
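# The predict script resolves each preprocessing option by preferring the
# dataset args over the training-time args, with a hard-coded final default.
# A small illustration with hypothetical dicts:

args_example = {'padding': True}                         # new dataset args
args_used_example = {'padding': False, 'preproc': True}  # training-time args

padding = args_example.get('padding',
                           args_used_example.get('padding', False))
preproc = args_example.get('preproc',
                           args_used_example.get('preproc', True))
print(padding, preproc)  # True True (padding from args, preproc from args_used)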
def main():
    if len(sys.argv) == 2:
        args_name = 'args_' + sys.argv[1] + '.json'
    else:
        args_name = sys.argv[2]
    args = load_json(args_name)

    json_dir = "data/json/" + sys.argv[1]
    tfrecord_dir = os.path.join("data/tf/single/", sys.argv[1])

    preproc = args.get('preproc', True)
    vocab_all = args.get('vocab_all', False)

    tfrecord_dir_name = \
        "min_" + str(args['min_frequency']) + \
        "_max_" + str(args['max_frequency']) + \
        "_vocab_" + str(args['max_vocab_size']) + \
        "_doc_" + str(args['max_document_length']) + \
        "_tok_" + args['tokenizer'].replace('_tokenizer', '')
    print(tfrecord_dir_name)

    if 'pretrained_file' not in args or not args['pretrained_file']:
        tfrecord_dir = os.path.join(tfrecord_dir, tfrecord_dir_name)
        dataset = Dataset(json_dir=json_dir,
                          tfrecord_dir=tfrecord_dir,
                          vocab_dir=tfrecord_dir,
                          text_field_names=args['text_field_names'],
                          label_field_name=args['label_field_name'],
                          label_type=args.get('label_type', 'int'),
                          max_document_length=args['max_document_length'],
                          max_vocab_size=args['max_vocab_size'],
                          min_frequency=args['min_frequency'],
                          max_frequency=args['max_frequency'],
                          train_ratio=args['train_ratio'],
                          valid_ratio=args['valid_ratio'],
                          subsample_ratio=args['subsample_ratio'],
                          padding=args['padding'],
                          write_bow=args['write_bow'],
                          write_tfidf=args['write_tfidf'],
                          tokenizer_=args['tokenizer'],
                          stemmer=args.get('stemmer', None),
                          stopwords=args.get('stopwords', None),
                          generate_basic_vocab=False,
                          vocab_given=False,
                          generate_tf_record=True,
                          preproc=preproc,
                          vocab_all=vocab_all)
    else:
        vocab_path = args['pretrained_file']
        vocab_dir = os.path.dirname(vocab_path)
        vocab_name = os.path.basename(vocab_path)
        pretrained_only = args.get('pretrained_only', False)
        expand_vocab = args.get('expand_vocab', False)
        if pretrained_only:
            suffix = '_only'
        elif expand_vocab:
            suffix = '_expand'
        else:
            suffix = '_init'
        # strip the embedding file extension from the vocab name
        vocab_stem = vocab_name[:max(vocab_name.find('.txt'),
                                     vocab_name.find('.bin.gz'),
                                     vocab_name.find('.vec.zip'))]
        tfrecord_dir = os.path.join(
            tfrecord_dir, tfrecord_dir_name + '_' + vocab_stem + suffix)
        dataset = Dataset(json_dir=json_dir,
                          tfrecord_dir=tfrecord_dir,
                          vocab_given=True,
                          vocab_dir=vocab_dir,
                          vocab_name=vocab_name,
                          text_field_names=args['text_field_names'],
                          label_field_name=args['label_field_name'],
                          label_type=args.get('label_type', 'int'),
                          max_document_length=args['max_document_length'],
                          max_vocab_size=args['max_vocab_size'],
                          min_frequency=args['min_frequency'],
                          max_frequency=args['max_frequency'],
                          train_ratio=args['train_ratio'],
                          valid_ratio=args['valid_ratio'],
                          subsample_ratio=args['subsample_ratio'],
                          padding=args['padding'],
                          write_bow=args['write_bow'],
                          write_tfidf=args['write_tfidf'],
                          tokenizer_=args['tokenizer'],
                          stemmer=args['stemmer'],
                          stopwords=args['stopwords'],
                          generate_basic_vocab=False,
                          generate_tf_record=True,
                          expand_vocab=expand_vocab,
                          pretrained_only=pretrained_only,
                          preproc=preproc,
                          vocab_all=vocab_all)

    with open(os.path.join(tfrecord_dir, 'vocab_size.txt'), 'w') as f:
        f.write(str(dataset.vocab_size))

    return tfrecord_dir
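# A worked example of the tfrecord directory naming scheme built above,
# using a hypothetical args dict; the resulting name encodes the main
# vocabulary hyperparameters.

args_example = {
    'min_frequency': 5,
    'max_frequency': -1,
    'max_vocab_size': 10000,
    'max_document_length': 400,
    'tokenizer': 'lower_tokenizer',
}

name = ("min_" + str(args_example['min_frequency']) +
        "_max_" + str(args_example['max_frequency']) +
        "_vocab_" + str(args_example['max_vocab_size']) +
        "_doc_" + str(args_example['max_document_length']) +
        "_tok_" + args_example['tokenizer'].replace('_tokenizer', ''))
print(name)  # min_5_max_-1_vocab_10000_doc_400_tok_lower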
def main(argv):
    if argv[-1].endswith('.json'):
        args_name = argv[-1]
        argv = argv[:-1]
    else:
        args_name = 'args_merged.json'
    args = load_json(args_name)

    datasets = sorted(argv[1:])
    tfrecord_dir = "data/tf/merged/" + "_".join(datasets) + '/'
    json_dirs = [os.path.join('data/json/', dataset) for dataset in datasets]

    preproc = args.get('preproc', True)
    vocab_all = args.get('vocab_all', False)

    tfrecord_dir_name = \
        "min_" + str(args['min_frequency']) + \
        "_max_" + str(args['max_frequency']) + \
        "_vocab_" + str(args['max_vocab_size']) + \
        "_doc_" + str(args['max_document_length']) + \
        "_tok_" + args['tokenizer'].replace('_tokenizer', '')

    if 'pretrained_file' not in args or not args['pretrained_file']:
        tfrecord_dir = os.path.join(tfrecord_dir, tfrecord_dir_name)
        tfrecord_dirs = [
            os.path.join(tfrecord_dir, dataset) for dataset in datasets
        ]
        assert [os.path.basename(tf_dir) for tf_dir in tfrecord_dirs] == \
               [os.path.basename(json_dir) for json_dir in json_dirs]
        for tf_dir in tfrecord_dirs:
            make_dir(tf_dir)
        merge_dict_write_tfrecord(
            json_dirs=json_dirs,
            tfrecord_dirs=tfrecord_dirs,
            merged_dir=tfrecord_dir,
            max_document_length=args['max_document_length'],
            max_vocab_size=args['max_vocab_size'],
            min_frequency=args['min_frequency'],
            max_frequency=args['max_frequency'],
            text_field_names=args['text_field_names'],
            label_field_name=args['label_field_name'],
            label_type=args.get('label_type', 'int'),
            train_ratio=args['train_ratio'],
            valid_ratio=args['valid_ratio'],
            tokenizer_=args['tokenizer'],
            stemmer=args['stemmer'],
            stopwords=args['stopwords'],
            subsample_ratio=args['subsample_ratio'],
            padding=args['padding'],
            write_bow=args['write_bow'],
            write_tfidf=args['write_tfidf'],
            preproc=preproc,
            vocab_all=vocab_all)
    else:
        vocab_path = args['pretrained_file']
        vocab_dir = os.path.dirname(vocab_path)
        vocab_name = os.path.basename(vocab_path)
        pretrained_only = args.get('pretrained_only', False)
        expand_vocab = args.get('expand_vocab', False)
        if pretrained_only:
            suffix = '_only'
        elif expand_vocab:
            suffix = '_expand'
        else:
            suffix = '_init'
        # strip the embedding file extension from the vocab name
        vocab_stem = vocab_name[:max(vocab_name.find('.txt'),
                                     vocab_name.find('.bin.gz'),
                                     vocab_name.find('.vec.zip'))]
        tfrecord_dir = os.path.join(
            tfrecord_dir, tfrecord_dir_name + '_' + vocab_stem + suffix)
        tfrecord_dirs = [
            os.path.join(tfrecord_dir, dataset) for dataset in datasets
        ]
        for tf_dir in tfrecord_dirs:
            make_dir(tf_dir)
        assert [os.path.basename(tf_dir) for tf_dir in tfrecord_dirs] == \
               [os.path.basename(json_dir) for json_dir in json_dirs]
        merge_pretrain_write_tfrecord(
            json_dirs=json_dirs,
            tfrecord_dirs=tfrecord_dirs,
            merged_dir=tfrecord_dir,
            vocab_dir=vocab_dir,
            vocab_name=vocab_name,
            text_field_names=args['text_field_names'],
            label_field_name=args['label_field_name'],
            label_type=args.get('label_type', 'int'),
            max_document_length=args['max_document_length'],
            max_vocab_size=args['max_vocab_size'],
            min_frequency=args['min_frequency'],
            max_frequency=args['max_frequency'],
            train_ratio=args['train_ratio'],
            valid_ratio=args['valid_ratio'],
            subsample_ratio=args['subsample_ratio'],
            padding=args['padding'],
            write_bow=args['write_bow'],
            write_tfidf=args['write_tfidf'],
            tokenizer_=args['tokenizer'],
            stemmer=args['stemmer'],
            stopwords=args['stopwords'],
            expand_vocab=expand_vocab,
            pretrained_only=pretrained_only,
            preproc=preproc,
            vocab_all=vocab_all)

    return tfrecord_dir
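# How the pretrained-vocabulary directory suffix is derived above: the
# embedding file extension is stripped from the pretrained file name, and a
# suffix records how the pretrained vectors are used. The file name below is
# a hypothetical example; the suffix meanings are inferred from the flags.

vocab_name_example = "glove.6B.100d.txt"
vocab_stem_example = vocab_name_example[:max(vocab_name_example.find('.txt'),
                                             vocab_name_example.find('.bin.gz'),
                                             vocab_name_example.find('.vec.zip'))]
print(vocab_stem_example)  # glove.6B.100d

# suffix (inferred from the flag names):
#   _only   -> pretrained_only: use only the pretrained vocabulary
#   _expand -> expand_vocab: expand the dataset vocabulary with pretrained entries
#   _init   -> default: initialize embeddings from the pretrained file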