def article_iterator(encoder, final_desired_size=1025):
    """Iterate through the provided filename + tokenize"""
    assert os.path.exists(args.input_fn)
    with open(args.input_fn, 'r') as f:
        for l_no, l in enumerate(f):
            # Stride through the file so each fold processes a disjoint subset of lines
            if l_no % args.num_folds == args.fold:
                article = json.loads(l)
                article['input_ids'] = tokenize_for_grover_training(
                    encoder, article, desired_size=final_desired_size,
                    unconditional_prob=.35)
                article['inst_index'] = (l_no // args.num_folds)
                if article['inst_index'] < 100:
                    print('---\nINPUT{}. {}\n---\nTokens: {}\n'.format(
                        article['inst_index'],
                        detokenize(encoder, article['input_ids']),
                        article['input_ids']), flush=True)
                # Skip articles that tokenized to nothing
                if len(article['input_ids']) == 0:
                    continue
                yield article
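# The writer loop below relies on a create_int_feature helper that is not shown in
# this excerpt. A minimal sketch of what such a helper typically looks like follows;
# the exact signature and behavior here are an assumption, not the script's verbatim code.
def create_int_feature(values):
    """Wrap a list of ints in a tf.train.Feature (int64_list), as tf.train.Example expects."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))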
with S3TFRecordWriter(train_file) as train_writer, S3TFRecordWriter(val_file) as val_writer:
    for article in buffered_and_sliding_window_article_iterator(
            encoder,
            current_desired_size=args.max_seq_length + 1,
            final_desired_size=max(args.max_seq_length + 1, 1025)):
        # Route each article to the train or validation shard
        writer2use = train_writer if article['split'] == 'train' else val_writer
        # Each serialized example carries exactly max_seq_length + 1 token ids
        assert len(article['input_ids']) == (args.max_seq_length + 1)

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(article['input_ids'])
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer2use.write(tf_example.SerializeToString())
        total_written += 1

        # DEBUG
        if article['inst_index'] < 5:
            print("~~~\nSubindex{}. Index {}. ARTICLE: {}\n---\nTokens: {}\n\n".format(
                article['sub_index'], article['inst_index'],
                detokenize(encoder, article['input_ids']),
                article['input_ids']), flush=True)
        if article['inst_index'] % 1000 == 0:
            print("{} articles, {} written".format(article['inst_index'], total_written),
                  flush=True)
print("DONE UPLOADING", flush=True)
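# Sanity-check sketch (not part of the original script): read one record back to confirm
# that each serialized example holds exactly max_seq_length + 1 token ids. The function
# name, file path argument, and seq_length value are assumptions used only for illustration.
def check_tfrecord(path, seq_length):
    dataset = tf.data.TFRecordDataset(path)
    feature_spec = {"input_ids": tf.io.FixedLenFeature([seq_length + 1], tf.int64)}
    for raw_record in dataset.take(1):
        example = tf.io.parse_single_example(raw_record, feature_spec)
        print(example["input_ids"].shape)  # expect (seq_length + 1,)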