def archive_to_tokens(f, encoder, args):
    # Generator that yields the contents of the files in an archive
    # if data_to_prepend is not None, prepend data_to_prepend + an EOS separator to the encoded data
    reader = Reader(f)
    for doc in reader.stream_data(threaded=False):
        if args.ftfy:  # fix text with ftfy if specified
            doc = ftfy.fix_text(doc, normalization='NFKC')
        doc = encoder.encode(doc) + args.separator  # read document from lmd and append separator token
        yield split_list(doc, args.chunk_size)  # split into n_ctx + 1 size chunks
def archive_to_tokens(f, encoder, args):
    # Generator that yields the contents of the files in an archive
    # if data_to_prepend is not None, prepend data_to_prepend + an EOS separator to the encoded data
    reader = Reader(f)
    for doc in reader.stream_data(threaded=False):
        # doc = [int(i) for i in doc.split(", ")] + args.separator  # for testing
        doc = encoder.encode(doc) + args.separator  # read document from lmd and append separator token
        yield split_list(doc, args.chunk_size)  # split into n_ctx + 1 size chunks
def archive_to_tokens(f, encoder, args, prefix=[]):
    # Generator that yields the contents of the files in an archive
    # if data_to_prepend is not None, prepend data_to_prepend + an EOS separator to the encoded data
    reader = Reader(f)
    for file_doc in reader.stream_data(threaded=False):
        for doc in split_on_interior_eot(file_doc, encoder, disable=args.treat_eot_as_text):
            if args.normalize_with_ftfy:  # fix text with ftfy if specified
                doc = ftfy.fix_text(doc, normalization='NFKC')
            if args.normalize_with_wikitext_detokenize:
                doc = wikitext_detokenizer(doc)
            doc = encoder.encode(doc) + [encoder.eos_token_id]  # read document from lmd and append separator token
            yield split_list(prefix + doc, 2049)  # split into n_ctx + 1 size chunks
            prefix = []
def archive_to_tokens(f, encoder, args):
    # Generator that yields the contents of the files in an archive
    # if data_to_prepend is not None, prepend data_to_prepend + an EOS separator to the encoded data
    reader = Reader(f)
    for doc in reader.stream_data(threaded=False):
        if args.ftfy:  # fix text with ftfy if specified
            doc = ftfy.fix_text(doc, normalization='NFKC')
        # During training we then just shift the input ids by 1 to get the labels.
        # Hence all chunk_size + 1 tokens are used; the sep id only serves to separate documents.
        doc = encoder.encode(doc, add_special_tokens=False) + args.sep_id  # read document from lmd and append separator token
        yield split_list(doc, args.seq_len)  # split into n_ctx + 1 size chunks
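# For reference: the split_list helper called by the archive_to_tokens variants above is not
# shown in this section. A minimal sketch of what it is assumed to do (chunk a token list
# into fixed-size pieces, with a possibly shorter final chunk):
def split_list(l, n):
    # Split list l into chunks of at most n items each.
    return [l[i:i + n] for i in range(0, len(l), n)]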
def read(input_dir_or_file):
    """ Read a poem from the final jsonl.zst object """
    rdr = Reader(input_dir_or_file)
    for doc in os.listdir("out"):
        poem = ""
        for l, meta in rdr.read_jsonl("out/{}".format(doc), get_meta=True):
            poem += l
        print('=====')
        print(meta)
        print(poem)
        print('=====')
def _archive_to_files(f):
    # Generator that yields the contents of the files in an archive
    g = Reader(f).stream_data(threaded=False)
    for s in g:
        yield BufferedEncodedStream(s, enc, [], not args.no_ftfy, args.minimum_size, text_mode=True).read()
def _gen():
    # Generator yielding the files inside the archive as encoded streams
    g = Reader(f).stream_data(threaded=False)
    for s in g:
        yield BufferedEncodedStream(s, encoder, separator, fix, minimum_size, text_mode=True)
archives = glob(str(data_path / f"*.{args.file_type}"))

out_path = Path(args.output_dir)
if os.path.exists(out_path):
    shutil.rmtree(out_path)
if not out_path.is_dir():
    out_path.mkdir()

for arch in tqdm(archives):
    name = os.path.basename(arch).split(".")[0] + ".txt"
    fp = out_path / name

    if args.file_type == 'xz':
        g = Reader(arch).stream_data()
        with open(fp, "w") as f:
            for s in g:
                f.write(s)
                f.write("\n\n")
    elif args.file_type == 'txt':
        shutil.copyfile(str(arch), str(fp))

data_files = glob(str(out_path / "*.txt"))
data_files = random.sample(data_files, int(0.2 * len(data_files)))
assert len(data_files) > 0, 'No data files found'

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())
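# A hedged sketch of how the tokenizer initialized above might then be trained on
# data_files with the Hugging Face `tokenizers` library. The pre-tokenizer choice,
# vocab_size, and special token below are illustrative assumptions, not taken from
# the original script.
from tokenizers import pre_tokenizers, decoders, trainers

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=52000,                   # assumed vocabulary size
    special_tokens=["<|endoftext|>"],   # assumed document separator token
)
tokenizer.train(files=data_files, trainer=trainer)  # train on the sampled .txt files
tokenizer.save("tokenizer.json")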
def limit(x, num):
    try:
        for _ in range(num):
            yield next(x)
    except StopIteration:
        return


doc_ct = 100000

if __name__ == '__main__':
    num_owt = 0
    with open('fasttext_training.txt', 'w') as fout:
        print('Processing OWT data')
        wt_rdr = Reader('../openwebtext')
        for doc in tqdm(limit(wt_rdr.stream_data(), doc_ct)):
            fout.write('__label__owt ' + preprocess_for_fasttext(doc) + '\n')
            num_owt += 1
        print('Processed', num_owt, 'documents from OWT')

        print('Processing CC data')
        for doc in tqdm(limit(get_cc_docs(get_seg_urls(0)), doc_ct), total=doc_ct):
            fout.write('__label__cc ' + preprocess_for_fasttext(doc) + '\n')

    print('Training fasttext')
    model = fasttext.train_supervised(input="fasttext_training.txt")
    model.save_model('fasttext_filter.bin')
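# A brief, hedged sketch of how the trained classifier saved above could be applied as a
# filter; the helper name and the 0.9 threshold are illustrative assumptions, not part of
# the original script.
import fasttext

model = fasttext.load_model('fasttext_filter.bin')

def looks_like_owt(text, threshold=0.9):
    # fasttext predicts on a single line of text, so strip newlines first
    labels, probs = model.predict(text.replace('\n', ' '))
    # keep documents the classifier confidently labels as OpenWebText-like
    return labels[0] == '__label__owt' and probs[0] >= threshold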