Example #1
def archive_to_tokens(f, encoder, args):
    # Generator that yields the contents of the files in an archive
    # if data_to_prepend is not None, prepend data_to_prepend + an EOS separator to the encoded data
    reader = Reader(f)
    for doc in reader.stream_data(threaded=False):
        if args.ftfy:  # fix text with ftfy if specified
            doc = ftfy.fix_text(doc, normalization='NFKC')
        doc = encoder.encode(doc) + args.separator  # read document from lmd and append separator token
        yield split_list(doc, args.chunk_size)  # split into n_ctx + 1 size chunks
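The `split_list` helper used throughout these examples is not shown. It is assumed to split a token list into consecutive fixed-size chunks, with the final chunk possibly shorter; a minimal sketch under that assumption:

def split_list(lst, n):
    # Split lst into consecutive chunks of at most n items each;
    # the final chunk may be shorter than n (assumed behaviour).
    return [lst[i:i + n] for i in range(0, len(lst), n)]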
Example #2
def archive_to_tokens(f, encoder, args):
    # Generator that yields the contents of the files in an archive
    # if data_to_prepend is not None, prepend data_to_prepend + an EOS separator to the encoded data
    reader = Reader(f)
    for doc in reader.stream_data(threaded=False):
        # doc = [int(i) for i in doc.split(", ")] + args.separator # for testing
        doc = encoder.encode(
            doc
        ) + args.separator  # read document from lmd and append separator token
        yield split_list(doc,
                         args.chunk_size)  # split into n_ctx + 1 size chunks
Example #3
def archive_to_tokens(f, encoder, args, prefix=[]):
    # Generator that yields the contents of the files in an archive
    # if data_to_prepend is not None, prepend data_to_prepend + an EOS separator to the encoded data
    reader = Reader(f)
    for file_doc in reader.stream_data(threaded=False):
        for doc in split_on_interior_eot(file_doc, encoder, disable=args.treat_eot_as_text):
            if args.normalize_with_ftfy:  # fix text with ftfy if specified
                doc = ftfy.fix_text(doc, normalization='NFKC')
            if args.normalize_with_wikitext_detokenize:
                doc = wikitext_detokenizer(doc)
            doc = encoder.encode(doc) + [encoder.eos_token_id]  # read document from lmd and append separator token
            yield split_list(prefix + doc, 2049)  # split into n_ctx + 1 size chunks
            prefix = []
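`split_on_interior_eot` and `wikitext_detokenizer` are project-specific helpers that are not shown here. A rough sketch of the splitting step only, assuming the end-of-text marker is the encoder's EOS string and that `disable=True` passes the document through untouched (the attribute name and fallback marker are assumptions):

def split_on_interior_eot(doc, encoder, disable=False):
    # Split a file's text on end-of-text markers embedded in the text itself,
    # so each piece is treated as its own document. If disabled, yield as-is.
    if disable:
        yield doc
        return
    eot = getattr(encoder, 'eos_token', '<|endoftext|>')  # assumed attribute / marker
    for piece in doc.split(eot):
        if piece.strip():
            yield piece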
Example #4
def archive_to_tokens(f, encoder, args):
    # Generator that yields the contents of the files in an archive
    # if data_to_prepend is not None, prepend data_to_prepend + an EOS separator to the encoded data
    reader = Reader(f)
    for doc in reader.stream_data(threaded=False):
        if args.ftfy: # fix text with ftfy if specified
            doc = ftfy.fix_text(doc, normalization='NFKC')
        
        # During training we will then just shift iids by 1 to get the labels
        # Hence all chunk_size + 1 tokens will be used; Sep Id only serves to separate documents
        doc = encoder.encode(doc, add_special_tokens=False) + args.sep_id # read document from lmd and append separator token

        yield split_list(doc, args.seq_len) # split into n_ctx + 1 size chunks
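The shifting mentioned in the comment is the standard causal-LM setup: from each stored chunk, the inputs and labels are the same ids offset by one position, so every token in the chunk gets used. A minimal illustration with made-up ids:

chunk = [10, 11, 12, 13, 14]   # one stored chunk of token ids
input_ids = chunk[:-1]         # [10, 11, 12, 13]
labels = chunk[1:]             # [11, 12, 13, 14]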
Example #5
def read(input_dir_or_file):
    """
    Read a poem from the final jsonl.zst object
    """
    rdr = Reader(input_dir_or_file)
    for doc in os.listdir("out"):
        poem = ""
        for l, meta in rdr.read_jsonl("out/{}".format(doc), get_meta=True):
            poem += l

        print('=====')
        print(meta)
        print(poem)
        print('=====')
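If the `Reader` here is lm_dataformat's, the jsonl.zst shards being read back were presumably written with the matching `Archive` class. A minimal writing sketch, assuming the lm_dataformat package (the text and metadata are illustrative):

from lm_dataformat import Archive

ar = Archive('out')                            # shards are written into ./out
ar.add_data('Roses are red, violets are blue...', meta={'source': 'example'})
ar.commit()                                    # flush a .jsonl.zst shard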
Example #6
def _archive_to_files(f):
    # Generator that yields the contents of the files in an archive
    g = Reader(f).stream_data(threaded=False)
    for s in g:
        yield BufferedEncodedStream(
            s, enc, [], not args.no_ftfy, args.minimum_size, text_mode=True
        ).read()
Example #7
def _gen():
    # Generator yielding the files inside the archive as encoded streams
    g = Reader(f).stream_data(threaded=False)
    for s in g:
        yield BufferedEncodedStream(
            s, encoder, separator, fix, minimum_size, text_mode=True
        )
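A possible way to consume `_gen()`, assuming (as Example #6 suggests) that calling `.read()` with no arguments returns the full encoded token stream for one file:

all_tokens = []
for stream in _gen():
    tokens = stream.read()   # full encoded contents of one file (assumed behaviour)
    all_tokens.extend(tokens)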
Example #8
archives = glob(str(data_path / f"*.{args.file_type}"))

out_path = Path(args.output_dir)

if os.path.exists(out_path):
    shutil.rmtree(out_path)

if not out_path.is_dir():
    out_path.mkdir()

    for arch in tqdm(archives):
        name = os.path.basename(arch).split(".")[0] + ".txt"
        fp = out_path / name

        if args.file_type == 'xz':
            g = Reader(arch).stream_data()

            with open(fp, "w") as f:
                for s in g:
                    f.write(s)
                    f.write("\n\n")
        elif args.file_type == 'txt':
            shutil.copyfile(str(arch), str(fp))

data_files = glob(str(out_path / "*.txt"))
data_files = random.sample(data_files, int(0.2 * len(data_files)))

assert len(data_files) > 0, 'No data files found'

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())
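Example #8 stops right after constructing the tokenizer; presumably the next step is to train it on the sampled text files. A minimal sketch using the Hugging Face tokenizers API (the pre-tokenizer choice, vocab size, and special tokens below are illustrative, not taken from the original script):

from tokenizers import trainers, pre_tokenizers

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=50257,                     # illustrative value
    special_tokens=['<|endoftext|>'],     # illustrative value
)
tokenizer.train(files=data_files, trainer=trainer)
tokenizer.save('byte-level-bpe.tokenizer.json')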
Example #9
def limit(x, num):
    try:
        for _ in range(num):
            yield next(x)
    except StopIteration:
        return


doc_ct = 100000

if __name__ == '__main__':
    num_owt = 0

    with open('fasttext_training.txt', 'w') as fout:
        print('Processing OWT data')
        wt_rdr = Reader('../openwebtext')
        for doc in tqdm(limit(wt_rdr.stream_data(), doc_ct)):
            fout.write('__label__owt ' + preprocess_for_fasttext(doc) + '\n')
            num_owt += 1

        print('Processed', num_owt, 'documents from OWT')
        
        print('Processing CC data')
        for doc in tqdm(limit(get_cc_docs(get_seg_urls(0)), doc_ct), total=doc_ct):
            fout.write('__label__cc ' + preprocess_for_fasttext(doc) + '\n')

    print('Training fasttext')
    model = fasttext.train_supervised(input="fasttext_training.txt")
    model.save_model('fasttext_filter.bin')
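The saved classifier can then be used to score new documents, e.g. to keep only Common Crawl text that the model thinks looks like OpenWebText. A minimal usage sketch (the threshold and the reuse of `preprocess_for_fasttext` are illustrative assumptions):

model = fasttext.load_model('fasttext_filter.bin')

def looks_like_owt(doc, threshold=0.9):                     # illustrative threshold
    text = preprocess_for_fasttext(doc).replace('\n', ' ')  # predict expects a single line
    labels, probs = model.predict(text, k=1)
    return labels[0] == '__label__owt' and probs[0] >= threshold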