def process(start=MIN_RECORD_NUMBER, end=MAX_RECORD_NUMBER, batch_size=2**8):
    # Resume from the last record index written to counts.dat, if any.
    try:
        with open("counts.dat", "r") as cf:
            watermark = int(cf.readlines()[-1].rstrip()) + 1
    except (OSError, IndexError, ValueError):
        watermark = start

    # Load any URLs collected on previous runs.
    try:
        with open("urls.dat", "r") as uf:
            url_list = [line.rstrip() for line in uf.readlines()]
    except OSError:
        url_list = []

    record_indices = list(range(max(start, watermark), end))

    with open("counts.dat", "a+") as cf, open("urls.dat", "a+") as uf:
        index_batches = chunks(record_indices, batch_size)
        for batch in index_batches:
            try:
                batch_index_list, batch_url_list = scrape_pdf_list(batch)
            except NetworkException:
                sys.exit("Error accessing records. Shutting down...")
            except ScrapingException:
                sys.exit(
                    "Error scraping records. May be due to rate-limiting. "
                    "Please try again later, or with a proxy."
                )
            cf.writelines([item + '\n' for item in batch_index_list])
            uf.writelines([item + '\n' for item in batch_url_list])
            url_list.extend(batch_url_list)

    # Skip URLs that were already processed on a previous run.
    try:
        with open("processed.dat", "r") as pf:
            last_processed = pf.readlines()[-1].rstrip()
    except (OSError, IndexError):
        last_processed = None
    if last_processed:
        last_processed_index = url_list.index(last_processed)
        url_list = [line.rstrip() for line in url_list[last_processed_index + 1:]]

    archive = lmd.Archive('out')
    with open("processed.dat", "a+") as pf:
        url_batches = chunks(url_list, batch_size)
        for batch in url_batches:
            print("Scraping PDF batch...")
            batch_pdf_list = scrape_pdfs(batch)
            print("Processing PDF batch...")
            process_pdfs(batch_pdf_list, archive)
            pf.writelines([item + '\n' for item in batch])
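# The `chunks()` helper that `process()` relies on is not shown in this excerpt.
# A minimal sketch of what the calls above assume (splitting a sequence into pieces
# of at most `batch_size` items); the implementation details here are an assumption:
def chunks(seq, size):
    """Yield successive slices of `seq` containing at most `size` items."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]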
def test_jsonl_paras():
    archive = lmd.Archive('test_dir')
    blns = open('test/blns.txt').read()
    archive.add_data(blns)
    archive.add_data(['testing 123', 'testing 345'], meta={'testing': 123})
    archive.add_data(blns, meta={'testing2': 456, 'testing': ['a', 'b']})
    archive.add_data('testing 123456789')
    archive.commit()

    reader = lmd.Reader('test_dir')
    data = list(reader.stream_data(get_meta=True))

    assert data[0] == (blns, {})
    assert data[1] == ('testing 123\n\ntesting 345', {'testing': 123})
    assert data[2] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
    assert data[3] == ('testing 123456789', {})

    shutil.rmtree('test_dir')
if args.n_stars != -1:
    repo_data = filter_by_stars(repo_data, args.n_stars)
repo_data.sort()

random.seed(420)
random.shuffle(repo_data)

n_threads = cpu_count() * 3 if args.n_threads == -1 else args.n_threads
chunk_size = n_threads * 3 if args.chunk_size == -1 else args.chunk_size
assert n_threads != 0

# do work
repo_chunks = split_into_chunks(repo_data, chunk_size)
archive_name = 'github_data'
ar = lmd.Archive(archive_name)
pool = Pool(n_threads)
pbar = tqdm(repo_chunks, total=len(repo_chunks))
success_hist = []
for count, chunk in enumerate(pbar):
    repos_out = pool.starmap(
        process_repo_list,
        zip(chunk, repeat(args.clone_timeout), repeat(args.processing_timeout)),
    )
    not_none = 0
    none = 0
    for repo in repos_out:
        if repo is not None:
            not_none += 1
            for f in repo:
                ar.add_data(f[0], meta=f[1])
        else:
            none += 1
    return date, chan, res


pool = mp.Pool(20)
pool2 = mp.Pool(20)


def documents():
    dates = list(datesbetween(start, end))
    for date, channels in tqdm(pool2.imap(channels_on_day, dates), total=len(dates)):
        for chan in channels:
            yield (date, chan)


for date, chan, content in pool.imap(get_logs_for, documents()):
    if len(exclude_system(content)) > 0:
        content = clean(exclude_select_system(content))
        logs[(chan, date.month)] += f'#{chan} {date.year}-{date.month:02d}-{date.day:02d}\n'
        logs[(chan, date.month)] += content

ar = lmd.Archive('out')
for (chan, month), content in logs.items():
    ar.add_data(content, meta={'channel': chan, 'month': month})
ar.commit()
    if r < ptr:
        raise AssertionError()
    result.append(txt[ptr:])
    result = list(filter(lambda x: len(x) > 200, result))
    if len(result) > 10:
        return []
    return result


chunk_docs = 50000

dsets = [
    ('output_pile', '00.jsonl.zst'),
    ('output_owt', '/data/datasets/openwebtext'),
]

for outdir, source in dsets:
    ar = lmd.Archive(outdir)
    for i, doc in enumerate(tqdm(lmd.Reader(source).stream_data())):
        for piece in process_doc(doc):
            ar.add_data(piece)
        if (i + 1) % chunk_docs == 0:
            ar.commit()
    ar.commit()
def write_doc(self, doc, meta):
    lang = meta['primary_language']
    if lang not in self.ars:
        self.ars[lang] = lmd.Archive(f'output/{lang}', compression_level=7)
    self.ct_by_lang[lang] += 1
    self.total_docs += 1
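# `write_doc` belongs to a writer class that is not shown in this excerpt. A minimal
# sketch of the state it assumes (one archive per language plus simple counters);
# the class name and `__init__` below are hypothetical:
from collections import Counter

import lm_dataformat as lmd


class PerLanguageWriter:
    def __init__(self):
        self.ars = {}                # one lmd.Archive per primary_language
        self.ct_by_lang = Counter()  # documents written, keyed by language
        self.total_docs = 0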
def dl(x):
    try:
        crl = curl(x[0])
        if not crl:
            return None
        return (crl, x[1])
    except Exception:
        return None


pool = mp.Pool(128)

from tqdm import tqdm

pairs = pool.imap(dl, dl_imgs()) >> filter(id)

batch_size = 256

pairs = (
    pairs
    >> filter(X[0])
    >> each(pointwise(BytesIO, id))
    >> each(pointwise(swallow_errors(Image.open), id))
    >> filter(X[0])
)

imlats = (
    pairs
    >> do(partial(thread_preload, buffer_size=100000))
    >> chunks(batch_size)
    >> each(lambda x: zip(*x))
    >> each(pointwise(clip_encode_img, id))
    >> each(lambda x: zip(*x))
    >> join()
)

import lm_dataformat as lmd

ar = lmd.Archive(f"chunk{ind}")
for imlat, text in imlats >> do(partial(tqdm, total=500000)):
    ar.add_data(text, imlat)
ar.commit()

sh(f"mv chunk{ind}/*.jsonl.zst clip_latents_chunk{ind}.jsonl.zst")
        classifications = metadata['classifications']
        classifications.pop('classifications_ipcr_list', None)
        metadata['classifications'] = classifications

        text_list = None
        if 'detailed_description' not in datum:
            continue
        section = datum['detailed_description']

        # List out the sub-headings within the detailed description.
        subheadings = list(section.keys())

        # Background section may have a variable name
        background_headings = [tag for tag in subheadings if tag and 'BACKGROUND' in tag]
        if len(background_headings) < 1:
            continue
        else:
            background_heading = background_headings[0]
            text_list = section[background_heading]

        # Occasionally, you'll come across empty sections.
        if len(text_list) > 0:
            text = '\n'.join(text_list)
            archive.add_data(text, meta=metadata)

    archive.commit(archive_name=str(year))
    return archive


archive = lmd.Archive('out')
archive = extract_pre_2002(archive)
archive = extract_2002_to_2004(archive)
archive = extract_post_2004(archive)
def process_repo_list(repo_data, archive_name='github_data'):
    ar = lmd.Archive(archive_name)

    # Extensions that are almost certainly binary or otherwise not useful as text.
    bad_extensions = [
        'app', 'bin', 'bmp', 'bz2', 'class', 'csv', 'dat', 'db', 'dll', 'dylib',
        'egg', 'eot', 'exe', 'gif', 'gitignore', 'glif', 'gradle', 'gz', 'ico',
        'jar', 'jpeg', 'jpg', 'lo', 'lock', 'log', 'mp3', 'mp4', 'nar', 'o',
        'ogg', 'otf', 'p', 'pdf', 'png', 'pickle', 'pkl', 'pyc', 'pyd', 'pyo',
        'rkt', 'so', 'ss', 'svg', 'tar', 'tsv', 'ttf', 'war', 'webm', 'woff',
        'woff2', 'xz', 'zip', 'zst'
    ]

    for i, repo in enumerate(tqdm(repo_data)):
        name, stars, lang = repo
        meta = {'repo_name': name, 'stars': stars, 'repo_language': lang}
        repodir = f'./.tmp/{name.split("/")[-1]}'
        os.system(f'git clone --depth 1 --single-branch https://github.com/{name} {repodir}')
        shutil.rmtree(f'{repodir}/.git', ignore_errors=True)

        for curdir, dirs, files in os.walk(repodir):
            files = [
                curdir + '/' + f for f in files
                if '.git' not in f
                and f[0] != '.'
                and 'LICENSE' not in f
                and 'node_modules' not in f
                and '.min.' not in f
                and f.split('.')[-1] not in bad_extensions
            ]
            filenames = [f.split("/")[-1] for f in files]
            mime_types = [mime.from_file(f) for f in files]
            text_outputs = list(map(get_content, files))

            for j, text in enumerate(text_outputs):
                if text is not None:
                    # Build a fresh per-file meta dict instead of mutating the shared one.
                    file_meta = dict(meta, file_name=filenames[j], mime_type=mime_types[j])
                    ar.add_data(text, meta=file_meta)

        shutil.rmtree(repodir, ignore_errors=True)
        if (i + 1) % 100 == 0:
            ar.commit()
    ar.commit()
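# `get_content` and the `mime` object are not defined in this excerpt; `mime` is
# presumably a python-magic `magic.Magic(mime=True)` instance, and `get_content`
# something that returns decoded file text or None for files to skip. A rough sketch
# under those assumptions; the size cap and binary check are illustrative only:
import os

import magic

mime = magic.Magic(mime=True)


def get_content(path, max_size=1_000_000):
    """Return decoded file text, or None if the file looks binary or too large."""
    try:
        if os.path.getsize(path) > max_size:
            return None
        with open(path, 'rb') as f:
            raw = f.read()
        if b'\x00' in raw:  # crude binary sniff
            return None
        return raw.decode('utf-8', errors='ignore')
    except OSError:
        return None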