Example #1
import sys

import lm_dataformat as lmd

# MIN_RECORD_NUMBER, MAX_RECORD_NUMBER, scrape_pdf_list, scrape_pdfs,
# process_pdfs, NetworkException and ScrapingException are defined elsewhere
# in the source project and are not shown in this excerpt.
def process(start=MIN_RECORD_NUMBER, end=MAX_RECORD_NUMBER, batch_size=2**8):
    # Resume from the last record index written by a previous run, if any.
    try:
        with open("counts.dat", "r") as cf:
            watermark = int(cf.readlines()[-1].rstrip()) + 1
    except (OSError, ValueError, IndexError):
        watermark = start

    # Reload any URLs collected by a previous run.
    try:
        with open("urls.dat", "r") as uf:
            url_list = [line.rstrip() for line in uf.readlines()]
    except OSError:
        url_list = []

    record_indices = list(range(max(start, watermark), end))

    with open("counts.dat", "a+") as cf, open("urls.dat", "a+") as uf:
        index_batches = chunks(record_indices, batch_size)
        for batch in index_batches:
            try:
                batch_index_list, batch_url_list = scrape_pdf_list(batch)
            except NetworkException:
                sys.exit("Error accessing records. Shutting down...")
            except ScrapingException:
                sys.exit(
                    "Error scraping records. May be due to rate-limiting. Please try again later, or with a proxy."
                )
            cf.writelines([item + '\n' for item in batch_index_list])
            uf.writelines([item + '\n' for item in batch_url_list])
            url_list.extend(batch_url_list)

    try:
        with open("processed.dat", "r") as pf:
            last_processed = pf.readlines()[-1].rstrip()
    except (OSError, IndexError):
        last_processed = None

    if last_processed:
        last_processed_index = url_list.index(last_processed)
        url_list = [
            line.rstrip() for line in url_list[last_processed_index + 1:]
        ]

    archive = lmd.Archive('out')

    with open("processed.dat", "a+") as pf:
        url_batches = chunks(url_list, batch_size)
        for batch in url_batches:
            print("Scraping PDF batch...")
            batch_pdf_list = scrape_pdfs(batch)
            print("Processing PDF batch...")
            process_pdfs(batch_pdf_list, archive)
            pf.writelines([item + '\n' for item in batch])
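
The `chunks` helper used above is defined elsewhere in the source project; a minimal sketch of what it is assumed to do (split a sequence into fixed-size batches) could look like this:

def chunks(items, batch_size):
    # Hypothetical helper, not part of the original snippet: yields successive
    # batches of at most `batch_size` items.
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]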
Example #2

import shutil

import lm_dataformat as lmd


def test_jsonl_paras():
    archive = lmd.Archive('test_dir')
    with open('test/blns.txt') as f:
        blns = f.read()
    archive.add_data(blns)
    archive.add_data(['testing 123', 'testing 345'], meta={'testing': 123})
    archive.add_data(blns, meta={'testing2': 456, 'testing': ['a', 'b']})
    archive.add_data('testing 123456789')
    archive.commit()

    reader = lmd.Reader('test_dir')

    data = list(reader.stream_data(get_meta=True))

    assert data[0] == (blns, {})
    assert data[1] == ('testing 123\n\ntesting 345', {'testing': 123})
    assert data[2] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
    assert data[3] == ('testing 123456789', {})
    shutil.rmtree('test_dir')
Example #3

    # Excerpt from a driver script: `args` presumably comes from argparse, and
    # filter_by_stars / split_into_chunks / process_repo_list are defined
    # elsewhere in the project (see Example #9 for process_repo_list).
    if args.n_stars != -1:
        repo_data = filter_by_stars(repo_data, args.n_stars)
    repo_data.sort()

    random.seed(420)
    random.shuffle(repo_data)

    n_threads = cpu_count() * 3 if args.n_threads == -1 else args.n_threads
    chunk_size = n_threads * 3 if args.chunk_size == -1 else args.chunk_size

    assert n_threads != 0

    # do work
    repo_chunks = split_into_chunks(repo_data, chunk_size)
    archive_name = 'github_data'
    ar = lmd.Archive(archive_name)
    pool = Pool(n_threads)
    pbar = tqdm(repo_chunks, total=len(repo_chunks))
    success_hist = []
    for count, chunk in enumerate(pbar):
        repos_out = pool.starmap(process_repo_list,
                                 zip(chunk, repeat(args.clone_timeout), repeat(args.processing_timeout)))
        not_none = 0
        none = 0
        for repo in repos_out:
            if repo is not None:
                not_none += 1
                for f in repo:
                    ar.add_data(f[0], meta=f[1])
            else:
                none += 1
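
`filter_by_stars` and `split_into_chunks` are helpers from the source project and are not shown here; a minimal sketch of the star filter, assuming the (name, stars, language) tuples used in Example #9 with integer star counts, might be:

def filter_by_stars(repo_data, n_stars):
    # Hypothetical sketch: keep only repos with at least `n_stars` stars.
    return [repo for repo in repo_data if repo[1] >= n_stars]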
Example #4
    return date, chan, res  # presumably the tail of get_logs_for(); the rest is not shown


pool = mp.Pool(20)
pool2 = mp.Pool(20)


def documents():
    dates = list(datesbetween(start, end))
    for date, channels in tqdm(pool2.imap(channels_on_day, dates),
                               total=len(dates)):
        for chan in channels:
            yield (date, chan)


# `logs` is assumed to be a collections.defaultdict(str) keyed by (channel, month).
for date, chan, content in pool.imap(get_logs_for, documents()):
    if len(exclude_system(content)) > 0:
        content = clean(exclude_select_system(content))
        #print(date, chan)
        logs[(chan, date.month)] += f'#{chan} {date.year}-{date.month:02d}-{date.day:02d}\n'
        logs[(chan, date.month)] += content
        #if chan == 'ubuntu': print(content)

ar = lmd.Archive('out')

for (chan, month), content in logs.items():
    ar.add_data(content, meta={'channel': chan, 'month': month})

ar.commit()
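
`datesbetween`, `channels_on_day`, `get_logs_for`, `clean`, and the `exclude_*` filters are project-specific helpers not shown here; a minimal sketch of `datesbetween`, assuming `start` and `end` are `datetime.date` objects, could be:

from datetime import timedelta

def datesbetween(start, end):
    # Hypothetical sketch: yield each date from `start` through `end`, inclusive.
    current = start
    while current <= end:
        yield current
        current += timedelta(days=1)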
Example #5
        # Presumably the tail of process_doc(); the start of the function is not shown.
        if r < ptr:
            raise AssertionError()
    
    result.append(txt[ptr:])

    result = list(filter(lambda x: len(x) > 200, result))

    if len(result) > 10:
        return []

    return result


chunk_docs = 50000


dsets = [
    ('output_pile', '00.jsonl.zst'),
    ('output_owt', '/data/datasets/openwebtext'),
]

for outdir, source in dsets:
    ar = lmd.Archive(outdir)
    for i, doc in enumerate(tqdm(lmd.Reader(source).stream_data())):
        for piece in process_doc(doc):
            ar.add_data(piece)

        if (i + 1) % chunk_docs == 0:
            ar.commit()

    ar.commit()
        
Example #6

    # Method excerpt: the enclosing class keeps one lmd.Archive per language in
    # self.ars and tracks per-language document counts.
    def write_doc(self, doc, meta):
        lang = meta['primary_language']
        if lang not in self.ars:
            self.ars[lang] = lmd.Archive(f'output/{lang}', compression_level=7)
        self.ct_by_lang[lang] += 1
        self.total_docs += 1
Example #7
def dl(x):
    # `curl` (a download helper from the surrounding project) fetches the URL in x[0].
    try:
        crl = curl(x[0])
        if not crl:
            return None
        #print('downloaded', x[0])
        return (crl, x[1])
    except Exception:
        return None

pool = mp.Pool(128)

from tqdm import tqdm

# The `>>` pipeline operators below (filter, each, do, chunks, join, X,
# pointwise, swallow_errors, thread_preload) are presumably from a functional
# pipeline helper library used by the source project, not the standard library.
pairs = pool.imap(dl, dl_imgs()) >> filter(id)
batch_size = 256

pairs = pairs >> filter(X[0]) >> each(pointwise(BytesIO, id)) >> each(pointwise(swallow_errors(Image.open), id)) >> filter(X[0])

imlats = pairs >> do(partial(thread_preload, buffer_size=100000)) >> chunks(batch_size) >> each(lambda x: zip(*x)) >> each(pointwise(clip_encode_img, id)) >> each(lambda x: zip(*x)) >> join()

import lm_dataformat as lmd

ar = lmd.Archive(f"chunk{ind}")

for imlat, text in imlats >> do(partial(tqdm, total=500000)):
    ar.add_data(text, imlat)

ar.commit()

sh(f"mv chunk{ind}/*.jsonl.zst clip_latents_chunk{ind}.jsonl.zst")
Example #8
          # Excerpt from inside one of the extract_* functions driven below;
          # the enclosing loops over years and documents are not shown.
          classifications = metadata['classifications']
          classifications.pop('classifications_ipcr_list', None)
          metadata['classifications'] = classifications
        text_list = None
        if 'detailed_description' not in datum:
          continue
        section = datum['detailed_description']
        # List out the sub-headings within the detailed description.
        subheadings = list(section.keys())

        # Background section may have a variable name
        background_headings = [tag for tag in subheadings if tag and 'BACKGROUND' in tag]

        if len(background_headings) < 1:
          continue
        else:
          background_heading = background_headings[0]
          text_list = section[background_heading]

        # Occasionally, you'll come across empty sections.
        if len(text_list) > 0:
          text = '\n'.join(text_list)
          archive.add_data(text, meta=metadata)
    archive.commit(archive_name=str(year))
  return archive

archive = lmd.Archive('out')
archive = extract_pre_2002(archive)
archive = extract_2002_to_2004(archive)
archive = extract_post_2004(archive)
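
Once the yearly archives are committed, the output can be streamed back with `lmd.Reader`, as in Example #2. An illustrative consumer of the 'out' directory:

import lm_dataformat as lmd

reader = lmd.Reader('out')
for text, meta in reader.stream_data(get_meta=True):
    # Each record is a background section plus the patent metadata attached above.
    print(len(text), meta.get('classifications'))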
Example #9
def process_repo_list(repo_data, archive_name='github_data'):
    # Assumes module-level imports of os, shutil, tqdm and lm_dataformat (lmd),
    # plus a python-magic instance `mime` (e.g. magic.Magic(mime=True)) and a
    # get_content helper (see the sketch after this example).
    ar = lmd.Archive(archive_name)

    for i, repo in enumerate(tqdm(repo_data)):
        name, stars, lang = repo
        meta = {'repo_name': name, 'stars': stars, 'repo_language': lang}
        repodir = f'./.tmp/{name.split("/")[-1]}'
        os.system(f'git clone --depth 1 --single-branch https://github.com/{name} {repodir}')
        shutil.rmtree(f'{repodir}/.git', ignore_errors=True)

        # File extensions to skip (binaries, media, archives, lockfiles, etc.).
        bad_extensions = [
            'app', 'bin', 'bmp', 'bz2', 'class', 'csv', 'dat', 'db', 'dll',
            'dylib', 'egg', 'eot', 'exe', 'gif', 'gitignore', 'glif', 'gradle',
            'gz', 'ico', 'jar', 'jpeg', 'jpg', 'lo', 'lock', 'log', 'mp3',
            'mp4', 'nar', 'o', 'ogg', 'otf', 'p', 'pdf', 'png', 'pickle',
            'pkl', 'pyc', 'pyd', 'pyo', 'rkt', 'so', 'ss', 'svg', 'tar',
            'tsv', 'ttf', 'war', 'webm', 'woff', 'woff2', 'xz', 'zip', 'zst'
        ]

        for curdir, dirs, files in os.walk(repodir):

            # Skip hidden files, licenses, minified assets, vendored modules and
            # files with blacklisted extensions.
            files = [
                curdir + '/' + f for f in files
                if '.git' not in f
                and not f.startswith('.')
                and 'LICENSE' not in f
                and 'node_modules' not in f
                and '.min.' not in f
                and f.split('.')[-1] not in bad_extensions
            ]

            filenames = [f.split("/")[-1] for f in files]
            extensions = [mime.from_file(f) for f in files]
            text_outputs = list(map(get_content, files))
            # Use a separate index so the outer repo counter `i` (used for the
            # periodic commits below) is not clobbered.
            for j in range(len(files)):
                text = text_outputs[j]
                if text is not None:
                    meta['file_name'] = filenames[j]
                    meta['mime_type'] = extensions[j]

                    ar.add_data(text, meta)

        shutil.rmtree(repodir, ignore_errors=True)
        if (i + 1) % 100 == 0:
            ar.commit()

    ar.commit()
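
`get_content` is defined elsewhere in the source project; a plausible minimal sketch that returns None for unreadable or non-text files, as the loop above expects, is:

def get_content(path):
    # Hypothetical sketch: read a file as UTF-8 text, returning None when the
    # file cannot be opened or decoded (e.g. leftover binaries).
    try:
        with open(path, 'rb') as f:
            return f.read().decode('utf-8')
    except (OSError, UnicodeDecodeError):
        return None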