def __init__(self, captions_path, spectrograms_path, lazy=False):
    """Pair an LMD caption stream with a spectrogram dataset.

    Warning: the lazy path does not check whether the caption metadata
    links it to the spectrogram. It assumes that the spectrogram data,
    read from the files from the path in sorted order, loaded in as
    tensors, follows the exact same ordering as the LMD-encoded captions.
    """
    self.lazy = lazy
    if lazy:
        self.captions = lmd.Reader(captions_path).stream_data(get_meta=False)
        self.spectrograms = SpectrogramLazyDataset(spectrograms_path)
    else:
        self.captions = lmd.Reader(captions_path).stream_data(get_meta=True)
        self.spectrograms = SpectrogramDataset(spectrograms_path)
def documents(self):
    """Return an iterator of (text, meta) pairs with remove_advertisement
    applied to the text of each record."""
    self._download()
    reader = lmd.Reader('components/openwebtext2/openwebtext2.jsonl.zst.tar')
    return ((remove_advertisement(text), meta)
            for text, meta in reader.stream_data(get_meta=True))
def documents(self):
    """Return an iterator of (text, meta) pairs whose text is shorter
    than 100,000 characters."""
    self._download()
    reader = lmd.Reader('components/github/github.jsonl.zst.tar')
    return (record for record in reader.stream_data(get_meta=True)
            if len(record[0]) < 100000)
def documents(self):
    """Yield each PubMed Central extract with strip_markdown_colons applied."""
    self._download()
    reader = lmd.Reader('components/pubmedcentral/PMC_extracts.tar.gz')
    for doc in reader.stream_data():
        yield strip_markdown_colons(doc)
def documents(self):
    """Yield each OpenWebText2 document with remove_advertisement applied."""
    self._download()
    reader = lmd.Reader('components/openwebtext2/openwebtext2.jsonl.zst.tar')
    for doc in reader.stream_data():
        yield remove_advertisement(doc)
def test_tgz_read():
    """A .tar.gz archive holding one file streams exactly that file's text."""
    reader = lmd.Reader('test/blns.txt.tar.gz')
    blns = open('test/blns.txt').read()
    data = list(reader.stream_data(get_meta=False))
    assert data[0] == blns
    assert len(data) == 1
def test_jsonl_tar():
    """Streaming a .jsonl.zst.tar archive yields the 8 (text, meta) records,
    i.e. the 4-record fixture repeated twice, in order.
    """
    # Context manager so the fixture file handle is closed deterministically
    # (the original leaked it via open(...).read()).
    with open('test/blns.txt') as fh:
        blns = fh.read()
    reader = lmd.Reader('test/blns.jsonl.zst.tar')
    data = list(reader.stream_data(get_meta=True))
    # The archive contains the same 4 records twice.
    expected = [
        (blns, {}),
        ('testing 123\n\ntesting 345', {'testing': 123}),
        (blns, {'testing2': 456, 'testing': ['a', 'b']}),
        ('testing 123456789', {}),
    ]
    for i, record in enumerate(expected * 2):
        assert data[i] == record
def test_jsonl_paras():
    """Round-trip: data written via Archive.add_data (including list-of-paras
    and per-record meta) reads back identically through Reader.

    Raises AssertionError if any record differs.
    """
    archive = lmd.Archive('test_dir')
    # Context manager so the fixture file handle is closed deterministically
    # (the original leaked it via open(...).read()).
    with open('test/blns.txt') as fh:
        blns = fh.read()
    try:
        archive.add_data(blns)
        # A list of paragraphs is joined with a blank line on read-back.
        archive.add_data(['testing 123', 'testing 345'], meta={'testing': 123})
        archive.add_data(blns, meta={'testing2': 456, 'testing': ['a', 'b']})
        archive.add_data('testing 123456789')
        archive.commit()
        reader = lmd.Reader('test_dir')
        data = list(reader.stream_data(get_meta=True))
        assert data[0] == (blns, {})
        assert data[1] == ('testing 123\n\ntesting 345', {'testing': 123})
        assert data[2] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
        assert data[3] == ('testing 123456789', {})
    finally:
        # Clean up even on assertion failure so later test runs don't see a
        # stale test_dir (the original left it behind on failure).
        shutil.rmtree('test_dir')
def test_json():
    """Round-trip: data written via JSONArchive reads back identically
    (text only, no meta) through Reader.

    Raises AssertionError if any record differs.
    """
    archive = lmd.JSONArchive('test_dir')
    # Context manager so the fixture file handle is closed deterministically
    # (the original leaked it via open(...).read()).
    with open('test/blns.txt') as fh:
        blns = fh.read()
    try:
        archive.add_data(blns)
        archive.add_data('testing 123')
        archive.add_data(blns)
        archive.add_data('testing 123456789')
        archive.commit()
        reader = lmd.Reader('test_dir')
        data = list(reader.stream_data())
        assert data[0] == blns
        assert data[1] == 'testing 123'
        assert data[2] == blns
        assert data[3] == 'testing 123456789'
    finally:
        # Clean up even on assertion failure so later test runs don't see a
        # stale test_dir (the original left it behind on failure).
        shutil.rmtree('test_dir')
def compute_perplexity_data(model, data_path, indices=None):
    """Accumulate per-document perplexity data over an LMD dataset.

    Args:
        model: object exposing get_perplexity_data(doc) -> dict with keys
            "logprobs", "positions", "length", "utf8_length" (falsy to skip).
        data_path: path readable by lm_dataformat.Reader.
        indices: optional collection of document indices to include; all
            documents are processed when None.

    Returns:
        dict with lists "all_logprobs"/"all_positions" and running totals
        "aggregate_length"/"aggregate_utf8_length".
    """
    # For expedience, we're going to assume everything fits in memory for now
    # Also for expedience we're just going to save lists of arrays
    overall_output = {
        "all_logprobs": [],
        "all_positions": [],
        "aggregate_length": 0,
        "aggregate_utf8_length": 0.,
    }
    # Hoist membership into a set: `i not in indices` on a list is O(len)
    # per document, which is quadratic over a large dataset.
    wanted = None if indices is None else set(indices)
    reader = lm_dataformat.Reader(data_path)
    for i, doc in enumerate(tqdm_lib.tqdm(reader.stream_data())):
        if wanted is not None and i not in wanted:
            continue
        output = model.get_perplexity_data(doc)
        if not output:
            # Model declined this document (falsy result) — skip it.
            continue
        overall_output["all_logprobs"].append(output["logprobs"])
        overall_output["all_positions"].append(output["positions"])
        overall_output["aggregate_length"] += output["length"]
        overall_output["aggregate_utf8_length"] += output["utf8_length"]
    return overall_output
def documents(self):
    """Yield documents from the EuroParl component."""
    self._download()
    path = 'components/europarl/EuroParliamentProceedings_1996_2011.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the PhilPapers component."""
    self._download()
    path = 'components/philpapers/PhilArchive.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the USPTO component."""
    self._download()
    path = 'components/uspto/pile_uspto.jsonl.zst.tar'
    yield from lmd.Reader(path).stream_data()
def meta_items():
    """Stream (text, meta) records from `f` and map them through `analyze`
    on the worker pool.

    NOTE(review): `f`, `pool`, and `analyze` come from the enclosing scope.
    """
    reader = lmd.Reader(f)
    return pool.imap(analyze, reader.stream_data(get_meta=True))
def documents(self):
    """Yield documents from the CZIC component."""
    self._download()
    path = 'components/czic/GOVINFO_CZIC_KL.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the OpenWebText component."""
    self._download()
    path = 'components/openwebtext/openwebtext'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the FreeLaw component."""
    self._download()
    path = 'components/freelaw/FreeLaw_Opinions.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the YouTube subtitles component."""
    self._download()
    path = 'components/youtubesubtitles/yt_subs.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield GitHub documents shorter than 100,000 characters."""
    self._download()
    reader = lmd.Reader('components/github/github.jsonl.zst.tar')
    for doc in reader.stream_data():
        if len(doc) < 100000:
            yield doc
def documents(self):
    """Yield documents from the filtered Common Crawl component."""
    self._download()
    path = 'components/commoncrawl/pile_cc_filtered.jsonl.zst.tar'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the Ubuntu IRC component."""
    self._download()
    path = 'components/ubuntu_irc/ubuntu_irc_until_2020_9_1.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the CORD-19 component."""
    self._download()
    path = 'components/cord19/out'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the Literotica component."""
    self._download()
    path = 'components/literotica/Literotica.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the Enron emails component."""
    self._download()
    path = 'components/enron_emails/out'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the Hacker News component."""
    self._download()
    path = 'components/hackernews/hn.tar.gz'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the NIH ExPORTER component."""
    self._download()
    path = 'components/exporter/NIH_ExPORTER_awarded_grant_text.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the arXiv component."""
    self._download()
    path = 'components/arxiv/arxiv.jsonl.zst'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the Stack Exchange component."""
    self._download()
    path = 'components/stackexchange/out'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the OpenSubtitles component."""
    self._download()
    path = 'components/opensubtitles/out'
    yield from lmd.Reader(path).stream_data()
def documents(self):
    """Yield documents from the PubMed abstracts component."""
    self._download()
    path = 'components/pubmed/PUBMED_title_abstracts_2019_baseline.jsonl.zst'
    yield from lmd.Reader(path).stream_data()