コード例 #1
0
 def download(self):
     """Fetch the three LogiQA split files unless the dataset dir already exists."""
     # A pre-existing dataset directory is treated as a completed download.
     if self.DATASET_PATH.exists():
         return
     Path.mkdir(self.DATASET_PATH)
     base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
     # Split name -> expected sha256 of the corresponding <name>.txt file.
     checksums = {
         "Train": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa",
         "Eval": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f",
         "Test": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701",
     }
     for name, sha in checksums.items():
         target = self.DATASET_PATH / f"{name}.txt"
         download_file(f"{base_url}/{name}.txt", str(target), sha)
コード例 #2
0
def reddit_processing(url, sha256sums, dumps_directory, keep_dumps):
    """Download one reddit dump, ingest it into the DB, and mark it done.

    Returns True on success (or when the dump was already processed),
    False when the download fails.
    """
    dump_name = url.split('/')[-1]
    dump_path = os.path.join(dumps_directory, dump_name)
    done_marker = dump_path + ".dbdone"

    # A marker file means this dump was fully ingested on a previous run.
    if os.path.exists(done_marker):
        return True

    try:
        download_file(url, dump_path, sha256sums.get(dump_name))
    except Exception as ex:
        logger.info(f"Download failed {ex}, skipping processing.")
        return False

    process_dump_file(dump_path, get_db_session(), tqdm.tqdm)

    with open(done_marker, "w") as marker:
        marker.write("Done!")

    # Optionally reclaim disk space once the DB has the contents.
    if not keep_dumps:
        os.remove(dump_path)

    return True
コード例 #3
0
 def download(self):
     """Download the QA4MRE gold-standard XML for ``self.YEAR`` (English).

     The file is cached under ``data/qa4mre`` and verified against a
     per-year sha256 checksum.
     """
     year = self.YEAR
     lang = "EN"
     base_path = (
         "http://nlp.uned.es/clef-qa/repository/js/scripts/downloadFile.php?"
         "file=/var/www/html/nlp/clef-qa/repository/resources/QA4MRE/")
     # TODO: add side tasks?
     # Year -> path fragment on the server where the gold standard lives.
     variable_year_path = {
         2011: '2011/Training_Data/Goldstandard/',
         2012:
         '2012/Main_Task/Training_Data/Goldstandard/Used_in_Evaluation/',
         2013: '2013/Main_Task/Training_Data/Goldstandard/'
     }
     sha256sums = {
         2011:
         "6d2524952a3a015f2a82df785b85b5578681e3602ec276b4e72c01f4ebc50034",
         2012:
         "f9edaf408f8ac93f89a643a0d0b19263a1bb5ce64f19b2af10df279a656dfb24",
         2013:
         "c60e5aa4ec77e0493ef0b11d46bd1d74d58a499a3a2f871b8cf3af9536f0f094",
     }
     vpath = variable_year_path[year]
     url_path = f"{base_path}{vpath}QA4MRE-{year}-{lang}_GS.xml"
     # BUG FIX: use makedirs(exist_ok=True) so a missing "data" parent does
     # not raise, and check for the file actually written below. The old
     # check looked for "QA4MRE-{year}-{lang}" (no "_GS.xml"), which is
     # never created, so every call re-downloaded the file.
     os.makedirs("data/qa4mre", exist_ok=True)
     local_path = f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml"
     if not os.path.isfile(local_path):
         download_file(url_path, local_path, sha256sums[year])
コード例 #4
0
 def download(self):
     """Fetch this task's data file from the openai/gpt-3 repo, then build docs."""
     file_name, checksum = self.get_file_download_info()
     url = 'https://raw.githubusercontent.com/openai/gpt-3/master/data/' + file_name
     # NOTE(review): plain concatenation assumes self.directory ends with a
     # path separator -- confirm against where the attribute is assigned.
     target = self.directory + file_name
     if not os.path.exists(self.directory):
         os.makedirs(self.directory)
     download_file(url, target, checksum)
     self.set_docs()
コード例 #5
0
def wget(url, to=None, checksum=None):
    """Thin wrapper for best_download's ``download_file``.

    When *to* is omitted, the local filename is derived from the URL,
    falling back to 'index' for URLs that end in '/'.
    """
    if to is None:
        # BUG FIX: the default name must come from the URL; the old code
        # called os.path.basename(to) while `to` was still None (TypeError).
        to = os.path.basename(url)
        if not to:
            to = 'index'

    download_file(url, to, checksum)
コード例 #6
0
ファイル: pile.py プロジェクト: jon-tow/lm-evaluation-harness
 def download(self):
     """Download the Pile validation and test shards."""
     # TODO: separate pile val/test out by component so we don't have to scan the entire file once per set
     os.makedirs("data/pile/", exist_ok=True)
     shards = [
         ("https://the-eye.eu/public/AI/pile/val.jsonl.zst", self.VAL_PATH,
          "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"),
         ("https://the-eye.eu/public/AI/pile/test.jsonl.zst", self.TEST_PATH,
          "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"),
     ]
     for url, destination, sha in shards:
         download_file(url, destination, sha)
コード例 #7
0
 def download(self):
     """Download and decompress the gzipped data file into BASE_PATH."""
     if not self.BASE_PATH.exists():
         Path.mkdir(self.BASE_PATH)
     target = self.BASE_PATH / self.FILENAME
     # The decompressed file doubles as the "already downloaded" marker.
     if target.exists():
         return
     compressed = target.parent / (target.name + ".gz")
     base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
     download_file(f"{base_url}/{self.FILENAME}.gz", str(compressed), self.CHECKSUM)
     extract_gzip(gz=compressed, to=target)
コード例 #8
0
 def download(self):
     """Download and unpack the SciQ dataset on first use.

     NOTE(review): the whole download is skipped whenever data/sciq exists,
     so a run that created the directory but failed mid-download is never
     retried -- confirm this is intended.
     """
     if os.path.exists('data/sciq'):
         return
     os.mkdir('data/sciq')
     download_file(
         'https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip',
         'data/sciq/SciQ.zip',
         '7f3312f6ac6b09970b32942d106a8c44ec0dad46a0369f17d635aff8e348a87c',
     )
     with zipfile.ZipFile("data/sciq/SciQ.zip", "r") as zf:
         zf.extractall("data/sciq/")
コード例 #9
0
ファイル: drop.py プロジェクト: jon-tow/lm-evaluation-harness
 def download(self):
     """Fetch and unpack the DROP dataset archive into DATASET_PATH."""
     if self.DATASET_PATH.exists():
         return
     Path.mkdir(self.DATASET_PATH)
     url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
     checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
     archive = self.DATASET_PATH / "drop_dataset.zip"
     download_file(url, str(archive), checksum)
     # Handle renamed so it no longer shadows the builtin `zip`.
     with ZipFile(archive, "r") as zf:
         zf.extractall(self.DATASET_PATH)
コード例 #10
0
 def download(self):
     """Download the LAMBADA test split, falling back to wget on failure."""
     sh("mkdir -p data/lambada")
     try:
         download_file(
             "http://eaidata.bmk.sh/data/lambada_test.jsonl",
             "data/lambada/lambada_test.jsonl",
             "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
         )
     except Exception:
         # BUG FIX: narrowed from a bare `except:` so KeyboardInterrupt and
         # SystemExit are no longer swallowed by the fallback path.
         # fallback - for some reason best_download doesnt work all the time here
         sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl")
         sh('echo "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226  data/lambada/lambada_test.jsonl" | sha256sum --check')
コード例 #11
0
 def download(self):
     """Download and extract the unfiltered TriviaQA archive on first use."""
     # The extracted train file doubles as the "already downloaded" marker.
     if os.path.exists('data/triviaqa/unfiltered-web-train.jsonl'):
         return
     os.makedirs("data/triviaqa/", exist_ok=True)
     download_file(
         "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz",
         "data/triviaqa/triviaqa-unfiltered.tar.gz",
         "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"
     )
     sh("""
         cd data/triviaqa/
         tar -xf triviaqa-unfiltered.tar.gz
         """)
コード例 #12
0
 def download(self):
     """Fetch and unpack the ETHICS dataset unless the done-marker exists."""
     if os.path.exists('data/ethics/done'):
         return
     sh("mkdir -p data")
     download_file(
         "https://people.eecs.berkeley.edu/~hendrycks/ethics.tar",
         "data/ethics.tar",
         "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"
     )
     # Unpack, clean up the archive, and drop the completion marker.
     sh("""
         tar -xf data/ethics.tar -C data/
         rm data/ethics.tar
         touch data/ethics/done
         """)
コード例 #13
0
 def download(self):
     """Download and extract the MATH dataset when it is not fully unpacked."""
     # Both the extracted test dir and the done-marker must be present,
     # otherwise (re-)download and unpack.
     have_test = (self.DATASET_PATH / 'test').exists()
     is_done = (self.DATASET_PATH / 'done').exists()
     if have_test and is_done:
         return
     sh(f"mkdir -p {self.DATASET_PATH}")
     download_file(
         "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar",
         f"{self.DATASET_PATH}.tar",
         "01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da"
     )
     sh(f"""
         tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'}
         rm {self.DATASET_PATH}.tar
         """)
コード例 #14
0
ファイル: utils.py プロジェクト: researcher2/The-Pile
def download(fname, checksum, sources, extract=False):
    """Download *fname* from the first working source in *sources*.

    Each source is tried in order; on success a ``fname + '.done'`` marker
    file is created so later calls are no-ops.  With ``extract=True`` the
    downloaded archive is untarred and then removed.

    Raises:
        Exception: when every source fails.
    """
    # A '.done' marker means an earlier run finished successfully.
    if os.path.exists(fname + '.done'): return

    print('Finding source for', fname)

    parentdir = Path(fname).parent
    os.makedirs(parentdir, exist_ok=True)

    for source in sources:
        try:
            # todo: implement torrent handling
            if source.type == 'direct':
                download_file(source.url, fname, checksum)
            elif source.type == 'gdrive':
                # A leftover file may already be the right one: verify it
                # before re-downloading.
                if os.path.exists(fname):
                    try:
                        print(fname, 'already exists.')
                        sha256sum(fname, expected=checksum)
                        touch(fname + '.done')
                        return
                    except AssertionError:
                        print('{} exists but doesn\'t match checksum!'.format(
                            fname))
                        rm_if_exists(fname)

                gdown.download(source.url, fname, quiet=False)
                sha256sum(fname, expected=checksum)
            elif source.type == 'gcloud':
                raise NotImplementedError('gcloud download not implemented!')

            if extract:
                tar_xf(fname)
                rm_if_exists(fname)
            touch(fname + '.done')
            return
        except SystemExit:
            raise
        except KeyboardInterrupt:
            raise
        except BaseException:
            # Anything else (bad checksum, network error, ...) just moves
            # on to the next candidate source.  Made the old bare `except:`
            # explicit; behavior is identical.
            import traceback
            traceback.print_exc()
            print('Download method [{}] {} failed, trying next option'.format(
                source.type, source.url))
            # rm_if_exists(fname)
            continue
        # BUG FIX: removed an unreachable `break` here -- every path through
        # the try block either returns, re-raises, or `continue`s.

    raise Exception('Failed to download {} from any source'.format(fname))
コード例 #15
0
 def download(self):
     """Fetch and unpack the hendrycksTest data unless the done-marker exists."""
     if (self.DATASET_PATH / 'done').exists():
         return
     sh("mkdir -p data")
     download_file(
         "https://people.eecs.berkeley.edu/~hendrycks/data.tar",
         "data/data.tar",
         "78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4"
     )
     # Unpack, rename to the task directory, and drop the completion marker.
     sh("""
         tar -xf data/data.tar -C data/
         rm data/data.tar
         mv data/data data/hendrycksTest
         touch data/hendrycksTest/done
         """)
コード例 #16
0
    def download(self):
        """Download the CoQA train and dev JSON files into data/coqa."""
        sh("""mkdir -p data/coqa""")

        # (local path, source URL, expected sha256) for each split.
        targets = [
            ('data/coqa/coqa-train-v1.0.json',
             "http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json",
             "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"),
            ('data/coqa/coqa-dev-v1.0.json',
             "http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json",
             "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"),
        ]
        for path, url, sha in targets:
            download_file(url, path, sha)
コード例 #17
0
 def download(self):
     """Download the MuTual repo archive and move its data/ into BASE_PATH."""
     # An existing BASE_PATH is treated as a completed download.
     if self.BASE_PATH.exists():
         return
     Path.mkdir(self.BASE_PATH, parents=True)
     archive = Path("data/master.zip")
     download_file(
         "https://github.com/Nealcly/MuTual/archive/master.zip",
         str(archive),
         "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
     # Handle renamed so it no longer shadows the builtin `zip`.
     with zipfile.ZipFile(archive, 'r') as zf:
         zf.extractall("data")
     Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
     # Remove left over files and directories.
     archive.unlink()
     shutil.rmtree("data/MuTual-master")
コード例 #18
0
 def download(self):
     """Download the LAMBADA test split into data/lambada."""
     sh("mkdir -p data/lambada")
     source_url = "http://eaidata.bmk.sh/data/lambada_test.jsonl"
     destination = "data/lambada/lambada_test.jsonl"
     expected_sha = "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
     download_file(source_url, destination, expected_sha)
コード例 #19
0
# Pipeline stage I/O locations: read the raw corpus from `in_path` and
# write per-document length/language stats to `out_path`.
in_path = 'pile'
out_path = 'langlen_stage1'


def lengths(doc):
    """Return character, UTF-8 byte, whitespace-word, and token counts for *doc*."""
    global tok
    char_count = len(doc)
    byte_count = len(doc.encode('utf-8'))
    word_count = len(re.split(r'\s+', doc))
    # Token count comes from the module-level tokenizer `tok`.
    token_count = len(tok.encode(doc))
    return {
        'len_char': char_count,
        'len_utf8bytes': byte_count,
        'len_words': word_count,
        'len_tokens': token_count,
    }


# Fetch fastText's pretrained 176-language identification model;
# presumably loaded into the module-level `langdet` handle used by
# `language()` below -- the loading code is not visible here.
download_file(
    'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin',
    'lid.176.bin',
    '7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e')


def language(doc):
    """Return ``{'lang': code}`` for *doc* via the global `langdet` predictor."""
    global langdet
    # Newlines are collapsed to spaces before prediction.
    flattened = doc.replace('\n', ' ')
    prediction = langdet.predict(flattened, k=1)
    top_label = prediction[0][0]
    return {'lang': top_label.replace('__label__', '')}


def writef(f, lines):
    with open(f, 'wb') as fh:
        cctx = zstandard.ZstdCompressor(level=3, threads=8)
        compressor = cctx.stream_writer(fh)
        for line in tqdm(lines):
コード例 #20
0
 def download(self):
     """Download and unzip wikitext-2-raw on first use."""
     # The extracted validation file doubles as the "already downloaded" marker.
     if os.path.exists('data/wikitext/wikitext-2-raw/wiki.valid.raw'):
         return
     os.makedirs("data/wikitext/", exist_ok=True)
     download_file(
         "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip",
         "data/wikitext/wikitext-2-raw-v1.zip",
         "ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11")
     sh("cd data/wikitext/ && unzip wikitext-2-raw-v1.zip")
コード例 #21
0
 def download(self):
     """Download OpenAI's LAMBADA test split into data/lambada."""
     sh("mkdir -p data/lambada")
     source_url = "https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl"
     destination = "data/lambada/lambada_test.jsonl"
     expected_sha = "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
     download_file(source_url, destination, expected_sha)