Example #1
def build(opt):
    version = '0.2'
    dpath = os.path.join(opt['datapath'], 'ConvAI2_wild_evaluation')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')

        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        output_fname = 'convai2_wild_evaluation.json'
        output_path = os.path.join(dpath, output_fname)

        with PathManager.open(output_path, 'r') as data_f:
            data = json.load(data_f)

        make_parlai_format(data, dpath)
        PathManager.rm(output_path)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
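
For orientation, the built()/mark_done() pair makes the build idempotent: a marker file records the data version, and the download is skipped when the recorded version matches. A minimal self-contained sketch of that pattern, using a plain marker file rather than ParlAI's build_data helpers (the function names below are illustrative, not ParlAI's API):

import os

def already_built(dpath, version):
    # Return True if a previous build left a matching version marker.
    marker = os.path.join(dpath, '.built')
    if not os.path.isfile(marker):
        return False
    with open(marker) as f:
        return f.read().strip() == version

def mark_built(dpath, version):
    # Record the version so the next build call becomes a no-op.
    with open(os.path.join(dpath, '.built'), 'w') as f:
        f.write(version)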
Example #2
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath,
                                                               'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
Example #3
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath,
                                                               'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        PathManager.rm(fullpath)
Example #4
def build(opt):
    version = 'v1.1'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        file1 = os.path.join(dpath, RESOURCES[0].file_name)
        file2 = os.path.join(dpath, RESOURCES[1].file_name)

        concat = io.BytesIO()

        for fn in [file1, file2]:
            with PathManager.open(fn, 'rb') as rawf:
                concat.write(rawf.read())

        with gzip.GzipFile(fileobj=io.BytesIO(concat.getvalue())) as f:
            file_content = bytes.decode(f.read())
            data = file_content.split('\n')[2:]

        create_fb_format(data, dpath)

        PathManager.rm(file1)
        PathManager.rm(file2)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
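
The concatenation step works because gzip allows multiple compressed members to be stitched together and read back as one stream; a small self-contained sketch of that behaviour:

import gzip
import io

# Two independently gzipped chunks decompress as a single stream.
part1 = gzip.compress(b'hello ')
part2 = gzip.compress(b'world')
with gzip.GzipFile(fileobj=io.BytesIO(part1 + part2)) as f:
    assert f.read() == b'hello world'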
Example #5
    def setUp(self):
        self.datapath = ParlaiParser().parse_args([])['datapath']
        self.datapath = os.path.join(self.datapath, 'build_data_pyt_data')
        PathManager.mkdirs(self.datapath)

        for d in self.dest_filenames:
            # Remove the files if they are already there; otherwise the test won't try to download them again.
            try:
                PathManager.rm(os.path.join(self.datapath, d))
            except OSError:
                pass
Example #6
def build(opt):
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')

        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        json1 = os.path.join(
            dpath, RESOURCES[0].file_name.rsplit('.', 1)[0] + '.json')
        with PathManager.open(json1) as t_json:
            train_data = json.load(t_json)

        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(
                train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with PathManager.open(train_json,
                              'w') as t_out, PathManager.open(valid_json,
                                                              'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        PathManager.rm(json1)

        # Use validation data as test.
        json2 = os.path.join(
            dpath, RESOURCES[1].file_name.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
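
The stride-based split above can be illustrated on a toy list; note that range objects support slicing in Python 3, and that deleting from the highest index downward keeps the remaining indices valid (the numbers below are made up):

items = list(range(20))
num_valid = 5
step = len(items) // (num_valid - 1)          # 5
valid = []
for i in range(len(items) - 1, 0, -step)[:num_valid]:
    valid.append(items[i])
    del items[i]
print(valid)        # [19, 14, 9, 4] -- roughly num_valid picks
print(len(items))   # 16 examples left for training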
Example #7
def _untar(path, fname, delete=True, flatten=False):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.

    :param bool flatten:
        If true, the archive contents are extracted directly into the target
        folder, discarding any subdirectory structure.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManager.open
    # instead, in case we are using fb internal file services

    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we have to watch out for that
                item_name = item_name[2:]
            if flatten:
                # flatten the tar file if there are subdirectories
                fn = os.path.join(path, os.path.split(item_name)[-1])
            else:
                fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(
                        item.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError(
                    "No support for symlinks etc. right now.")

    if delete:
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
Example #8
def _untar(path, fname, delete=True):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManager.open
    # instead, in case we are using fb internal file services

    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we have to watch out for that
                item_name = item_name[2:]
            fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(
                        item.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError(
                    "No support for symlinks etc. right now.")

    if delete:
        PathManager.rm(fullpath)
Example #9
    def test_set_model_file_without_dict_file(self):
        """
        Check that moving a model without moving the dictfile raises an error.
        """
        # Download model, move to a new location
        with testing_utils.tempdir() as datapath:
            try:
                # remove unittest models if there before
                shutil.rmtree(os.path.join(datapath, 'models/unittest'))
            except FileNotFoundError:
                pass

            zoo_path = 'zoo:unittest/seq2seq/model'
            model_path = modelzoo_path(datapath, zoo_path)
            PathManager.rm(model_path + '.dict')
            # Test that eval model fails
            with self.assertRaises(RuntimeError):
                testing_utils.eval_model(
                    dict(task='babi:task1k:1', model_file=model_path))
            try:
                # remove unittest models if there after
                shutil.rmtree(os.path.join(datapath, 'models/unittest'))
            except FileNotFoundError:
                pass
Example #10
def build(opt):
    dpath = os.path.join(opt['datapath'], 'AmazonQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)
            new_filename = downloadable_file.file_name[:-3]
            print('[ unpacking data: ' + downloadable_file.file_name + ' ]')
            with open(dpath + '/' + new_filename, 'w') as f:
                for l in parse_gzip(dpath + '/' + downloadable_file.file_name):
                    f.write(l + '\n')
            PathManager.rm(dpath + '/' + downloadable_file.file_name)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
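
parse_gzip is not shown in this example; a minimal sketch of what such a helper could look like, assuming it simply yields decoded lines from the gzip archive (this is an assumption, not the actual ParlAI implementation):

import gzip

def parse_gzip(path):
    # Hypothetical helper: yield each line of a gzip file as decoded text.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            yield line.rstrip('\n')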
Example #11
    def _download_images(self, opt: Opt):
        """
        Download available IGC images.
        """
        urls = []
        ids = []
        for dt in ['test', 'val']:
            df = os.path.join(self.get_data_path(opt), f'IGC_crowd_{dt}.csv')
            with PathManager.open(df, newline='\n') as csv_file:
                reader = csv.reader(csv_file, delimiter=',')
                fields = []
                for i, row in enumerate(reader):
                    if i == 0:
                        fields = row
                    else:
                        data = dict(zip(fields, row))
                        urls.append(data['url'])
                        ids.append(data['id'])
        PathManager.mkdirs(self.get_image_path(opt))
        # Make one blank image
        image = Image.new('RGB', (100, 100), color=0)
        image.save(os.path.join(self.get_image_path(opt), self.blank_image_id),
                   'JPEG')
        # Download the rest
        download_multiprocess(urls,
                              self.get_image_path(opt),
                              dest_filenames=ids)

        # Remove bad images
        for fp in os.listdir(self.get_image_path(opt)):
            img_path = os.path.join(self.get_image_path(opt), fp)
            if PathManager.exists(img_path):
                try:
                    Image.open(img_path).convert('RGB')
                except OSError:
                    PathManager.rm(img_path)
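
The field-zipping loop over the CSV rows can also be written with csv.DictReader; a self-contained sketch on a toy in-memory CSV (the real files are IGC_crowd_test.csv and IGC_crowd_val.csv):

import csv
import io

sample = "id,url\n1,http://example.com/a.jpg\n2,http://example.com/b.jpg\n"
urls, ids = [], []
for row in csv.DictReader(io.StringIO(sample)):
    urls.append(row['url'])
    ids.append(row['id'])
print(ids)   # ['1', '2']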
Example #12
def download_and_process(file_url, mode, subreddit_names, st_time, output_dir):
    # download and pre-process original posts
    reddit_tmp_dir = pjoin(output_dir, 'reddit_tmp')
    f_name = pjoin(reddit_tmp_dir, file_url.split('/')[-1])
    tries_left = 4
    # open monthly dumps and download lines in posts
    while tries_left:
        try:
            print("downloading %s %2f" % (f_name, time() - st_time))
            subprocess.run(
                ['wget', '-P', reddit_tmp_dir, file_url], stdout=subprocess.PIPE
            )
            print("decompressing and filtering %s %2f" % (f_name, time() - st_time))
            if f_name.split('.')[-1] == 'xz':
                f = lzma.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'bz2':
                f = bz2.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'zst':
                fh = open(f_name, 'rb')
                dctx = zstd.ZstdDecompressor()
                stream_reader = dctx.stream_reader(fh)
                f = io.TextIOWrapper(stream_reader, encoding='utf-8')
            lines = dict([(name, []) for name in subreddit_names])
            for i, l in enumerate(f):
                if i % 1000000 == 0:
                    print(
                        "read %d lines, found %d"
                        % (i, sum([len(ls) for ls in lines.values()])),
                        time() - st_time,
                    )
                for name in subreddit_names:
                    subreddit_field = f'"subreddit":"{name}"'
                    if subreddit_field in l:
                        lines[name] += [l.strip()]
            if f_name.split('.')[-1] == 'zst':
                fh.close()
            else:
                f.close()
            PathManager.rm(f_name)
            tries_left = 0

        except EOFError:
            sleep(10)
            print(
                "failed reading file %s file, another %d tries" % (f_name, tries_left)
            )
            PathManager.rm(f_name)
            tries_left -= 1
    print("tokenizing and selecting %s %2f" % (f_name, time() - st_time))
    processed_items = dict([(name, []) for name in subreddit_names])
    if mode == 'submissions':
        key_list = ['id', 'score', 'url', 'title', 'selftext']
    else:
        key_list = ['id', 'link_id', 'parent_id', 'score', 'body']
    for name in subreddit_names:
        for line in lines[name]:
            reddit_dct = json.loads(line)
            if (
                reddit_dct.get('num_comments', 1) > 0
                and reddit_dct.get('score', 0)
                and reddit_dct.get('score', 0) >= 2
                and (mode == 'submissions' or valid_comment(reddit_dct))
            ):
                reddit_res = {}
                for k in key_list:
                    if k in ['title', 'selftext', 'body']:
                        if reddit_dct[k].lower() in ['[removed]', '[deleted]']:
                            reddit_dct[k] = ''
                        txt, url_list = word_url_tokenize(reddit_dct[k])
                        reddit_res[k] = (' '.join(txt.split()), url_list)
                    else:
                        reddit_res[k] = reddit_dct[k]
                processed_items[name] += [reddit_res]
    print("Total found %d" % (len(processed_items)), time() - st_time)
    return processed_items
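
The .zst branch relies on the zstandard package (assumed here to be imported as zstd): a stream reader wrapped in io.TextIOWrapper yields decoded lines without decompressing the whole dump into memory. A small self-contained sketch of that pattern:

import io
import zstandard as zstd

# Compress a tiny payload, then read it back line by line through a
# streaming decompressor, as in the .zst branch above.
payload = zstd.ZstdCompressor().compress(b'line one\nline two\n')
reader = zstd.ZstdDecompressor().stream_reader(io.BytesIO(payload))
for line in io.TextIOWrapper(reader, encoding='utf-8'):
    print(line.rstrip())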