Example #1
    def _get_data(self):
        # useful constants
        # all of these colors are bolded
        RESET = '\033[0m'
        RED = '\033[1;91m'
        YELLOW = '\033[1;93m'
        GREEN = '\033[1;92m'
        BLUE = '\033[1;96m'
        CYAN = '\033[1;94m'
        MAGENTA = '\033[1;95m'

        # only use colors if we're outputting to a terminal
        USE_COLORS = _sys.stdout.isatty()
        if not USE_COLORS:
            RESET = RED = YELLOW = GREEN = BLUE = CYAN = MAGENTA = ''

        # generate the rainbow stars
        rainbow = [RED, YELLOW, GREEN, CYAN, BLUE, MAGENTA]
        size = 78 // len(rainbow)
        stars = ''.join([color + '*' * size for color in rainbow])
        stars += RESET

        if not os.path.exists(self.data_path):
            PathManager.mkdirs(self.data_path)
        if not PathManager.exists(os.path.join(self.data_path, 'train.csv')):
            raise RuntimeError(
                f'\n\n{stars}\nThis data must be downloaded from {self.DATA_SOURCE}'
                '\nIt cannot be automatically downloaded, as one must agree to '
                'the competition rules outlined on the website before '
                'gaining access to the data.\n\n'
                'Once downloaded, please put the data in the following '
                f'directory: \n{self.data_path}\n{stars}')
Example #2
def _check_parent_dir_exits(datapath):
    parent_dir = os.path.dirname(datapath)
    if not parent_dir or PathManager.exists(parent_dir):
        return
    logging.info(
        f'Parent directory ({parent_dir}) did not exist and was created.')
    PathManager.mkdirs(parent_dir)
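
A minimal usage sketch for the helper above (not part of the original example): the output path is hypothetical, and the point is simply that the parent directory gets created before the file is opened for writing.

# hypothetical usage: ensure the parent directory exists before writing
outfile = os.path.join('/tmp/parlai_data', 'reports', 'results.txt')
_check_parent_dir_exits(outfile)  # creates /tmp/parlai_data/reports if missing
with PathManager.open(outfile, 'w') as f:
    f.write('accuracy\t0.0\n')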
Example #3
def make_dir(path):
    """
    Make the directory and any nonexistent parent directories (`mkdir -p`).
    """
    # the current working directory is a fine path
    if path != '':
        PathManager.mkdirs(path)
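
A brief usage sketch, assuming a made-up data path: make_dir behaves like mkdir -p, so missing parents are created and repeated calls are harmless.

# hypothetical usage: create a nested model directory before saving checkpoints
model_dir = os.path.join('/tmp/parlai_data', 'models', 'my_model')
make_dir(model_dir)  # creates any missing parent directories
make_dir(model_dir)  # calling again on an existing directory is a no-op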
Example #4
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath,
                                                               'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
Example #5
    def _setup_test_data(self, opt):
        datapath = os.path.join(opt['datapath'], 'ImageTeacher')
        imagepath = os.path.join(datapath, 'images')
        PathManager.mkdirs(imagepath)

        self.image_features_path = os.path.join(
            datapath, f'{opt["image_mode"]}_image_features')

        # Create fake images and features
        imgs = [f'img_{i}' for i in range(10)]
        for i, img in enumerate(imgs):
            image = Image.new('RGB', (16, 16), color=i)
            with PathManager.open(os.path.join(imagepath, f'{img}.jpg'),
                                  'wb') as fp:
                image.save(fp, 'JPEG')

        # write out fake data
        for dt in ['train', 'valid', 'test']:
            random.seed(42)
            data = [{
                'image_id': img,
                'text': string.ascii_uppercase[i]
            } for i, img in enumerate(imgs)]
            with PathManager.open(os.path.join(datapath, f'{dt}.json'),
                                  'w') as f:
                json.dump(data, f)
Example #6
def _unzip(path, fname, delete=True):
    """
    Unpack the given zip file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import zipfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    with zipfile.ZipFile(PathManager.open(fullpath, 'rb'), 'r') as zf:
        for member in zf.namelist():
            outpath = os.path.join(path, member)
            if zf.getinfo(member).is_dir():
                logging.debug(f"Making directory {outpath}")
                PathManager.mkdirs(outpath)
                continue
            logging.debug(f"Extracting to {outpath}")
            with zf.open(member, 'r') as inf, PathManager.open(outpath,
                                                               'wb') as outf:
                shutil.copyfileobj(inf, outf)
    if delete:
        PathManager.rm(fullpath)
Example #7
    def setUp(self):
        self.datapath = ParlaiParser().parse_args([])['datapath']
        self.datapath = os.path.join(self.datapath, 'build_data_pyt_data')
        PathManager.mkdirs(self.datapath)

        for d in self.dest_filenames:
            # Remove the files if they already exist, since otherwise they will not be re-downloaded
            try:
                PathManager.rm(os.path.join(self.datapath, d))
            except OSError:
                pass
Example #8
    def get_image_features_path(self, task, image_model_name, dt):
        """
        Override so that subclasses can see same image features.
        """
        # In default implementation, self.data_path already has task name added
        image_features_path = os.path.join(self.data_path, 'image_features')

        if not os.path.isdir(image_features_path):
            PathManager.mkdirs(image_features_path)

        return os.path.join(image_features_path,
                            f'{image_model_name}_{dt}_features_dict')
Example #9
    def __init__(self, opt: Opt):
        try:
            # tensorboard is a very expensive thing to import. Wait until the
            # last second to import it.
            from tensorboardX import SummaryWriter
        except ImportError:
            raise ImportError('Please run `pip install tensorboard tensorboardX`.')

        tbpath = opt['model_file'] + '.tensorboard'
        logging.debug(f'Saving tensorboard logs to: {tbpath}')
        if not PathManager.exists(tbpath):
            PathManager.mkdirs(tbpath)
        self.writer = SummaryWriter(tbpath, comment=json.dumps(opt))
Example #10
def _untar(path, fname, delete=True, flatten=False):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManager.open
    # instead, in case we are using fb internal file services

    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we gotta watch out for that
                item_name = item_name[2:]
            if flatten:
                # flatten the tar file if there are subdirectories
                fn = os.path.join(path, os.path.split(item_name)[-1])
            else:
                fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(
                        item.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError(
                    "No support for symlinks etc. right now.")

    if delete:
        try:
            PathManager.rm(fullpath)
        except PermissionError:
            logging.error(
                f"Tried to delete {fullpath} but got a permission error. This "
                "is known to happen in Windows and is probably not fatal.")
Example #11
    def finalize(self,
                 frequencies: Dict[str, int],
                 num_symbols: int = 30000,
                 minfreq: int = 2) -> bool:
        """
        Build the codecs.

        :param frequencies:
            dictionary of (token: frequency) pairs
        :param num_symbols:
            Number of BPE symbols. Recommend 30000-40000.  If <= 0, default
            30000 will be used.
        :param minfreq:
            Minimum frequency of a token before forced BPE decomposition. If <=
            0 will use subword-nmt default of 2.

        :return did_finalize:
            return whether codecs are finalized this call.
        """
        if hasattr(self, 'bpe'):
            # we already finalized the codecs
            return False

        logging.debug(f'Saving bpe codecs to {self.codecs}')

        dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

        if num_symbols <= 0:
            num_symbols = 30000
        if minfreq <= 0:
            minfreq = 2

        codec_dir, _ = os.path.split(self.codecs)
        PathManager.mkdirs(codec_dir)
        with PathManager.open(self.codecs, 'w', encoding='utf-8') as outstream:
            learn_bpe.learn_bpe(
                dictionary,
                outstream,
                num_symbols=num_symbols,
                min_frequency=minfreq,
                is_dict=True,
            )

        self._load_from_codecs()
        return True
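
A sketch of how finalize might be driven with a toy frequency dictionary; bpe_helper stands in for an instance of the class above, and real counts would come from a corpus rather than two hard-coded lines.

from collections import Counter

# toy token counts standing in for real corpus frequencies
frequencies = Counter()
for line in ['hello world', 'hello there']:
    frequencies.update(line.split())

# bpe_helper is a hypothetical instance of the class defining finalize()
did_finalize = bpe_helper.finalize(dict(frequencies), num_symbols=30000, minfreq=2)
# a second call should return False, since the codecs are already loaded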
Example #12
def download_images(opt, task='personality_captions'):
    dpath = os.path.join(opt['datapath'], task)
    image_path = os.path.join(opt['datapath'], 'yfcc_images')
    version = '1.0'
    response = input(
        'Please confirm that you have obtained permission '
        'to work with the YFCC100m dataset, as outlined by the steps '
        'listed at '
        'https://multimediacommons.wordpress.com/yfcc100m-core-dataset/ [Y/y]: '
    )
    if response.lower() != 'y':
        raise RuntimeError(
            'In order to use the images from this dataset, '
            'you must obtain permission from the webpage above.')
    response = input(
        'NOTE: This script will download each image individually from the '
        's3 server on which the images are hosted. This will take a *very '
        'long* time. Are you sure you would like to continue? [Y/y]: ')
    if response.lower() != 'y':
        raise RuntimeError('If you have access to the images, please specify '
                           'the path to the folder via the `--yfcc-path` '
                           'command line argument.')
    image_prefix = 'https://multimedia-commons.s3-us-west-2.amazonaws.com/data/images'
    hashes = []
    dts = ['train', 'val', 'test']
    if task == 'image_chat':
        dts[1] = 'valid'
    for dt in dts:
        with PathManager.open(os.path.join(dpath, '{}.json'.format(dt))) as f:
            data = json.load(f)
            hashes += [d['image_hash'] for d in data]
    PathManager.mkdirs(image_path)

    print('[downloading images to {}]'.format(image_path))
    image_urls = [
        f"{image_prefix}/{p_hash[:3]}/{p_hash[3:6]}/{p_hash}.jpg"
        for p_hash in hashes
    ]
    download_multiprocess(image_urls,
                          image_path,
                          dest_filenames=[f"{h}.jpg" for h in hashes])
    build_data.mark_done(image_path, version)
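
A small illustration of how the image URLs above are assembled from a photo hash; the hash shown is made up and only demonstrates the three-character prefix layout on the s3 bucket.

# hypothetical hash, illustrating the prefix-based URL layout used above
image_prefix = 'https://multimedia-commons.s3-us-west-2.amazonaws.com/data/images'
p_hash = 'abcdef0123456789'
url = f"{image_prefix}/{p_hash[:3]}/{p_hash[3:6]}/{p_hash}.jpg"
# -> .../data/images/abc/def/abcdef0123456789.jpg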
Example #13
def _untar(path, fname, delete=True):
    """
    Unpack the given archive file to the same directory.

    :param str path:
        The folder containing the archive. Will contain the contents.

    :param str fname:
        The filename of the archive file.

    :param bool delete:
        If true, the archive will be deleted after extraction.
    """
    import tarfile

    logging.debug(f'unpacking {fname}')
    fullpath = os.path.join(path, fname)
    # very painfully manually extract files so that we can use PathManager.open
    # instead, in case we are using fb internal file services

    with tarfile.open(fileobj=PathManager.open(fullpath, 'rb')) as tf:
        for item in tf:
            item_name = item.name
            while item_name.startswith("./"):
                # internal file systems will actually create a literal "."
                # directory, so we gotta watch out for that
                item_name = item_name[2:]
            fn = os.path.join(path, item_name)
            logging.debug(f"Extracting to {fn}")
            if item.isdir():
                PathManager.mkdirs(fn)
            elif item.isfile():
                with PathManager.open(fn, 'wb') as wf, tf.extractfile(
                        item.name) as rf:
                    tarfile.copyfileobj(rf, wf)
            else:
                raise NotImplementedError(
                    "No support for symlinks etc. right now.")

    if delete:
        PathManager.rm(fullpath)
Example #14
    def _test_display_output(self, image_mode):
        """
        Test display data output with given image_mode.
        """
        with testing_utils.tempdir() as tmpdir:
            data_path = tmpdir
            PathManager.mkdirs(os.path.join(data_path, 'ImageTeacher'))

            opt = {
                'task': 'integration_tests:ImageTeacher',
                'datapath': data_path,
                'image_mode': image_mode,
                'display_verbose': True,
            }
            output = testing_utils.display_data(opt)
            train_labels = re.findall(r"\[labels\].*\n", output[0])
            valid_labels = re.findall(r"\[eval_labels\].*\n", output[1])
            test_labels = re.findall(r"\[eval_labels\].*\n", output[2])

            for i, lbls in enumerate([train_labels, valid_labels, test_labels]):
                self.assertGreater(len(lbls), 0, 'DisplayData failed')
                self.assertEqual(len(lbls), len(set(lbls)), output[i])
Example #15
    def get_app_token(self):
        """
        Find and return an app access token.
        """
        if not self.opt.get('force_page_token'):
            if not os.path.exists(os.path.expanduser('~/.parlai/')):
                PathManager.mkdirs(os.path.expanduser('~/.parlai/'))
            access_token_file_path = '~/.parlai/messenger_token'
            expanded_file_path = os.path.expanduser(access_token_file_path)
            if os.path.exists(expanded_file_path):
                with open(expanded_file_path, 'r') as access_token_file:
                    return access_token_file.read()

        token = input(
            'Enter your page\'s access token from the developer page at '
            'https://developers.facebook.com/apps/<YOUR APP ID>'
            '/messenger/settings/ to continue setup: ')
        access_token_file_path = '~/.parlai/messenger_token'
        expanded_file_path = os.path.expanduser(access_token_file_path)
        with open(expanded_file_path, 'w+') as access_token_file:
            access_token_file.write(token)
        return token
Example #16
    def get_app_token(self):
        """
        Find and return an app access token.
        """
        if not self.opt.get('force_telegram_bot_token'):
            if not os.path.exists(os.path.expanduser("~/.parlai/")):
                PathManager.mkdirs(os.path.expanduser("~/.parlai/"))
            access_token_file = '~/.parlai/telegram_token'
            expanded_file_path = os.path.expanduser(access_token_file)
            if os.path.exists(expanded_file_path):
                print(f"Token was read from: {expanded_file_path}")
                with open(expanded_file_path, 'r') as access_token_file:
                    return access_token_file.read()

        token = input(
            'Enter your bot\'s access token from the BotFather page at '
            'https://telegram.me/botfather/ to continue setup: '
        )
        access_token_file_path = '~/.parlai/telegram_token'
        expanded_file_path = os.path.expanduser(access_token_file_path)
        with open(expanded_file_path, 'w') as access_token_file:
            access_token_file.write(token)
        return token
Example #17
    def _download_images(self, opt: Opt):
        """
        Download available IGC images.
        """
        urls = []
        ids = []
        for dt in ['test', 'val']:
            df = os.path.join(self.get_data_path(opt), f'IGC_crowd_{dt}.csv')
            with PathManager.open(df, newline='\n') as csv_file:
                reader = csv.reader(csv_file, delimiter=',')
                fields = []
                for i, row in enumerate(reader):
                    if i == 0:
                        fields = row
                    else:
                        data = dict(zip(fields, row))
                        urls.append(data['url'])
                        ids.append(data['id'])
        PathManager.mkdirs(self.get_image_path(opt))
        # Make one blank image
        image = Image.new('RGB', (100, 100), color=0)
        image.save(os.path.join(self.get_image_path(opt), self.blank_image_id),
                   'JPEG')
        # Download the rest
        download_multiprocess(urls,
                              self.get_image_path(opt),
                              dest_filenames=ids)

        # Remove bad images
        for fp in os.listdir(self.get_image_path(opt)):
            img_path = os.path.join(self.get_image_path(opt), fp)
            if PathManager.exists(img_path):
                try:
                    Image.open(img_path).convert('RGB')
                except OSError:
                    PathManager.rm(img_path)
Example #18
    def _check_data_downloaded(self, opt):
        # Checks whether the data is downloaded properly
        # Also checks whether data is built, and builds it if so
        RESET = '\033[0m'
        RED = '\033[1;91m'
        YELLOW = '\033[1;93m'
        GREEN = '\033[1;92m'
        BLUE = '\033[1;96m'
        CYAN = '\033[1;94m'
        MAGENTA = '\033[1;95m'

        # only use colors if we're outputting to a terminal
        USE_COLORS = _sys.stdout.isatty()
        if not USE_COLORS:
            RESET = RED = YELLOW = GREEN = BLUE = CYAN = MAGENTA = ''

        # generate the rainbow stars
        rainbow = [RED, YELLOW, GREEN, CYAN, BLUE, MAGENTA]
        size = 78 // len(rainbow)
        stars = ''.join([color + '*' * size for color in rainbow])
        stars += RESET

        self.data_path = os.path.join(opt['datapath'], 'md_gender', 'yelp')
        if not os.path.exists(self.data_path):
            PathManager.mkdirs(self.data_path)
        if not PathManager.exists(
                os.path.join(self.data_path, 'valid.fader.with_cat.40000')):
            raise RuntimeError(
                f'\n\n{stars}\nThis data must be downloaded following instructions in '
                'the README here:'
                '<https://github.com/facebookresearch/MultipleAttributeTextRewriting/blob/master/data/README.md>. '
                '\nIt cannot be automatically downloaded, as one must agree to '
                'the terms outlined on the website before gaining access to the data.\n\n'
                'Once downloaded, please put the data in the following '
                f'directory: \n{self.data_path}\n{stars}')
        elif not PathManager.exists(
                os.path.join(self.data_path, 'classtrain.txt')):
            logging.info('[ Building data ... ]')
            # build train
            with open(os.path.join(self.data_path, 'classtrain.txt'),
                      'w') as f:
                for fle_num in [4000, 6000, 8000]:
                    train_fle = f'train.fader.with_cat.{fle_num}'
                    with open(os.path.join(self.data_path, train_fle)) as g:
                        lines = g.readlines()
                        for line in lines:
                            tabs = line.split('\t')
                            text = tabs[0]
                            gend = tabs[1]
                            if gend == '0':
                                f.write(f'male\t{text}\n')
                            elif gend == '1':
                                f.write(f'female\t{text}\n')

            # build valid and test
            for pair in [('dev', 'valid'), ('test', 'test')]:
                with open(
                        os.path.join(self.data_path,
                                     f'female_only.{pair[0]}.en'),
                        'w') as fem_val:
                    with open(
                            os.path.join(self.data_path,
                                         f'male_only.{pair[0]}.en'),
                            'w') as masc_val:
                        for fle_num in [4000, 6000, 8000]:
                            valid_fle = f'{pair[1]}.fader.with_cat.{fle_num}'
                            with open(os.path.join(self.data_path, valid_fle),
                                      'r') as g:
                                lines = g.readlines()
                                for line in lines:
                                    tabs = line.split('\t')
                                    text = tabs[0]
                                    gend = tabs[1]
                                    if gend == '0':
                                        masc_val.write(f'{text}\n')
                                    elif gend == '1':
                                        fem_val.write(f'{text}\n')