def get_lexicon(self):
        """
        Prepares the lexicon: creates it if it does not yet exist, then loads it.
        :return: The loaded lexicon.
        """
        self.logger.info("Preparing lexicon...")
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "lexicon_by_logical"
        file_name = "lexicon"

        if self.domain_split:
            file_name += "_domain_split"
        elif self.length_split:
            file_name += "_length_split"
        file_name += ".pkl"

        if not (dir_path / file_name).is_file():
            self.create_matching_lexicon(dir_path, file_name)
        data = load_obj(dir_path, file_name)

        # TODO delete this?
        # for type in data:
        #     for ex in data[type]:
        #         data[type][ex] = ast.literal_eval(data[type][ex])
        self.logger.info("Lexicon ready.")
        return data
Example #2
def load_vocab(dir_path, file_name):
    """
    Loads a vocabulary from a file.
    :param dir_path: The path of the directory.
    :param file_name: The name of the vocab file.
    :return: The loaded Vocab.
    """
    properties = load_obj(dir_path, file_name)
    vocab = Vocab(properties['counter'], specials=properties['specials'])

    return vocab
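
# These snippets rely on `save_obj` / `load_obj` helpers that are not shown here.
# A minimal sketch of what they might look like, assuming they are plain pickle
# wrappers with the (dir_path, obj, file_name) / (dir_path, file_name) signatures
# used above; the originals in the repository may differ.
import pickle
from pathlib import Path


def save_obj(dir_path, obj, file_name):
    """Pickle `obj` to dir_path / file_name, creating the directory if needed."""
    dir_path = Path(dir_path)
    dir_path.mkdir(parents=True, exist_ok=True)
    with open(dir_path / file_name, "wb") as f:
        pickle.dump(obj, f)


def load_obj(dir_path, file_name):
    """Load a pickled object from dir_path / file_name."""
    with open(Path(dir_path) / file_name, "rb") as f:
        return pickle.load(f)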
    def load_domain_split_dataset(self, data_dir, logger=None):
        """
        Loads the Break dataset with a domain split: train on text-based sources;
        validation and test on DB- and image-based sources.
        :param data_dir:    The path of the directory where the preprocessed dataset should be saved to or loaded from.
        :param logger:      A logger for logging events.
        :return:            The loaded dataset.
        """
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "preprocessed"
        file_name = "dataset_preprocessed_domain_split.pkl"
        if not (dir_path / file_name).is_file():
            if logger:
                logger.info('Creating domain split dataset...')
            text_domain_dataset_prefixes = ('COMQA', 'CWQ', 'DROP', 'HOTP')
            image_domain_dataset_prefixes = ('CLEVR', 'NLVR2')
            DB_domain_dataset_prefixes = ('ACADEMIC', 'ATIS', 'GEO', 'SPIDER')
            image_plus_DB = image_domain_dataset_prefixes + DB_domain_dataset_prefixes
            # pandas.DataFrame.append was removed in pandas 2.0; collect the matching
            # examples in lists and build the DataFrames in one pass instead.
            train_rows, validation_rows, test_rows = [], [], []

            for example in self.dataset_logical['train']:
                if example['question_id'].startswith(text_domain_dataset_prefixes):
                    train_rows.append(example)
            for example in self.dataset_logical['validation']:
                if example['question_id'].startswith(image_plus_DB):
                    validation_rows.append(example)
            for example in self.dataset_logical['test']:
                if example['question_id'].startswith(image_plus_DB):
                    test_rows.append(example)

            train_filtered = pd.DataFrame(train_rows)
            validation_filtered = pd.DataFrame(validation_rows)
            test_filtered = pd.DataFrame(test_rows)

            # TODO delete this?
            # train_dataset = self.dataset_logical['train'].filter(
            #     lambda example: example['question_id'].startswith(text_domain_dataset_prefixes))
            # validation_dataset = self.dataset_logical['validation'].filter(
            #     lambda example: example['question_id'].startswith(image_plus_DB))
            # test_dataset = self.dataset_logical['test'].filter(
            #     lambda example: example['question_id'].startswith(image_plus_DB))
            # train_filtered_ds = Dataset.from_pandas(train_filtered)
            to_save = {
                'train': Dataset.from_pandas(train_filtered),
                'validation': Dataset.from_pandas(validation_filtered),
                'test': Dataset.from_pandas(test_filtered)
            }
            save_obj(dir_path, to_save, file_name)

        dataset = load_obj(dir_path, file_name)
        return dataset
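
    # A shorter, filter-based variant of the domain split above, along the lines of
    # the commented-out TODO. This is only a sketch (hypothetical helper name); it
    # assumes `self.dataset_logical` is a Hugging Face `DatasetDict` and is not the
    # original implementation.
    def _domain_split_via_filter(self, text_prefixes, image_plus_DB):
        return {
            'train': self.dataset_logical['train'].filter(
                lambda ex: ex['question_id'].startswith(text_prefixes)),
            'validation': self.dataset_logical['validation'].filter(
                lambda ex: ex['question_id'].startswith(image_plus_DB)),
            'test': self.dataset_logical['test'].filter(
                lambda ex: ex['question_id'].startswith(image_plus_DB)),
        }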
    def get_programs(self):
        """
        Loads the programs from a file.
        :return: The loaded programs.
        """
        self.logger.info("Preparing programs...")
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "programs"

        file_name = "programs_" + self.dataset_split + ".pkl"
        if not (dir_path / file_name).is_file():
            self.create_matching_programs(dir_path, file_name)
        data = load_obj(dir_path, file_name)

        self.logger.info("Programs ready.")
        return data
    def load_length_split_dataset(self, data_dir, logger=None):
        """
        Loads the Break dataset with a length split based on the number of operators:
        train on examples with at most 4 steps, validation and test on examples with
        more than 4 steps.
        :param data_dir: The path of the directory where the preprocessed dataset should be saved to or loaded from.
        :param logger: A logger for logging events.
        :return: The loaded dataset.
        """
        # TODO datadir required in signature?
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "preprocessed"
        file_name = "dataset_preprocessed_length_split.pkl"

        if not (dir_path / file_name).is_file():
            if logger:
                logger.info('Creating length split dataset...')
            threshold_amount_ops = 4

            # pandas.DataFrame.append was removed in pandas 2.0; collect the matching
            # examples in lists and build the DataFrames in one pass instead.
            train_rows, validation_rows, test_rows = [], [], []

            for example in self.dataset_logical['train']:
                if example['operators'].count(',') < threshold_amount_ops:
                    train_rows.append(example)
            for example in self.dataset_logical['validation']:
                if example['operators'].count(',') >= threshold_amount_ops:
                    validation_rows.append(example)
            for example in self.dataset_logical['test']:
                if example['operators'].count(',') >= threshold_amount_ops:
                    test_rows.append(example)

            train_filtered = pd.DataFrame(train_rows)
            validation_filtered = pd.DataFrame(validation_rows)
            test_filtered = pd.DataFrame(test_rows)

            to_save = {
                'train': Dataset.from_pandas(train_filtered),
                'validation': Dataset.from_pandas(validation_filtered),
                'test': Dataset.from_pandas(test_filtered)
            }
            save_obj(dir_path, to_save, file_name)

        dataset = load_obj(dir_path, file_name)
        return dataset
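
    # Worked example of the threshold check above (illustrative string, not taken from
    # the dataset). The 'operators' field appears to be a stringified list (cf. the
    # ast.literal_eval TODO in get_lexicon), so the comma count is the step count minus one:
    #   ops = "['select', 'filter', 'project', 'aggregate', 'group']"    # 5 steps
    #   ops.count(',')                                                   # -> 4
    #   ops.count(',') >= threshold_amount_ops                           # True -> validation/test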
    def load_dataset(data_dir, dataset_split, logger=None):
        """
        Loads the requested Break dataset from Hugging Face.
        :param data_dir: The path of the directory where the preprocessed dataset should be saved to or loaded from.
        :param dataset_split: The type of dataset to download from HF.
        :param logger: A logger for logging events.
        :return: The loaded dataset.
        """
        current_dir = Path()
        dir_path = current_dir / "data" / "break_data" / "preprocessed"
        file_name = "dataset_preprocessed_" + dataset_split + ".pkl"
        if not (dir_path / file_name).is_file():
            # Download and preprocess the BREAK dataset (logical form and lexicon), and save the preprocessed data.
            if logger:
                logger.info('Downloading and preparing datasets...')
            dataset_logical = load_dataset('break_data',
                                           dataset_split,
                                           cache_dir=data_dir)
            save_obj(dir_path, dataset_logical, file_name)

        # Load the saved preprocessed data.
        dataset = load_obj(dir_path, file_name)
        return dataset
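
    # Hypothetical usage (names assumed for illustration; check the exact configuration
    # names on the Hugging Face "break_data" dataset card, e.g. 'logical-forms'):
    #   dataset = load_dataset(data_dir="data/cache", dataset_split="logical-forms")
    #   print(dataset['train'][0])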
Example #7
        saver_pose_seg_depth.restore(sess, restore_dir)
        print('restore pose_seg_depth succeed')
    else:
        print('Restore pose_seg_depth failed')
        raise SystemExit

    restore_dir = tf.train.latest_checkpoint(params.normal_dir)
    if restore_dir:
        saver_normal.restore(sess, restore_dir)
        print('restore normal succeed')
    else:
        print('Restore normal failed')
        raise SystemExit

    # Initialize datalist
    meanstd = util.load_obj(params.meanRgb_dir)
    dataset = glob.glob(params.test_dir + '/*.jpg')
    img_batch = np.zeros((params.batch_size, 256, 256, 3), dtype=np.float32)
    mask_batch = np.zeros((params.batch_size, 256, 256), dtype=bool)  # np.bool was removed in NumPy 1.24
    segmap = np.zeros((params.batch_size, 256, 256), dtype=np.int32)
    testimg = np.zeros((256, 256, 3), dtype=np.float32)

    images = os.listdir(params.test_dir)
    for each in images:
        i = 0
        imgname = os.path.join(params.test_dir, each)

        try:
            im = Image.open(imgname)
            # do stuff
        except IOError: