Esempio n. 1
0
    def create_vec_maps(self,txts:dict=None,cats:dict=None):
        """ Maps text and categories to their vector representation.

        If both pickled mappings already exist under
        ``<dataset_dir>/<dataset_name>/`` they are loaded and returned as-is;
        otherwise they are generated via ``self.txt_process`` and pickled to
        that directory for later calls.

        :param txts: Samples passed to ``gen_sample2vec_map``. Only used when
            the mappings have to be generated.
        :param cats: Categories; inverted with ``File_Util.inverse_dict_elm``
            before being passed to ``gen_cats2vec_map``. Only used when the
            mappings have to be generated.
            NOTE: if either ``txts`` or ``cats`` is None, BOTH are replaced by
            the values returned from ``self.load_raw_data``.
        :return: Tuple ``(txts2vec_map, cats2vec_map)``.
        """
        ## Build every name / path once so that the existence check, the log
        ## messages and the actual load / save cannot drift apart.
        map_dir = join(self.dataset_dir,self.dataset_name)
        txts_map_name = self.dataset_name + "_txts2vec_map"
        cats_map_name = self.dataset_name + "_cats2vec_map"
        txts_map_path = join(map_dir,txts_map_name + ".pkl")
        cats_map_path = join(map_dir,cats_map_name + ".pkl")

        logger.debug(txts_map_path)
        if isfile(txts_map_path) and isfile(cats_map_path):
            logger.info("Loading pre-processed mappings from: [{}] and [{}]"
                        .format(txts_map_path,cats_map_path))
            ## NOTE(review): these pickles are a local cache written by this
            ## same method; never point dataset_dir at untrusted files.
            txts2vec_map = File_Util.load_pickle(txts_map_name,filepath=map_dir)
            cats2vec_map = File_Util.load_pickle(cats_map_name,filepath=map_dir)
        else:
            if txts is None or cats is None:
                ## Only the first and last of the four returned values are needed here.
                txts,_,_,cats = self.load_raw_data(load_type='all',return_values=True)
            ## Generate txts2vec_map and cats2vec_map
            logger.info("Generating pre-processed mappings.")
            txts2vec_map = self.txt_process.gen_sample2vec_map(txts=txts)
            catid2cattxt = File_Util.inverse_dict_elm(cats)
            cats2vec_map = self.txt_process.gen_cats2vec_map(cats=catid2cattxt)

            logger.info("Saving pre-processed mappings to: [{}] and [{}]"
                        .format(txts_map_path,cats_map_path))
            File_Util.save_pickle(txts2vec_map,txts_map_name,filepath=map_dir)
            File_Util.save_pickle(cats2vec_map,cats_map_name,filepath=map_dir)
        return txts2vec_map,cats2vec_map
Esempio n. 2
0
    def split_data(self,
                   txts: OrderedDict,
                   classes: OrderedDict,
                   categories: OrderedDict,
                   test_split: int = config["data"]["test_split"],
                   val_split: int = config["data"]["val_split"]):
        """ Splits input data into train, val and test.

        The test portion is split off first; the validation portion is then
        split off from whatever remains.

        :param txts: Samples; split in parallel with ``classes``.
        :param classes: Mapping from sample to an iterable of category ids.
        :param categories: Category mapping; inverted with
            ``File_Util.inverse_dict_elm`` when no
            ``<dataset_name>_catid2cattxt_map.json`` file exists in
            ``self.dataset_dir``.
        :param test_split: Test split size. Multiplied by ``len(classes)`` and
            truncated to int to obtain the test sample count (presumably a
            fraction in [0, 1] - confirm against config).
        :param val_split: Validation split size. Multiplied by the number of
            samples remaining after the test split and truncated to int.
        :return: 10-tuple ``(txts_train, sample2cats_train, cats_train,
            txts_val, sample2cats_val, cats_val, txts_test, sample2cats_test,
            cats_test, catid2cattxt_map)``.
        :raises KeyError: If a sample references a category id that is missing
            from ``catid2cattxt_map``.
        """
        def _cats_in_split(sample2cats, catid2cattxt):
            """ Maps each category id used in the split to its text, in first-seen order. """
            split_cats = OrderedDict()
            for cat_ids in sample2cats.values():
                for cat_id in cat_ids:
                    if cat_id not in split_cats:
                        split_cats[cat_id] = catid2cattxt[cat_id]
            return split_cats

        logger.info("Total number of samples: [{}]".format(len(classes)))
        sample2cats_train,sample2cats_test,txts_train,txts_test =\
            File_Util.split_dict(classes,txts,
                                 batch_size=int(len(classes) * test_split))
        logger.info("Test count: [{}]. Remaining count: [{}]".format(
            len(sample2cats_test), len(sample2cats_train)))

        ## Validation is carved out of the data left after removing test.
        sample2cats_train,sample2cats_val,txts_train,txts_val =\
            File_Util.split_dict(sample2cats_train,txts_train,
                                 batch_size=int(len(txts_train) * val_split))
        logger.info("Validation count: [{}]. Train count: [{}]".format(
            len(sample2cats_val), len(sample2cats_train)))

        if isfile(
                join(self.dataset_dir,
                     self.dataset_name + "_catid2cattxt_map.json")):
            catid2cattxt_map = File_Util.load_json(self.dataset_name +
                                                   "_catid2cattxt_map",
                                                   filepath=self.dataset_dir)
            # Integer keys are converted to string when saving as JSON. Converting back to integer.
            catid2cattxt_map = OrderedDict(
                (int(k), v) for k, v in catid2cattxt_map.items())
        else:
            logger.info("Generating inverted categories.")
            catid2cattxt_map = File_Util.inverse_dict_elm(categories)

        logger.info("Creating train categories.")
        cats_train = _cats_in_split(sample2cats_train, catid2cattxt_map)

        logger.info("Creating validation categories.")
        cats_val = _cats_in_split(sample2cats_val, catid2cattxt_map)

        logger.info("Creating test categories.")
        cats_test = _cats_in_split(sample2cats_test, catid2cattxt_map)
        return txts_train, sample2cats_train, cats_train, txts_val, sample2cats_val, cats_val, txts_test, sample2cats_test, cats_test, catid2cattxt_map