def run(self):
    init_logger(LOGGER_NAME)
    logger.info("Preparing `%s`." % self.inp_file_path)
    # select the reader that matches the configured dataset
    if GlobalConfig().dataset == 'amazon':
        data_iter = read_amazon_data(self.inp_file_path, replace_xml=True)
    else:
        data_iter = read_yelp_data(self.inp_file_path)
    # write one tab-separated CSV file per group
    for group_id, dus in data_iter:
        full_file_name = "%s.csv" % group_id
        out_file_path = comb_paths(self.act_out_dir_path, full_file_name)
        write_group_to_csv(out_file_path, dus, sep='\t')

def __init__(self, *args, **kwargs):
    super(PrepareFile, self).__init__(*args, **kwargs)
    self.act_out_dir_path = get_act_out_dir_path(
        GlobalConfig().out_dir_path, self.inp_file_path, FOLDER_NAME)
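The helper `write_group_to_csv` is not shown in these examples; a minimal sketch of what a tab-separated group writer could look like, assuming Python 3 and dict-like data units (the field handling is an assumption, not the project's actual implementation):

import csv

def write_group_to_csv(out_file_path, data_units, sep='\t'):
    # Hypothetical sketch: write dict-like data units to a sep-delimited
    # file, one row per unit, using the keys of the first unit as header.
    with open(out_file_path, 'w', newline='') as out_f:
        writer = csv.DictWriter(out_f, fieldnames=list(data_units[0].keys()),
                                delimiter=sep)
        writer.writeheader()
        for unit in data_units:
            writer.writerow(unit)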
Example 3
    def run(self):
        log_file_path = comb_paths(
            GlobalConfig().out_dir_path, "logs", FOLDER_NAME,
            "%s.txt" % get_file_name(self.inp_file_path))
        init_logger(LOGGER_NAME, output_path=log_file_path)

        init_unit_count = 0
        init_group_count = 0

        group_id_to_units = {}
        group_unit_counts = []

        # 1. reading data and filtering out short/long reviews and
        # unpopular groups
        inp_dir = self.input()[0]
        logger.info("Subsampling `%s`." % inp_dir.path)
        for inp_file_path in iter_file_paths(inp_dir.path):
            group_id = get_file_name(inp_file_path)
            group_units = []
            init_group_count += 1
            for data_unit in read_csv_file(inp_file_path, sep='\t'):
                init_unit_count += 1
                rev_text = data_unit[OutputFields.REV_TEXT].split()

                # removing reviews that are too short or too long
                if len(rev_text) < self.min_rev_len or \
                        len(rev_text) > self.max_rev_len:
                    continue

                group_units.append(data_unit)

            # removing unpopular groups
            if len(group_units) < self.min_revs:
                continue

            group_id_to_units[group_id] = group_units
            group_unit_counts.append(len(group_units))

        if not group_id_to_units:
            raise ValueError("No groups to proceed.")

        # 2. filtering by percentile
        perc = np.percentile(group_unit_counts, self.percentile)

        # removing groups at or above the k-th percentile
        subs_group_id_to_units = {}
        subs_units_count = 0
        subs_units_max_count = 0

        for group_id, group_units in group_id_to_units.items():
            if len(group_units) < perc or perc == 1.:

                # making sure that the subsampled number of reviews does not
                # exceed a threshold unless most of the businesses only have
                # one review
                if self.max_total_revs is not None and \
                        subs_units_count + len(group_units) > self.max_total_revs:
                    break

                subs_units_count += len(group_units)
                subs_units_max_count = max(subs_units_max_count,
                                           len(group_units))

                subs_group_id_to_units[group_id] = group_units

        if subs_units_count == 0:
            raise ValueError("All units were subsampled out. "
                             "Please adjust the parameters.")

        # 3. dumping to files
        write_groups_to_csv(self.act_out_dir_path,
                            subs_group_id_to_units,
                            sep='\t')

        # 4. logging statistics
        stats = OrderedDict()
        stats['General'] = OrderedDict()
        stats['General']['inp dir'] = inp_dir.path

        stats['Initial'] = OrderedDict()
        stats['Initial']['group count'] = init_group_count
        stats['Initial']['unit count'] = init_unit_count

        stats['After Filtering'] = OrderedDict()
        stats['After Filtering']['group count'] = len(group_id_to_units)
        stats['After Filtering']['unit count'] = np.sum(group_unit_counts)
        stats['After Filtering']['percentile count'] = perc

        stats['After Subsampling'] = OrderedDict()
        stats['After Subsampling']['group count'] = len(subs_group_id_to_units)
        stats['After Subsampling']['unit count'] = subs_units_count
        stats['After Subsampling'][
            'max units per group'] = subs_units_max_count

        stats_str = format_stats(stats)

        logger.info(stats_str)
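Step 2 above caps group sizes with `np.percentile`: groups whose review count is at or above the chosen percentile of all group sizes are dropped. A minimal standalone sketch of that idea, using made-up counts:

import numpy as np

group_unit_counts = [3, 5, 8, 40, 120]   # illustrative review counts per group
perc = np.percentile(group_unit_counts, 90)

# keep only groups below the percentile threshold (or everything if perc == 1.)
kept = [c for c in group_unit_counts if c < perc or perc == 1.]
print("threshold=%s kept=%s" % (perc, kept))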
Example 4
    def run(self):

        log_file_path = comb_paths(GlobalConfig().out_dir_path, "logs",
                                   "partitioning.txt")
        init_logger(LOGGER_NAME, output_path=log_file_path)

        excluded_group_count = 0
        list_group_units = []

        # tracking duplicate groups as one group can be in multiple categories
        group_ids = set()
        dup_group_count = 0

        curr_unit_count = 0
        # reading data and excluding some groups
        inp_dirs = []
        for inp_dir in self.input():
            inp_dirs.append(inp_dir.path)
            for inp_group_file_path in get_file_paths(inp_dir.path):
                group_id = get_file_name(inp_group_file_path)
                if group_id in group_ids:
                    dup_group_count += 1
                    continue
                group_ids.add(group_id)
                if self._is_excluded(group_id):
                    excluded_group_count += 1
                    continue
                units = list(read_csv_file(inp_group_file_path, sep='\t'))
                list_group_units.append(units)

        # partitioning
        logger.info("Partitioning `%s`." % " ".join([idp for idp in inp_dirs]))
        tr_part, \
        val_part, \
        test_part = partition(list_group_units, train_part=self.train_part,
                              val_part=self.val_part, test_part=self.test_part)

        # dumping to the storage
        for title, part in zip(['train', 'val', 'test'],
                               [tr_part, val_part, test_part]):
            if len(part):
                for group_units in part:
                    group_id = group_units[0][OutputFields.GROUP_ID]
                    group_file_path = comb_paths(self.act_out_dir_path, title,
                                                 '%s.csv' % group_id)
                    write_group_to_csv(group_file_path, group_units, sep='\t')

        # logging stats
        train_rev_count = np.sum([len(gr) for gr in tr_part])
        val_rev_count = np.sum([len(gr) for gr in val_part])
        test_rev_count = np.sum([len(gr) for gr in test_part])

        stats = OrderedDict()
        stats['General'] = OrderedDict()
        stats['General']['excluded_group_count'] = excluded_group_count
        stats['General']['duplicate_group_count'] = dup_group_count
        stats['General']['train_groups'] = len(tr_part)
        stats['General']['train_rev_count'] = train_rev_count
        stats['General']['val_groups'] = len(val_part)
        stats['General']['val_rev_count'] = val_rev_count
        stats['General']['test_groups'] = len(test_part)
        stats['General']['test_rev_count'] = test_rev_count

        logger.info(format_stats(stats))
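The `partition` helper used above is not shown in these examples; a hypothetical sketch of a group-level train/val/test split by fractions (the function name, shuffling, and seed are assumptions):

import random

def partition_sketch(list_group_units, train_part, val_part, test_part, seed=42):
    # Hypothetical sketch: shuffle groups and split them by the given fractions.
    assert abs(train_part + val_part + test_part - 1.0) < 1e-6
    groups = list(list_group_units)
    random.Random(seed).shuffle(groups)
    n_train = int(len(groups) * train_part)
    n_val = int(len(groups) * val_part)
    return (groups[:n_train],
            groups[n_train:n_train + n_val],
            groups[n_train + n_val:])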
Example 5
def __init__(self, *args, **kwargs):
    super(Partition, self).__init__(*args, **kwargs)
    self.act_out_dir_path = os.path.join(GlobalConfig().out_dir_path,
                                         FOLDER_NAME)
Example 6
def _is_excluded(self, group_id):
    if GlobalConfig().dataset == 'amazon':
        return group_id in AmazonFields.EXCLUDED_GROUP_IDS
    if GlobalConfig().dataset == 'yelp':
        return group_id in YelpFields.EXCLUDED_GROUP_IDS
    return False