def run(self):
    init_logger(LOGGER_NAME)
    logger.info("Preparing `%s`." % self.inp_file_path)
    if GlobalConfig().dataset == 'amazon':
        data_iter = read_amazon_data(self.inp_file_path, replace_xml=True)
    else:
        data_iter = read_yelp_data(self.inp_file_path)
    # each group gets its own tab-separated CSV file
    for group_id, dus in data_iter:
        full_file_name = "%s.csv" % group_id
        out_file_path = comb_paths(self.act_out_dir_path, full_file_name)
        write_group_to_csv(out_file_path, dus, sep='\t')
def __init__(self, *args, **kwargs):
    super(PrepareFile, self).__init__(*args, **kwargs)
    self.act_out_dir_path = get_act_out_dir_path(
        GlobalConfig().out_dir_path, self.inp_file_path, FOLDER_NAME)
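# A minimal sketch of what `write_group_to_csv` (used above) is assumed to
# do: serialize one group's data units (dicts keyed by output field names)
# into a tab-separated CSV file. This helper is hypothetical and only
# illustrates the expected file layout; the project's actual helper may
# differ, e.g., in field ordering or quoting.
def _write_group_to_csv_sketch(out_file_path, data_units, sep='\t'):
    import csv
    import os
    dir_path = os.path.dirname(out_file_path)
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    with open(out_file_path, 'w', encoding='utf-8', newline='') as f:
        # all data units of a group are assumed to share the same fields
        writer = csv.DictWriter(f, fieldnames=list(data_units[0]),
                                delimiter=sep)
        writer.writeheader()
        writer.writerows(data_units)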
def run(self):
    log_file_path = comb_paths(GlobalConfig().out_dir_path, "logs",
                               FOLDER_NAME,
                               "%s.txt" % get_file_name(self.inp_file_path))
    init_logger(LOGGER_NAME, output_path=log_file_path)

    init_unit_count = 0
    init_group_count = 0
    group_id_to_units = {}
    group_unit_counts = []

    # 1. reading data and filtering out short/long reviews and
    # unpopular groups
    inp_dir = self.input()[0]
    logger.info("Subsampling `%s`." % inp_dir.path)
    for inp_file_path in iter_file_paths(inp_dir.path):
        group_id = get_file_name(inp_file_path)
        group_units = []
        init_group_count += 1
        for data_unit in read_csv_file(inp_file_path, sep='\t'):
            init_unit_count += 1
            rev_text = data_unit[OutputFields.REV_TEXT].split()
            # removing reviews that are too short or too long
            if len(rev_text) < self.min_rev_len \
                    or len(rev_text) > self.max_rev_len:
                continue
            group_units.append(data_unit)
        # removing unpopular groups
        if len(group_units) < self.min_revs:
            continue
        group_id_to_units[group_id] = group_units
        group_unit_counts.append(len(group_units))

    if not group_id_to_units:
        raise ValueError("No groups to proceed.")

    # 2. filtering by percentile: removing groups above the kth percentile
    perc = np.percentile(group_unit_counts, self.percentile)

    subs_group_id_to_units = {}
    subs_units_count = 0
    subs_units_max_count = 0
    for group_id, group_units in group_id_to_units.items():
        if len(group_units) < perc or perc == 1.:
            # making sure that the subsampled number of reviews does not
            # exceed a threshold, unless most of the businesses have only
            # one review
            if self.max_total_revs is not None \
                    and subs_units_count + len(group_units) \
                    > self.max_total_revs:
                break
            subs_units_count += len(group_units)
            subs_units_max_count = max(subs_units_max_count,
                                       len(group_units))
            subs_group_id_to_units[group_id] = group_units

    if subs_units_count == 0:
        raise ValueError("All units were subsampled out. "
                         "Please adjust the parameters.")

    # 3. dumping to files
    write_groups_to_csv(self.act_out_dir_path, subs_group_id_to_units,
                        sep='\t')

    # 4. logging statistics
    stats = OrderedDict()
    stats['General'] = OrderedDict()
    stats['General']['inp dir'] = inp_dir.path
    stats['Initial'] = OrderedDict()
    stats['Initial']['group count'] = init_group_count
    stats['Initial']['unit count'] = init_unit_count
    stats['After Filtering'] = OrderedDict()
    stats['After Filtering']['group count'] = len(group_id_to_units)
    stats['After Filtering']['unit count'] = np.sum(group_unit_counts)
    stats['After Filtering']['percentile count'] = perc
    stats['After Subsampling'] = OrderedDict()
    stats['After Subsampling']['group count'] = len(subs_group_id_to_units)
    stats['After Subsampling']['unit count'] = subs_units_count
    stats['After Subsampling']['max units per group'] = subs_units_max_count
    logger.info(format_stats(stats))
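# A small self-check of the percentile cutoff from step 2 above, assuming
# `np` is NumPy as imported by this module. With per-group review counts
# [2, 3, 3, 4, 50] and percentile=90, np.percentile returns 31.6 (linear
# interpolation), so the 50-review group is dropped and the rest are kept.
def _percentile_cutoff_example():
    counts = [2, 3, 3, 4, 50]
    perc = np.percentile(counts, 90)  # 31.6
    # mirrors the keep condition used in step 2
    kept = [c for c in counts if c < perc or perc == 1.]
    assert kept == [2, 3, 3, 4]
    return perc, kept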
def run(self):
    log_file_path = comb_paths(GlobalConfig().out_dir_path, "logs",
                               "partitioning.txt")
    init_logger(LOGGER_NAME, output_path=log_file_path)

    excluded_group_count = 0
    list_group_units = []
    # tracking duplicate groups, as one group can appear in multiple
    # categories
    group_ids = set()
    dup_group_count = 0

    # reading data and excluding some groups
    inp_dirs = []
    for inp_dir in self.input():
        inp_dirs.append(inp_dir.path)
        for inp_group_file_path in get_file_paths(inp_dir.path):
            group_id = get_file_name(inp_group_file_path)
            if group_id in group_ids:
                dup_group_count += 1
                continue
            group_ids.add(group_id)
            if self._is_excluded(group_id):
                excluded_group_count += 1
                continue
            units = list(read_csv_file(inp_group_file_path, sep='\t'))
            list_group_units.append(units)

    # partitioning
    logger.info("Partitioning `%s`." % " ".join(inp_dirs))
    tr_part, val_part, test_part = partition(list_group_units,
                                             train_part=self.train_part,
                                             val_part=self.val_part,
                                             test_part=self.test_part)

    # dumping to the storage
    for title, part in zip(['train', 'val', 'test'],
                           [tr_part, val_part, test_part]):
        for group_units in part:
            group_id = group_units[0][OutputFields.GROUP_ID]
            group_file_path = comb_paths(self.act_out_dir_path, title,
                                         '%s.csv' % group_id)
            write_group_to_csv(group_file_path, group_units, sep='\t')

    # logging stats
    train_rev_count = np.sum([len(gr) for gr in tr_part])
    val_rev_count = np.sum([len(gr) for gr in val_part])
    test_rev_count = np.sum([len(gr) for gr in test_part])

    stats = OrderedDict()
    stats['General'] = OrderedDict()
    stats['General']['excluded_group_count'] = excluded_group_count
    stats['General']['duplicate_group_count'] = dup_group_count
    stats['General']['train_groups'] = len(tr_part)
    stats['General']['train_rev_count'] = train_rev_count
    stats['General']['val_groups'] = len(val_part)
    stats['General']['val_rev_count'] = val_rev_count
    stats['General']['test_groups'] = len(test_part)
    stats['General']['test_rev_count'] = test_rev_count
    logger.info(format_stats(stats))
def __init__(self, *args, **kwargs):
    super(Partition, self).__init__(*args, **kwargs)
    self.act_out_dir_path = os.path.join(GlobalConfig().out_dir_path,
                                         FOLDER_NAME)
def _is_excluded(self, group_id):
    if GlobalConfig().dataset == 'amazon':
        return group_id in AmazonFields.EXCLUDED_GROUP_IDS
    if GlobalConfig().dataset == 'yelp':
        return group_id in YelpFields.EXCLUDED_GROUP_IDS
    return False
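# A minimal sketch of the group-level split that `partition` (used in `run`
# above) is assumed to perform: shuffle whole groups, so that all reviews of
# a group land in the same fold, then slice by the given fractions. The
# shuffling, seeding, and rounding here are assumptions; the project's
# actual `partition` helper may differ.
def _partition_sketch(list_group_units, train_part, val_part, test_part,
                      seed=42):
    import random
    # the three fractions are expected to sum to one
    assert abs(train_part + val_part + test_part - 1.) < 1e-6
    groups = list(list_group_units)
    random.Random(seed).shuffle(groups)
    tr_end = int(len(groups) * train_part)
    val_end = tr_end + int(len(groups) * val_part)
    return groups[:tr_end], groups[tr_end:val_end], groups[val_end:]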