def main():
    """Group the sample CSVs of every CRT symbol into distinct patterns.

    For each entry of ``ELFCRTSymbols`` the files in ``arg.data_folder`` whose
    names start with ``'CRT_999_' + crt`` are loaded with ``utils.toDf`` and
    compared (ignoring the ``'module name'`` and ``'target'`` columns) against
    the currently active pattern:

    * an identical frame bumps the counter of the active pattern;
    * a differing frame is diffed with ``show_diff`` and then matched against
      every pattern recorded so far -- a match becomes the active pattern and
      has its counter bumped, otherwise the frame is stored as a new pattern.

    The per-symbol result of ``show_crt_summary`` is collected and finally
    handed to ``patterns_output`` together with ``arg.out_dir``.
    """
    ignored_columns = ['module name', 'target']

    def remember(patterns, name, frame, path):
        # Store a newly seen pattern and make it the active one.
        patterns[name] = {'count': 1, 'df': frame, 'path': path}
        patterns['current_key'] = name

    arg = get_args()
    summary = {}
    samples_stat = {}

    for crt in ELFCRTSymbols:
        # ``patterns`` maps pattern name -> record; the extra 'current_key'
        # entry holds the name of the active pattern.
        patterns = summary[crt] = {}
        base = None
        print("##### %s ######" % crt)
        prefix = 'CRT_999_' + crt

        for f in os.listdir(arg.data_folder):
            full_path = os.path.join(arg.data_folder, f)
            if not (os.path.isfile(full_path) and f.startswith(prefix)):
                continue

            # The pattern name is the third '_999_'-separated field of the
            # file name, without the '.csv' suffix.
            key = f.split('_999_')[2].replace('.csv', '')

            if base is None:
                # First matching file: it defines the initial pattern.
                base = utils.toDf(full_path)
                remember(patterns, key, base, full_path)
                continue

            data = utils.toDf(full_path)
            base_df = base.drop(columns=ignored_columns)
            data_df = data.drop(columns=ignored_columns)

            if base_df.equals(data_df):
                # Same as the active pattern: just count it.
                patterns[patterns['current_key']]['count'] += 1
                continue

            show_diff(base_df, data_df)
            for old_key, entry in patterns.items():
                if old_key == 'current_key':
                    continue
                if entry['df'].drop(columns=ignored_columns).equals(data_df):
                    # Known pattern: switch to it and count this sample.
                    base = entry['df']
                    patterns['current_key'] = old_key
                    entry['count'] += 1
                    break
            else:
                # No recorded pattern matches: this sample starts a new one.
                base = data
                remember(patterns, key, base, full_path)

        samples_stat[crt] = show_crt_summary(summary, crt)

    patterns_output(samples_stat, arg.out_dir)
def __init__(self, args, round_no):
    """Copy the run settings from *args* and load the input CSV tables.

    Args:
        args: Object exposing ``data_folder``, ``hash_tags``, ``with_tags``,
            ``output_folder``, ``kbeans``, ``depth``, ``min_leaf_samples``
            and ``tag_nums`` attributes.
        round_no: Value stored as ``self.round``.

    ``movies.csv``, ``ratings.csv`` and ``tags.csv`` are read from the data
    folder through ``utils.toDf``; ``self.abt`` starts as a copy of the
    ``movieId``, ``title`` and ``genres`` columns of the movies table.
    """
    # Settings taken verbatim from the argument object.
    self.datapath = args.data_folder
    self.hashTags = args.hash_tags
    self.withTags = args.with_tags
    self.output = args.output_folder
    self.enableKbeans = args.kbeans
    self.round = round_no
    self.depth = args.depth
    self.min_leaf_samples = args.min_leaf_samples
    self.tag_nums = args.tag_nums

    def read_table(file_name):
        # Load one CSV that lives directly inside the data folder.
        return utils.toDf(os.path.join(self.datapath, file_name))

    self.movies = read_table('movies.csv')
    movie_total = len(self.movies)
    print("total movies to be predict, %d" % movie_total)
    self.ratings = read_table('ratings.csv')
    self.tags = read_table('tags.csv')

    # Independent copy so later edits to abt do not touch self.movies.
    wanted_columns = ['movieId', 'title', 'genres']
    self.abt = self.movies[wanted_columns].copy()
def load_data(data_folder, trial_mode, trial_limit=30000):
    """Load the ``.csv`` files found directly in *data_folder* into one frame.

    Frames are collected in a list and concatenated once at the end instead
    of re-concatenating the growing result for every file, which copied all
    previously loaded rows again on each iteration.

    Args:
        data_folder: Directory to scan (not recursive). Entries that are not
            regular files, or whose path does not end in ``.csv``, are
            skipped.
        trial_mode: When truthy, stop reading further files as soon as more
            than *trial_limit* rows have been collected.
        trial_limit: Row threshold used by *trial_mode*; defaults to the
            previously hard-coded 30000.

    Returns:
        An empty ``pd.DataFrame()`` when no rows were loaded from any file,
        the single loaded frame unchanged when only one contributed, and
        otherwise the concatenation of all frames with a fresh index.
    """
    frames = []
    total_rows = 0
    for filename in os.listdir(data_folder):
        fullpath = os.path.join(data_folder, filename)
        if not (os.path.isfile(fullpath) and fullpath.endswith('.csv')):
            continue

        # NOTE(review): utils.toDf presumably returns a pandas DataFrame
        # for the given CSV path -- confirm against the utils module.
        data = utils.toDf(fullpath)

        if total_rows == 0:
            # Nothing with rows collected so far: restart from this frame,
            # as the original did while the accumulated frame was empty.
            frames = [data]
        else:
            frames.append(data)
        total_rows += len(data)

        print('Import %d records in dataframe' % total_rows)
        if trial_mode and total_rows > trial_limit:
            break

    if not frames:
        return pd.DataFrame()
    if len(frames) == 1:
        # A single frame is returned as loaded, index untouched.
        return frames[0]
    return pd.concat(frames, ignore_index=True)