コード例 #1
0
def main():
    """Group the CRT sample CSV files by content and emit the pattern report.

    For every symbol in ``ELFCRTSymbols`` the regular files in the data
    folder whose names start with ``'CRT_999_' + symbol`` are loaded with
    ``utils.toDf``.  Two files count as the same pattern when their frames
    are equal once the 'module name' and 'target' columns are dropped.

    ``summary[symbol]`` maps each pattern key (taken from the file name) to a
    record holding the number of files with that pattern ('count'), the frame
    of the first such file ('df') and that file's path ('path').  The extra
    'current_key' entry names the pattern of the most recently processed
    file.  The per-symbol result of ``show_crt_summary`` is collected and
    handed to ``patterns_output`` together with the output directory.
    """
    def new_record(frame, path):
        # A pattern starts with a single occurrence: the file that defined it.
        return {'count': 1, 'df': frame, 'path': path}

    args = get_args()
    ignored_columns = ['module name', 'target']
    summary = {}
    samples_stat = {}

    for crt in ELFCRTSymbols:
        patterns = summary[crt] = {}
        reference = None  # frame of the pattern the previous file matched
        print("##### %s ######" % crt)

        for entry in os.listdir(args.data_folder):
            candidate = os.path.join(args.data_folder, entry)
            if not (os.path.isfile(candidate)
                    and entry.startswith('CRT_999_' + crt)):
                continue

            # Pattern key: third '_999_'-separated field, without '.csv'.
            key = entry.split('_999_')[2].replace('.csv', '')

            if reference is None:
                # First matching file for this symbol defines the first pattern.
                reference = utils.toDf(candidate)
                patterns[key] = new_record(reference, candidate)
                patterns['current_key'] = key
                continue

            sample = utils.toDf(candidate)
            reference_view = reference.drop(columns=ignored_columns)
            sample_view = sample.drop(columns=ignored_columns)

            if reference_view.equals(sample_view):
                # Same pattern as the previous file: just count it.
                patterns[patterns['current_key']]['count'] += 1
                continue

            show_diff(reference_view, sample_view)

            # Look for an already recorded pattern with the same content.
            for known_key in patterns:
                if known_key == 'current_key':
                    continue
                known_view = patterns[known_key]['df'].drop(
                    columns=ignored_columns)
                if known_view.equals(sample_view):
                    reference = patterns[known_key]['df']
                    patterns['current_key'] = known_key
                    patterns[known_key]['count'] += 1
                    break
            else:
                # Nothing matched: this file introduces a new pattern.
                reference = sample
                patterns[key] = new_record(reference, candidate)
                patterns['current_key'] = key

        samples_stat[crt] = show_crt_summary(summary, crt)

    patterns_output(samples_stat, args.out_dir)
コード例 #2
0
 def __init__(self, args, round_no):
     """Store the run options and load the movies, ratings and tags tables.

     Args:
         args: Option object providing ``data_folder``, ``hash_tags``,
             ``with_tags``, ``output_folder``, ``kbeans``, ``depth``,
             ``min_leaf_samples`` and ``tag_nums``.
         round_no: Value stored as ``self.round``.
     """
     def read_table(filename):
         # Every table is a CSV file located directly in the data folder.
         return utils.toDf(os.path.join(self.datapath, filename))

     # Options copied from the argument object.
     self.datapath = args.data_folder
     self.hashTags = args.hash_tags
     self.withTags = args.with_tags
     self.output = args.output_folder
     self.enableKbeans = args.kbeans
     self.round = round_no
     self.depth = args.depth
     self.min_leaf_samples = args.min_leaf_samples
     self.tag_nums = args.tag_nums

     # Input tables.
     self.movies = read_table('movies.csv')
     movie_count = len(self.movies)
     print("total movies to be predict, %d" % movie_count)
     self.ratings = read_table('ratings.csv')
     self.tags = read_table('tags.csv')

     # Independent copy of the three movie columns, kept separately from
     # the raw movies table.
     abt_columns = ['movieId', 'title', 'genres']
     self.abt = self.movies[abt_columns].copy()
コード例 #3
0
def load_data(data_folder, trial_mode, trial_limit=30000):
    """Load every ``.csv`` file in ``data_folder`` into one DataFrame.

    Files are visited in sorted name order so that the row order, and the
    subset loaded in trial mode, are reproducible (``os.listdir`` alone
    returns entries in arbitrary order).  Frames are collected and
    concatenated once at the end instead of re-concatenating the growing
    result for every file.

    Args:
        data_folder: Directory scanned (non-recursively) for ``.csv`` files.
        trial_mode: When true, stop reading further files as soon as more
            than ``trial_limit`` records have been collected.
        trial_limit: Record count that ends loading in trial mode.

    Returns:
        The combined frame.  With several loaded files the index is renumbered
        (``ignore_index=True``); with a single one that file's frame is
        returned unchanged; with none an empty ``pd.DataFrame()``.
    """
    frames = []
    total_records = 0
    have_data = False  # True once a non-empty frame has been collected

    for filename in sorted(os.listdir(data_folder)):
        fullpath = os.path.join(data_folder, filename)
        if not os.path.isfile(fullpath) or not fullpath.endswith('.csv'):
            continue

        # NOTE(review): utils.toDf is assumed to return a pandas DataFrame.
        data = utils.toDf(fullpath)
        if have_data:
            frames.append(data)
            total_records += len(data)
        else:
            # Nothing usable collected yet: the new frame replaces any
            # earlier empty one rather than being concatenated with it.
            frames = [data]
            total_records = len(data)
            have_data = not data.empty

        print('Import %d records in dataframe' % total_records)
        if trial_mode and total_records > trial_limit:
            break

    if not frames:
        return pd.DataFrame()
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames, ignore_index=True)