def get_value(line, input_type):
    if input_type == "ast":
        return get_dfs(line)
    elif input_type == "leaf":
        return get_dfs(line, only_leaf=True)
    elif input_type == "source_code":
        return line[0]
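# Illustrative sketch only (the real helper is defined elsewhere in the repo):
# get_dfs here is assumed to flatten a py150-style AST -- a list of
# {"type", "value", "children"} dicts stored in DFS order -- into a token list,
# preferring a node's "value" over its "type", and keeping only leaf values
# when only_leaf=True. Note this is unrelated to the get_dfs(dataset) used in
# the MovieLens functions further down, which returns a dict of DataFrames.
def get_dfs_sketch(ast, only_leaf=False):
    tokens = []
    for node in ast:  # nodes are already in DFS order
        if "value" in node:
            tokens.append(str(node["value"]))
        elif not only_leaf:
            tokens.append(node["type"])
    return tokens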
def main():
    parser = argparse.ArgumentParser(description="Generate datapoints from AST")
    parser.add_argument("--ast_fp", "-a", help="Filepath with the ASTs to be parsed")
    parser.add_argument(
        "--out_fp", "-o", default="/tmp/dps.txt", help="Filepath for the output dps"
    )
    parser.add_argument(
        "--n_ctx", "-c", type=int, default=1000, help="Number of contexts for each dp"
    )
    args = parser.parse_args()

    if os.path.exists(args.out_fp):
        os.remove(args.out_fp)
    logging.info("Number of context: {}".format(args.n_ctx))

    num_dps = 0
    logging.info("Loading asts from: {}".format(args.ast_fp))
    with open(args.ast_fp, "r") as f, open(args.out_fp, "w") as fout:
        for line in file_tqdm(f):
            dp = json.loads(line.strip())
            asts = separate_dps(dp, args.n_ctx)
            for ast, extended in asts:
                if len(ast) > 1:
                    json.dump([get_dfs(ast), extended], fp=fout)
                    fout.write("\n")
                    num_dps += 1
    logging.info("Wrote {} datapoints to {}".format(num_dps, args.out_fp))
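# Hypothetical invocation of the generator above. The script name and input
# path are assumptions for illustration; the flags are the ones defined by the
# parser in main():
#   python generate_data.py --ast_fp data/python50k_eval.json --out_fp /tmp/dps.txt --n_ctx 1000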
def parse():
    """Parse args and handle list splitting."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--grouping')
    parser.add_argument('--dataset', default='ml-1m')
    args = parser.parse_args()

    dfs = get_dfs(args.dataset)
    if args.grouping == 'genre':
        groups = group_by_genre(
            dfs['users'], dfs['ratings'], dfs['movies'], args.dataset)
    elif args.grouping == 'power':
        groups = group_by_power(dfs['users'], dfs['ratings'], args.dataset)
    elif args.grouping == 'state':
        groups = group_by_state(dfs['users'], args.dataset)
    else:
        grouping_to_func = {
            'gender': group_by_gender,
            'age': group_by_age,
            'occupation': group_by_occupation,
        }
        groups = grouping_to_func[args.grouping](dfs['users'])

    for group in groups:
        print(group['name'], len(group['df'].index))
    print(len(groups))
def get_dp(dp, n_ctx, child=False):
    get_mask = get_udc_masks if child else get_ud_masks
    asts = separate_dps(dp, n_ctx)
    rel_masks = separate_rel_mask(get_mask(dp, n_ctx), n_ctx)
    aug_dps = []
    for (ast, ext), mask in zip(asts, rel_masks):
        aug_dps.append([get_dfs(ast), ext, mask])
    return aug_dps
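# Rough sketch of what separate_dps is assumed to do (the real helper lives
# elsewhere in the repo, and its exact slicing rules may differ): split a long
# DFS-ordered AST into windows of at most n_ctx nodes, where the second element
# of each pair counts the leading nodes repeated from the previous window as
# context only, not as new prediction targets.
def separate_dps_sketch(ast, n_ctx):
    if len(ast) <= n_ctx:
        return [(ast, 0)]
    half = n_ctx // 2
    slices = [(ast[:n_ctx], 0)]
    for i in range(n_ctx, len(ast), half):
        chunk = ast[i - half:i + half]
        slices.append((chunk, half))  # first `half` nodes overlap the previous window
    return slices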
def main(args):
    """Driver: write the IDs of the most-rated 5% of movies to a CSV."""
    dfs = get_dfs(args.dataset)
    ratings_df = dfs['ratings']
    movies_df = dfs['movies']

    d = defaultdict(int)
    for i, row in ratings_df.iterrows():
        d[row.movie_id] += 1
    c = Counter(d)
    top_five_percent = c.most_common(len(d) // 20)
    print(top_five_percent)

    movie_ids = [movie_id[0] for movie_id in top_five_percent]
    with open('boycott_files/{}_top_five_percent_movies.csv'.format(args.dataset), 'w') as outfile:
        outfile.write(','.join([str(x) for x in movie_ids]))
    for movie_id in movie_ids:
        print(movies_df[movies_df.movie_id == movie_id].movie_title)
def external(file_path, suffix, context_size, overlap):
    outfile = "output/{}_dps.txt".format(suffix)
    if os.path.exists(outfile):
        os.remove(outfile)
    logging.info("Number of context: {}".format(context_size))

    num_dps = 0
    logging.info("Loading asts from: {}".format(file_path))
    with open(file_path, "r") as f, open(outfile, "w") as fout:
        for line in file_tqdm(f):
            dp = json.loads(line.strip())
            asts = rq6_separate_dps(dp, context_size, overlap)
            for ast, extended in asts:
                if len(ast) > 1:
                    json.dump([get_dfs(ast), extended], fp=fout)
                    fout.write("\n")
                    num_dps += 1
    logging.info("Wrote {} datapoints to {}".format(num_dps, outfile))
def main(args):
    """Run the sandbox experiments."""
    out_prefix = 'out/' if args.send_to_out else ""
    times = OrderedDict()
    times['start'] = time.time()

    algos = ALGOS
    if args.movie_mean:
        algos = {
            'MovieMean': MovieMean(),
            'GlobalMean': GlobalMean(),
        }
    algos_for_standards = ALGOS_FOR_STANDARDS

    dfs = get_dfs(args.dataset)
    head_items = load_head_items(args.dataset)
    times['dfs_loaded'] = time.time() - times['start']
    print('Got dataframes, took {} seconds'.format(times['dfs_loaded']))
    print('Total examples: {}'.format(len(dfs['ratings'].index)))
    ratings_df, users_df, movies_df = dfs['ratings'], dfs['users'], dfs['movies']

    if args.mode == 'info':
        print(ratings_df.memory_usage(index=True))
        print(users_df.memory_usage(index=True))
        print(movies_df.memory_usage(index=True))
        print(ratings_df.info())
        print(users_df.info())
        return

    data = Dataset.load_from_df(
        ratings_df[['user_id', 'movie_id', 'rating']], reader=Reader()
    )
    times['data_constructed'] = time.time() - times['dfs_loaded']

    # note to reader: why are precision, recall, and ndcg all stuffed together in one string?
    # This ensures they will be computed all at once; evaluation code will split them up for presentation.
    metric_names = []
    for measure in MEASURES:
        if '_' in measure:
            splitnames = measure.lower().split('_')
            metric_names += splitnames
            metric_names += [x + '_frac' for x in splitnames]
            metric_names += ['tail' + x for x in splitnames]
        else:
            metric_names.append(measure.lower())
    # the hand-built list above is superseded by the canonical helper
    metric_names = get_metric_names()

    if args.compute_standards:
        standard_results = defaultdict(list)
        for algo_name in algos_for_standards:
            for _ in range(args.num_standards):
                filename_ratingcv_standards = out_prefix + 'standard_results/{}_ratingcv_standards_for_{}.json'.format(
                    args.dataset, algo_name)
                print('Computing standard results for {}'.format(algo_name))
                if args.save_path is False:
                    save_path = None
                elif args.save_path is None:
                    save_path = os.getcwd() + '/' + out_prefix + 'predictions/standards/{}_{}_'.format(
                        args.dataset, algo_name)
                else:
                    save_path = args.save_path
                if 'KNN' in algo_name and args.dataset == 'ml-20m':
                    # running this in parallel runs out of memory with KNN
                    results = cross_validate_custom(
                        algos_for_standards[algo_name], data,
                        Dataset.load_from_df(pd.DataFrame(), reader=Reader()),
                        [], [], MEASURES, NUM_FOLDS, n_jobs=1,
                        head_items=head_items, save_path=save_path)
                else:
                    results = cross_validate_custom(
                        algos_for_standards[algo_name], data,
                        Dataset.load_from_df(pd.DataFrame(), reader=Reader()),
                        [], [], MEASURES, NUM_FOLDS,
                        head_items=head_items, save_path=save_path)
                saved_results = {}
                for metric in metric_names:
                    saved_results[metric] = np.mean(results[metric + '_all'])
                    # frac_key = metric + '_frac_all'
                    # if frac_key in results:
                    #     saved_results[frac_key] = np.mean(results[frac_key])
                with open(filename_ratingcv_standards, 'w') as f:
                    json.dump(saved_results, f)
                standard_results[algo_name].append(saved_results)
            standard_results_df = pd.DataFrame(standard_results[algo_name])
            print(standard_results_df.mean())
            standard_results_df.mean().to_csv('{}'.format(
                filename_ratingcv_standards).replace('.json', '_{}.csv'.format(
                    args.num_standards)
                )
            )

    experiment_configs = []
    if args.grouping == 'individual_users':
        experiment_configs += [{'type': 'individual_users', 'size': None}]
    elif args.grouping == 'sample':
        if args.sample_sizes:
            experiment_configs += [
                {
                    'type': 'sample_users',
                    'size': sample_size
                } for sample_size in args.sample_sizes]
        else:
            raise ValueError(
                'When using grouping="sample", you must provide a set of sample sizes'
            )
    elif args.grouping in [
            'gender', 'age', 'power', 'state', 'genre', 'genre_strict', 'occupation',
    ]:
        experiment_configs += [{'type': args.grouping, 'size': None}]
    else:
        experiment_configs = []

    uid_to_error = {}
    experimental_iterations = []
    seed_base = args.indices[0]
    for config in experiment_configs:
        outname = out_prefix + concat_output_filename(
            args.dataset, config['type'], args.userfrac, args.ratingfrac,
            config['size'], args.num_samples, args.indices
        )
        if config['type'] == 'individual_users':
            experimental_iterations = list(users_df.iterrows())
        elif config['type'] == 'sample_users':
            experimental_iterations = [{
                'df': users_df.sample(config['size'], random_state=seed_base + index),  # copies users_df
                'name': '{} user sample'.format(config['size'])
            } for index in range(args.num_samples)]
        elif config['type'] == 'gender':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_gender(users_df)
        elif config['type'] == 'age':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_age(users_df)
        elif config['type'] == 'state':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_state(users_df, dataset=args.dataset)
        elif config['type'] == 'genre':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_genre(
                    users_df=users_df, ratings_df=ratings_df,
                    movies_df=movies_df, dataset=args.dataset)
        elif config['type'] == 'genre_strict':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_genre_strict(
                    users_df=users_df, ratings_df=ratings_df,
                    movies_df=movies_df, dataset=args.dataset)
        elif config['type'] == 'power':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_power(
                    users_df=users_df, ratings_df=ratings_df, dataset=args.dataset)
        elif config['type'] == 'occupation':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_occupation(users_df)

        experiment_identifier_to_uid_sets = {}
        for algo_name in algos:
            prep_boycott_tasks = (
                delayed(prepare_boycott_task)(
                    i, experimental_iteration, args, config, ratings_df,
                    seed_base, outname, algo_name, algos[algo_name],
                    head_items, data
                )
                for i, experimental_iteration in enumerate(experimental_iterations)
            )
            simulate_boycott_tasks = []
            tic = time.time()
            out = Parallel(n_jobs=-1, verbose=5, max_nbytes=None)((x for x in prep_boycott_tasks))
            for task_args, d in out:
                simulate_boycott_tasks.append(delayed(task)(*task_args))
                experiment_identifier_to_uid_sets.update(d)
            print('parallelized prep_boycott_task took {} seconds'.format(time.time() - tic))

            print('About to run Parallel() with {} tasks'.format(len(simulate_boycott_tasks)))
            out_dicts = Parallel(n_jobs=-1, verbose=5)((x for x in simulate_boycott_tasks))

            for d in out_dicts:
                res = d['subset_results']
                algo_name = d['algo_name']
                uid = str(d['identifier']) + '_' + d['algo_name']
                uid_to_error[uid] = {
                    'num_ratings': d['num_ratings'],
                    'num_users': d['num_users'],
                    'num_movies': d['num_movies'],
                    'name': d['name'],
                    'algo_name': d['algo_name'],
                }
                for metric in metric_names + ['fit_time', 'test_times', 'num_tested']:
                    for group in ['all', 'non-boycott', 'boycott', 'like-boycott', 'all-like-boycott']:
                        key = '{}_{}'.format(metric, group)
                        # if group in ['boycott', ]:
                        #     val = np.nanmean(res[key])
                        vals = res.get(key)
                        if vals:
                            val = np.mean(res[key])
                            uid_to_error[uid].update({
                                key: val,
                            })
                        standards_key = 'standards_' + key
                        standards_vals = res.get(standards_key)
                        if standards_vals:
                            standards_val = np.mean(res[standards_key])
                            uid_to_error[uid].update({
                                standards_key: standards_val,
                            })

        err_df = pd.DataFrame.from_dict(uid_to_error, orient='index')
        uid_sets_outname = outname.replace('results/', 'uid_sets/uid_sets_')
        pd.DataFrame.from_dict(experiment_identifier_to_uid_sets, orient='index').to_csv(uid_sets_outname)
        if args.movie_mean:
            outname = outname.replace('results/', 'results/MOVIEMEAN_')
        err_df.to_csv(outname)

    print('Full runtime was: {} for {} experimental iterations'.format(
        time.time() - times['start'], len(experimental_iterations)))
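# Hypothetical invocation of the sandbox driver above. The script name is an
# assumption, and the flag spellings are inferred from the args attributes
# referenced in main() (grouping, num_samples, userfrac, ratingfrac, indices):
#   python sandbox.py --dataset ml-1m --grouping gender --num_samples 3 \
#       --userfrac 1.0 --ratingfrac 1.0 --indices 0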
def main(args):
    """
    Calculate standards.

    Configuration required for this function: you must have uid_sets files in the
    directory specified by the pathto argument. uid_sets files are CSV files with:
    an iteration number (index) in one column, a list of boycott uids in a 2nd
    column, and a list of like-boycott uids in a 3rd column. uid lists are stored
    as strings delimited by semicolons (;).
    """
    starttime = time.time()
    dfs = get_dfs(args.dataset)
    head_items = load_head_items(args.dataset)
    ratings_df = dfs['ratings']
    data = Dataset.load_from_df(
        ratings_df[['user_id', 'movie_id', 'rating']], reader=Reader())

    files = os.listdir(args.pathto)
    boycott_uid_sets = {}
    like_boycotters_uid_sets = {}
    for file in files:
        if 'uid_sets' not in file or '.csv' not in file:
            continue
        if args.dataset not in file:
            # print('skip {} b/c dataset'.format(file))
            continue
        if args.name_match and args.name_match not in file:
            continue
        print(file)
        uid_sets_df = pd.read_csv(args.pathto + '/' + file, dtype=str)
        for i, row in uid_sets_df.iterrows():
            identifier_num = row[0]
            try:
                boycott_uid_set = set(
                    [int(x) for x in row['boycott_uid_set'].split(';')])
            except AttributeError:
                boycott_uid_set = set([])
            try:
                like_boycotters_uid_set = set([
                    int(x) for x in row['like_boycotters_uid_set'].split(';')
                ])
            except AttributeError:
                like_boycotters_uid_set = set([])
            full_identifier = file.replace('uid_sets_', '') + '__' + identifier_num
            boycott_uid_sets[full_identifier] = boycott_uid_set
            like_boycotters_uid_sets[full_identifier] = like_boycotters_uid_set
    # now boycott_uid_sets and co. are filled up!

    if args.algo_name:
        algo_names = [args.algo_name]
    else:
        algo_names = list(ALGOS.keys())
    out = {}
    for algo_name in algo_names:
        # why do we batch this - otherwise we could run out of memory
        # if doing many experiments with one script run
        for batch_num, key_batch in enumerate(
                batch(list(boycott_uid_sets.keys()), 100)):
            print('On key batch {} of {} keys'.format(batch_num, len(boycott_uid_sets)))
            batch_b = {}
            batch_l = {}
            for key in key_batch:
                print(key)
                batch_b[key] = boycott_uid_sets[key]
                batch_l[key] = like_boycotters_uid_sets[key]
            # ideally we don't need to re-train the algorithm... we have the actual
            # predictions saved for each rating within each crossfold!
            # if for some reason this was lost (or wasn't saved, e.g. using the
            # pre-July 2018 version of this code) we can re-train - will take much longer
            if args.load_path == 'False':
                load_path = None
            elif args.load_path is None:
                load_path = os.getcwd() + '/predictions/standards/{}_{}_'.format(
                    args.dataset, algo_name)
            else:
                load_path = args.load_path + '/standards/{}_{}_'.format(
                    args.dataset, algo_name)
            res = cross_validate_many(
                ALGOS[algo_name], data,
                Dataset.load_from_df(pd.DataFrame(), reader=Reader()),
                batch_b, batch_l, MEASURES, NUM_FOLDS, verbose=False,
                head_items=head_items, load_path=load_path)
            out.update(res)
            dtstr = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            with open(
                    'standard_results/{}_{}_{}.json'.format(
                        args.dataset, algo_name, dtstr), 'w') as f:
                json.dump(out, f)
    print(time.time() - starttime)
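# Illustrative uid_sets row layout assumed by the parsing above (column names
# come from the code; the values are made up):
#
#   ,boycott_uid_set,like_boycotters_uid_set
#   0,12;57;903,14;88
#   1,5;6;7,
#
# An empty cell is read as NaN (a float) even with dtype=str, so .split(';')
# raises AttributeError and the corresponding uid set is left empty.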