def get_papers(names):
    """Return, for each reviewer name, the keys of the 25 submissions most
    similar to the papers that reviewer has authored."""
    inp = [{"ids": [""], "names": [n]} for n in names]
    out = calc_reviewer_db_mapping(inp, db, author_col="name",
                                   author_field='authors')
    print(out.shape)
    data = {}
    for j, n in enumerate(names):
        # Indices of the db papers authored by this reviewer
        ind, = out[:, j].nonzero()
        # Rank submissions by their summed similarity to the authored papers
        _, papers = torch.topk(torch.tensor(mat[:, ind].sum(-1)), 25)
        # print("Author:", n)
        # for i in papers:
        #     print("\t", accepted_submissions[i].content["title"])
        # print()
        # print()
        data[n] = [abstract_keys[p] for p in papers.tolist()]
    return data
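
# Illustrative usage sketch (an assumption, not part of the original script):
# the reviewer names below are hypothetical placeholders, and the module-level
# `db`, `mat`, and `abstract_keys` used by get_papers() must already be
# populated as above.
def example_get_papers_usage():
    top_papers = get_papers(["Jane Doe", "John Smith"])
    for name, keys in top_papers.items():
        # Each entry maps a reviewer name to the keys of their 25 most similar
        # submissions, ranked by the summed similarity scores in `mat`.
        print(name, keys[:5])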
for i, data in enumerate(reviewer_data_orig):
    if data['areaChair']:
        reviewer_remapping[i] = len(reviewer_data)
        reviewer_data.append(data)
    else:
        reviewer_remapping[i] = -1
for data in reviewer_data:
    if 'name' in data:
        data['names'] = [data['name']]
        del data['name']
reviewer_names = [x['names'][0] for x in reviewer_data]
print(f'Have {len(reviewer_data)} reviewers', file=sys.stderr)

with open(args.db_file, "r") as f:
    db = [json.loads(x) for x in f]  # for debug
db_abs = [x['paperAbstract'] for x in db]
rdb = calc_reviewer_db_mapping(reviewer_data, db, author_field='authors')

# At least half of the above papers are not authored by reviewers; filter the
# db down to papers actually authored by at least one reviewer
includes_reviewer = rdb.sum(axis=1)
new_db = []
for i, paper in enumerate(db):
    if includes_reviewer[i] >= 1:
        new_db.append(paper)
db = new_db
db_abs = [x['paperAbstract'] for x in db]
rdb = calc_reviewer_db_mapping(reviewer_data, db, author_field='authors')

# Calculate or load paper similarity matrix
if args.paper_matrix and os.path.exists(args.paper_matrix):
    mat = np.load(args.paper_matrix)
    assert (mat.shape[0] == len(submission_abs)
            and mat.shape[1] == len(db_abs))
help="Which field to filter on (name,id)") parser.add_argument( "--bid_file", type=str, required=True, help= "A file containing numpy array of bids (0 = COI, 1 = no, 2 = maybe, 3 = yes)" ) args = parser.parse_args() # Load the data with open(args.suggestion_file, "r") as f: submissions = [json.loads(x) for x in f] with open(args.reviewer_file, "r") as f: reviewer_data = [json.loads(x) for x in f] reviewer_names = [x['names'][0] for x in reviewer_data] bids = np.load(args.bid_file) mapping = suggest_utils.calc_reviewer_db_mapping( reviewer_data, submissions, author_col=args.filter_field, author_field='assignedReviewers') all_assignments = np.sum(mapping) # Total of all bid scores, minus one, divided by number of assignments for bid in range(4): bid_count = np.sum(np.where((mapping == 1) & (bids == bid), 1, 0)) print(f'Ratio of {bid}: {bid_count/all_assignments}')
def main():
    # --------------------------------------------------------------------------
    # Part 1: Read in the arguments
    # --------------------------------------------------------------------------
    args = parse_args()

    # --------------------------------------------------------------------------
    # Part 2: Load the data and calculate similarity between submissions and
    # reviewers
    # --------------------------------------------------------------------------
    data_load_start_time = time.time()
    with open(args.submission_file, "r") as f:
        submissions = [json.loads(x) for x in f]
    submission_abs = [x['paperAbstract'] for x in submissions]
    with open(args.reviewer_file, "r") as f:
        reviewer_data = [json.loads(x) for x in f]
    for data in reviewer_data:
        if 'name' in data:
            data['names'] = [data['name']]
            del data['name']
    reviewer_names = [x['names'][0] for x in reviewer_data]
    with open(args.db_file, "r") as f:
        db = [json.loads(x) for x in f]  # for debug
    db_abs = [x['paperAbstract'] for x in db]
    rdb = calc_reviewer_db_mapping(reviewer_data, db, author_field='authors')

    # FIXME: about half of the above papers are bollocks -- quick hack to filter
    # to those papers actually authored by reviewers
    includes_reviewer = rdb.sum(axis=1)
    new_db = []
    for i, paper in enumerate(db):
        if includes_reviewer[i] >= 1:
            new_db.append(paper)
    db = new_db
    db_abs = [x['paperAbstract'] for x in db]
    rdb = calc_reviewer_db_mapping(reviewer_data, db, author_field='authors')

    data_load_end_time = time.time()
    data_load_time = round((data_load_end_time - data_load_start_time) / 60, 2)
    print(f"Time loading and preprocessing data: {data_load_time} minutes",
          file=sys.stderr)

    similarity_matrix_start_time = time.time()
    # Calculate or load paper similarity matrix
    if args.load_paper_matrix:
        mat = np.load(args.load_paper_matrix)
        assert (mat.shape[0] == len(submission_abs)
                and mat.shape[1] == len(db_abs))
    else:
        print('Loading model', file=sys.stderr)
        model, epoch = load_model(None, args.model_file, force_cpu=True)
        model.eval()
        assert not model.training
        mat = calc_similarity_matrix(model, db_abs, submission_abs)
        if args.save_paper_matrix:
            np.save(args.save_paper_matrix, mat)
    similarity_matrix_end_time = time.time()
    similarity_matrix_time = round(
        (similarity_matrix_end_time - similarity_matrix_start_time) / 60, 2)
    print(
        "Time calculating paper similarity matrix:"
        f" {similarity_matrix_time} minutes",
        file=sys.stderr)

    aggregation_start_time = time.time()
    # Calculate reviewer scores based on paper similarity scores
    if args.load_aggregate_matrix:
        reviewer_scores = np.load(args.load_aggregate_matrix)
        assert (reviewer_scores.shape[0] == len(submission_abs)
                and reviewer_scores.shape[1] == len(reviewer_names))
    else:
        print('Calculating aggregate reviewer scores', file=sys.stderr)
        reviewer_scores = calc_aggregate_reviewer_score(
            rdb, mat, args.aggregator)
        if args.save_aggregate_matrix:
            np.save(args.save_aggregate_matrix, reviewer_scores)
    aggregation_end_time = time.time()
    aggregation_time = round(
        (aggregation_end_time - aggregation_start_time) / 60, 2)
    print(
        "Time calculating aggregated similarity matrix:"
        f" {aggregation_time} minutes",
        file=sys.stderr)

    formulization_start_time = time.time()
    # --------------------------------------------------------------------------
    # Part 3: Adjust reviewer_scores based on COI, AC role; add quota
    # constraints; optionally split into subproblems by track
    # --------------------------------------------------------------------------

    # --------------------------------------------------------------------------
    # Part 3(a): Adjust reviewer_scores based on COIs
    # --------------------------------------------------------------------------
    cois = (np.where(np.load(args.bid_file) == 0, 1, 0)
            if args.bid_file else None)
    if cois is not None:
        num_cois = np.sum(cois)
        print(f"Applying {num_cois} COIs", file=sys.stderr)
        reviewer_scores = np.where(cois == 0, reviewer_scores,
                                   reviewer_scores - 110)

    # --------------------------------------------------------------------------
    # Part 3(b): Load reviewer-specific quotas
    # --------------------------------------------------------------------------
    quotas = {}
    if args.quota_file:
        quotas = assign_quotas(reviewer_data,
                               args.quota_file,
                               args.max_papers_per_reviewer,
                               area_chairs=args.area_chairs)
        print(f"Set {len(quotas)} reviewer quotas", file=sys.stderr)

    # --------------------------------------------------------------------------
    # Part 3(c): Adjust reviewer_scores based on ACs
    # If --area_chairs is specified, only ACs get papers. If it is not
    # specified, SACs and ACs should not get papers or be shown as similar
    # reviewers (i.e., the reviewer_score is set to -150 for those positions)
    # --------------------------------------------------------------------------
    reviewer_scores, num_included, num_excluded = exclude_positions(
        reviewer_data, reviewer_scores, area_chairs=args.area_chairs)
    print(f"Excluded {num_excluded} reviewers/chairs, leaving {num_included}",
          file=sys.stderr)

    # --------------------------------------------------------------------------
    # Part 4: Break the optimization into subproblems
    # If --track is not specified, there will be a single subproblem called
    # ``all_tracks``, although ACs or reviewers will be excluded as necessary.
    # If --track is specified, the matrix will be broken into one optimization
    # subproblem per track
    # --------------------------------------------------------------------------
    optimization_problems, problem_papers, problem_reviewers, problem_quotas = (
        split_by_subproblem(reviewer_data,
                            submissions,
                            reviewer_scores,
                            quotas,
                            by_track=args.track,
                            area_chairs=args.area_chairs))

    formulization_end_time = time.time()
    formulization_time = round(
        (formulization_end_time - formulization_start_time) / 60, 2)
    print(
        f"Time formulating optimization problem: {formulization_time} minutes",
        file=sys.stderr)

    optimization_start_time = time.time()
    # --------------------------------------------------------------------------
    # Part 5: Calculate a reviewer assignment based on the constraints
    # --------------------------------------------------------------------------
    problem_assignments = {}
    problem_scores = {}
    for problem in optimization_problems.keys():
        final_scores = optimization_problems[problem]
        if args.anonymity_multiplier != 1.0:
            print(
                "Calculating initial assignment of reviewers for category"
                f" {problem}",
                file=sys.stderr)
            final_scores, assignment_score = create_suggested_assignment(
                final_scores,
                min_papers_per_reviewer=args.min_papers_per_reviewer,
                max_papers_per_reviewer=args.max_papers_per_reviewer,
                reviews_per_paper=args.reviews_per_paper,
                quotas=problem_quotas[problem],
                anonymity_multiplier=args.anonymity_multiplier)
            print(
                "Done calculating initial assignment,"
                f" total score: {assignment_score}",
                file=sys.stderr)
            final_scores += np.random.random(final_scores.shape) * 1e-4
        print(f"Calculating assignment of reviewers for category {problem}",
              file=sys.stderr)
        # final_scores includes the penalties for COI.
        # The constraints for the CP itself are only the quota constraints
        # (the max/min number of papers a reviewer wants to review)
        assignment, assignment_score = create_suggested_assignment(
            final_scores,
            min_papers_per_reviewer=args.min_papers_per_reviewer,
            max_papers_per_reviewer=args.max_papers_per_reviewer,
            reviews_per_paper=args.reviews_per_paper,
            quotas=problem_quotas[problem])
        problem_assignments[problem] = assignment
        problem_scores[problem] = assignment_score
        print(f"Done calculating assignment. Total score: {assignment_score}",
              file=sys.stderr)
        if assignment is None:
            warnings.warn(f"No solution found for category {problem}",
                          RuntimeWarning)

    optimization_end_time = time.time()
    optimization_time = round(
        (optimization_end_time - optimization_start_time) / 60, 2)
    print(
        "Time calculating optimal assignment of papers:"
        f" {optimization_time} minutes",
        file=sys.stderr)

    # --------------------------------------------------------------------------
    # Part 6: Parse the assignments into a dictionary of reviewer IDs and other
    # info for each submission
    # --------------------------------------------------------------------------
    global_assignments = parse_assignments(
        submissions=submissions,
        paper_similarity_matrix=mat,
        optimization_matrices=optimization_problems,
        problem_papers=problem_papers,
        problem_reviewers=problem_reviewers,
        problem_assignments=problem_assignments,
        by_track=args.track,
        num_assigned=args.reviews_per_paper,
        num_similar=args.num_similar_to_list)

    # --------------------------------------------------------------------------
    # Part 7: Print out the results in jsonl format
    # --------------------------------------------------------------------------
    jsonl_data = get_jsonl_rows(assignments=global_assignments,
                                submissions=submissions,
                                reviewers=reviewer_data,
                                db_papers=db)
    with open(args.suggestion_file, 'w') as outf:
        for entry in jsonl_data:
            if args.output_type == 'json':
                print(json.dumps(entry), file=outf)
            elif args.output_type == 'text':
                print_text_report(entry, file=outf)
            else:
                raise ValueError(f'Illegal output_type {args.output_type}')
    print(f"Done creating suggestions, written to {args.suggestion_file}\n",
          file=sys.stderr)

    # --------------------------------------------------------------------------
    # Part 8 (optional): Print out the results in more human-readable
    # spreadsheets
    # --------------------------------------------------------------------------
    # ACL-2021: We are outputting an alternative data file to easily create
    # a per-track spreadsheet of assigned reviewers, as well as a global file
    # with the minimum assignment information
    if args.assignment_spreadsheet:
        global_header_info = get_csv_header(
            reviews_per_paper=args.reviews_per_paper,
            num_similar=args.num_similar_to_list,
            area_chairs=args.area_chairs,
            is_global=True)
        track_header_info = get_csv_header(
            reviews_per_paper=args.reviews_per_paper,
            num_similar=args.num_similar_to_list,
            area_chairs=args.area_chairs,
            is_global=False)
        coi_header_info = track_header_info + ['Original track']
        global_data, track_data = get_csv_rows(
            assignments=global_assignments,
            reviewers=reviewer_data,
            cois=cois,
            reviews_per_paper=args.reviews_per_paper,
            area_chairs=args.area_chairs)
        global_rows, global_softconf_uploadable = global_data
        track_rows, track_softconf_uploadables = track_data

        # Separate the input file base from its extension so we can print
        # multiple files with the same general schema
        file_base, file_extension = (os.path.splitext(
            args.assignment_spreadsheet)[:2])

        # Open the file path given in the arguments as the global assignment
        # spreadsheet, writing each row
        with open(args.assignment_spreadsheet, 'w+') as f:
            writer = csv.writer(f,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow(global_header_info)
            for entry in global_rows:
                writer.writerow(entry)
        with open(file_base + '.txt', 'w+') as f:
            for line in global_softconf_uploadable:
                print(line, file=f)

        # For each track, create a file as a csv spreadsheet for all the track
        # submissions and their reviewer assignments
        for track in track_rows.keys():
            alphanum_track = '-'.join(re.split(r'[\W,:]+', track))
            filename = f'{file_base}_{alphanum_track}{file_extension}'
            with open(filename, 'w+') as f:
                writer = csv.writer(f,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                if track == 'COI':
                    writer.writerow(coi_header_info)
                else:
                    writer.writerow(track_header_info)
                for entry in track_rows[track]:
                    writer.writerow(entry)
            filename = f'{file_base}_{alphanum_track}.txt'
            with open(filename, 'w+') as f:
                for line in track_softconf_uploadables[track]:
                    print(line, file=f)
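
# A conventional entry-point guard (an assumption; the original file's guard is
# not included in this excerpt). A typical invocation might look like
#
#   python suggest_reviewers.py --submission_file submissions.jsonl \
#       --db_file papers-db.jsonl --reviewer_file reviewers.jsonl \
#       --model_file similarity-model.pt --suggestion_file assignments.jsonl
#
# where the flag names are inferred from the args.* attributes used in main()
# and the file names are placeholders.
if __name__ == "__main__":
    main()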
reviewers_by_track[data['track']].extend(data['names'])
reviewer_names = [x['names'][0] for x in reviewer_data]
num_tracks = len(acs_by_track)
assert set(sacs_by_track.keys()) == set(acs_by_track.keys())
# assert set(sacs_by_track.keys()) == sub_tracks
# there's a COI track, with no papers (yet)
# FIXME: someone has AC roles 'Information Extraction:NLP Applications'

with open(args.db_file, "r") as f:
    db = [json.loads(x) for x in f]  # for debug
db_abs = [x['paperAbstract'] for x in db]

# create binary matrix of reviewer x paper
rdb = calc_reviewer_db_mapping(reviewer_data,
                               db,
                               author_col=args.filter_field,
                               author_field='authors')

# Calculate or load paper similarity matrix
if args.load_paper_matrix:
    mat = np.load(args.load_paper_matrix)
    assert (mat.shape[0] == len(submission_abs)
            and mat.shape[1] == len(db_abs))
else:
    print('Loading model', file=sys.stderr)
    model, epoch = load_model(None, args.model_file, force_cpu=True)
    model.eval()
    assert not model.training
    mat = calc_similarity_matrix(model, db_abs, submission_abs)
    if args.save_paper_matrix:
        np.save(args.save_paper_matrix, mat)
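
# Minimal sketch of how the two matrices built above can be combined into
# per-reviewer scores (an assumption for illustration only; the main script's
# calc_aggregate_reviewer_score is the actual aggregator). Per the asserts
# above, `mat` has shape (num_submissions, num_db_papers) and `rdb` has shape
# (num_db_papers, num_reviewers).
def max_aggregate_reviewer_scores(mat, rdb):
    scores = np.zeros((mat.shape[0], rdb.shape[1]))
    for j in range(rdb.shape[1]):
        authored = np.nonzero(rdb[:, j])[0]  # db papers written by reviewer j
        if len(authored) > 0:
            # Score each submission by its best match among the reviewer's papers
            scores[:, j] = mat[:, authored].max(axis=1)
    return scores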