def create_dating_schedule(person_df, n_meeting=10):
    """
    Create a speed-dating schedule for the CCN 2018 conference.

    Parameters
    ==========
    person_df: pandas dataframe with columns
        `PersonID`, `FullName`, `Abstract`
    n_meeting: int, number of meetings each person should have

    Output
    ======
    schedule: list of person ids and the person ids they will meet,
        in the following format: [PersonID, [PersonIDs to meet]]
    """
    # compute pairwise affinities between attendees from their abstracts
    persons_1 = list(map(preprocess, list(person_df['Abstract'])))
    persons_2 = list(map(preprocess, list(person_df['Abstract'])))
    A = affinity_computation(persons_1, persons_2,
                             n_components=10, min_df=1, max_df=0.8,
                             weighting='tfidf', projection='pca')
    # constraint: no one can be matched with themselves
    A[np.arange(len(A)), np.arange(len(A))] = -1000

    # solve the matching as a linear program (settings for dating at CCN)
    v, K, d = create_lp_matrix(
        A,
        min_reviewers_per_paper=n_meeting,
        max_reviewers_per_paper=n_meeting,
        min_papers_per_reviewer=n_meeting,
        max_papers_per_reviewer=n_meeting
    )
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A)

    output = []
    for i in range(len(b)):
        r = [list(person_df['PersonID'])[b_] for b_ in np.nonzero(b[i])[0]]
        output.append([list(person_df['PersonID'])[i], r])

    # order the meetings into rounds by coloring the line graph of matches
    schedule = nest_answer(
        output,
        format_answer(color_graph(build_line_graph(output))))
    return schedule
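# A hedged usage sketch (not part of the original source). The toy dataframe
# below is hypothetical; `create_dating_schedule` and its helpers
# (`preprocess`, `affinity_computation`, `create_lp_matrix`, `linprog`,
# `create_assignment`, `build_line_graph`, `color_graph`, `format_answer`,
# `nest_answer`) are assumed to be importable from this module.
def _example_dating_schedule():
    import pandas as pd
    person_df = pd.DataFrame({
        'PersonID': [1, 2, 3, 4],
        'FullName': ['Ann', 'Ben', 'Cat', 'Dan'],  # hypothetical attendees
        'Abstract': [
            'visual cortex and deep networks',
            'reinforcement learning in the basal ganglia',
            'convolutional models of V1',
            'dopamine and reward prediction error',
        ],
    })
    # each person gets 2 meetings; output is [[PersonID, [ids to meet]], ...]
    return create_dating_schedule(person_df, n_meeting=2)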
def assign_articles_to_reviewers(article_df, reviewer_df, people_df):
    """
    Perform reviewer assignment from dataframes of articles, reviewers, and people.

    Parameters
    ==========
    article_df: a dataframe that has columns `PaperID`, `Title`, `Abstract`,
        and `PersonIDList`, where `PersonIDList` contains a semicolon-separated
        string of PersonIDs
    reviewer_df: a dataframe that has columns `PersonID` and `Abstract`
    people_df: a dataframe that has columns `PersonID` and `FullName`

    We assume `PersonID` is an integer.

    Output
    ======
    article_assignment_df: an assigned-reviewers dataframe; each article row
        has its list of reviewers in the `ReviewerIDList` column and their
        names in `reviewer_names`
    """
    papers = list(
        (article_df['Title'] + ' ' + article_df['Abstract']).map(preprocess))
    reviewers = list(reviewer_df['Abstract'].map(preprocess))

    # calculate conflicts of interest based on co-authorship
    coauthors_df = pd.DataFrame(
        [[int(r.PaperID), int(co_author)]
         for _, r in article_df.iterrows()
         for co_author in r.PersonIDList.split(';')],
        columns=['PaperID', 'PersonID'])
    article_df['paper_id'] = list(range(len(article_df)))
    reviewer_df['person_id'] = list(range(len(reviewer_df)))
    coi_df = coauthors_df.merge(article_df[['PaperID', 'paper_id']],
                                on='PaperID').merge(
        reviewer_df[['PersonID', 'person_id']],
        on='PersonID')[['paper_id', 'person_id']]

    # calculate the affinity matrix
    A = affinity_computation(papers, reviewers,
                             n_components=10, min_df=2, max_df=0.8,
                             weighting='tfidf', projection='pca')

    # trim the affinity matrix: zero out the 200 lowest affinities per paper
    A_trim = []
    for r in range(len(A)):
        a = A[r, :]
        a[np.argsort(a)[0:200]] = 0
        A_trim.append(a)
    A_trim = np.vstack(A_trim)

    # give conflicts of interest a large negative cost
    for i, j in zip(coi_df.paper_id.tolist(), coi_df.person_id.tolist()):
        A_trim[i, j] = -1000

    # solve the assignment problem (settings used for the CCN case)
    v, K, d = create_lp_matrix(A_trim,
                               min_reviewers_per_paper=6,
                               max_reviewers_per_paper=6,
                               min_papers_per_reviewer=4,
                               max_papers_per_reviewer=6)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A_trim)

    reviewer_ids = list(reviewer_df.PersonID)
    # map reviewer id to reviewer name
    reviewer_name_dict = {
        r['PersonID']: r['FullName']
        for _, r in people_df.iterrows()
    }
    assignments = []
    for i in range(len(b)):
        assignments.append([
            i,
            [reviewer_ids[b_] for b_ in np.nonzero(b[i])[0]],
            [reviewer_name_dict[reviewer_ids[b_]]
             for b_ in np.nonzero(b[i])[0]]
        ])
    assignments_df = pd.DataFrame(
        assignments,
        columns=['paper_id', 'ReviewerIDList', 'reviewer_names'])
    assignments_df['ReviewerIDList'] = assignments_df.ReviewerIDList.map(
        lambda e: ';'.join(str(e_) for e_ in e))
    assignments_df['reviewer_names'] = assignments_df.reviewer_names.map(
        lambda x: ';'.join(x))
    article_assignment_df = article_df.merge(
        assignments_df, on='paper_id').drop('paper_id', axis=1)
    return article_assignment_df
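# A hedged usage sketch (not part of the original source); the values below
# are hypothetical and only illustrate the expected schema. Note that
# `assign_articles_to_reviewers` hard-codes 6 reviewers per paper, so a real
# run needs substantially more reviewers than this toy frame contains.
def _example_reviewer_assignment():
    import pandas as pd
    article_df = pd.DataFrame({
        'PaperID': [10, 11],
        'Title': ['Spiking networks', 'Grid cells'],
        'Abstract': ['recurrent spiking dynamics', 'entorhinal grid codes'],
        'PersonIDList': ['1;2', '3'],  # semicolon-separated author ids
    })
    reviewer_df = pd.DataFrame({
        'PersonID': [1, 2, 3, 4],
        'Abstract': ['plasticity', 'spatial memory', 'navigation', 'coding'],
    })
    people_df = pd.DataFrame({
        'PersonID': [1, 2, 3, 4],
        'FullName': ['Ann', 'Ben', 'Cat', 'Dan'],
    })
    return assign_articles_to_reviewers(article_df, reviewer_df, people_df)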
# NOTE: this snippet begins mid-function; the `if` guarding the trimming
# branch was truncated in the original, so the condition below is an
# assumption reconstructed from the surviving `else:` clause.
if n_trim > 0:
    A_trim = []
    for r in range(len(A)):
        a = A[r, :]
        a[np.argsort(a)[0:n_trim]] = 0  # zero out the n_trim lowest affinities
        A_trim.append(a)
    A_trim = np.vstack(A_trim)
else:
    A_trim = A

print('Solving a matching problem...')
v, K, d = create_lp_matrix(A_trim,
                           min_reviewers_per_paper=n_match,
                           max_reviewers_per_paper=n_match,
                           min_papers_per_reviewer=n_match,
                           max_papers_per_reviewer=n_match)
x_sol = linprog(v, K, d)['x']
b = create_assignment(x_sol, A_trim)

if b.sum() == 0:
    print('It seems the problem did not converge; try reducing <n_trim>, but not too low!')
else:
    print('Successfully assigned all the matches!')

if b.sum() != 0:
    output = []
    user_ids_map = {ri: r['user_id'] for ri, r in df.iterrows()}
    for i in range(len(b)):
        match_ids = [str(user_ids_map[b_]) for b_ in np.nonzero(b[i])[0]]
        output.append({
            'user_id': user_ids_map[i],
            'match_ids': ';'.join(match_ids)
        })  # closing braces reconstructed; the snippet was truncated here
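# A self-contained illustration (not from the original source) of the
# trimming step above: zeroing the `n_trim` lowest affinities in each row so
# the LP only considers each row's strongest candidate matches. The random
# matrix here is hypothetical stand-in data.
def _trim_affinity_demo():
    import numpy as np
    rng = np.random.default_rng(0)
    A_demo = rng.random((4, 6))  # hypothetical 4x6 affinity matrix
    n_trim = 3
    A_trim_demo = A_demo.copy()
    for r in range(len(A_trim_demo)):
        a = A_trim_demo[r, :]          # view into the row
        a[np.argsort(a)[0:n_trim]] = 0  # drop the 3 weakest affinities in place
    return A_trim_demo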
# NOTE: the first assignment below is reconstructed; the original snippet was
# truncated and began at `lambda x: create_coi_author_ids(x, reviewer_df))`.
cois_ids = submission_df.AuthorsList.map(
    lambda x: create_coi_author_ids(x, reviewer_df))
cois = submission_df.AuthorsList.map(
    lambda x: create_coi_list(x, reviewer_df))
cois_df = pd.DataFrame(cois + cois_ids, columns=['AuthorsList'])
# give conflicts of interest a large negative affinity
for i, r in cois_df.iterrows():
    if len(r['AuthorsList']) > 0:
        for idx in r['AuthorsList']:
            A[i, idx] = -1000

# assignment: split the affinity matrix between the two reviewer pools
A_a, A_b = A[:, :len(reviewer_a_df)], A[:, len(reviewer_a_df):]
v, K, d = create_lp_matrix(A_a,
                           min_reviewers_per_paper=2,
                           max_reviewers_per_paper=2,
                           min_papers_per_reviewer=10,
                           max_papers_per_reviewer=12)
x_sol = linprog(v, K, d)['x']
b_a = create_assignment(x_sol, A_a)

v, K, d = create_lp_matrix(A_b,
                           min_reviewers_per_paper=2,
                           max_reviewers_per_paper=2,
                           min_papers_per_reviewer=10,
                           max_papers_per_reviewer=12)
x_sol = linprog(v, K, d)['x']
b_b = create_assignment(x_sol, A_b)

reviewer_a_map = {i: r['UserID'] for i, r in reviewer_a_df.iterrows()}
reviewer_b_map = {i: r['UserID'] for i, r in reviewer_b_df.iterrows()}
paper_id_map = {i: r['PaperID'] for i, r in submission_df.iterrows()}
assignments_a_df = create_assignment_dataframe(b_a, reviewer_a_map,
                                               paper_id_map, pool_group='a')
# the call below was truncated in the original; its remaining arguments are
# assumed to mirror the pool-a call
assignments_b_df = create_assignment_dataframe(b_b, reviewer_b_map,
                                               paper_id_map, pool_group='b')
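# `create_assignment_dataframe` is called above but not defined in this
# snippet. Below is a minimal sketch of what it plausibly does, inferred only
# from its call sites (a boolean assignment matrix, index-to-id maps, and a
# pool label); the real implementation may differ.
def create_assignment_dataframe_sketch(b, reviewer_map, paper_id_map,
                                       pool_group=''):
    import numpy as np
    import pandas as pd
    rows = []
    for i in range(len(b)):
        # collect the reviewer ids assigned to paper i
        reviewer_ids = [reviewer_map[j] for j in np.nonzero(b[i])[0]]
        rows.append([paper_id_map[i],
                     ';'.join(str(r) for r in reviewer_ids),
                     pool_group])
    return pd.DataFrame(
        rows, columns=['PaperID', 'ReviewerIDList', 'PoolGroup'])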
def calculate_result(self, reviewer_data, article_data, people_data, coi_data,
                     min_rev_art, max_rev_art, min_art_rev, max_art_rev):
    """
    Generate a CSV file with the resulting assignment while updating
    the status of the process using Celery.
    """
    cur_progress = 0
    max_progress = 100
    article_data = pd.DataFrame(article_data)
    people_data = pd.DataFrame(people_data)
    coauthors_df = pd.DataFrame(
        [[r.PaperID, co_author]
         for _, r in article_data.iterrows()
         for co_author in r.PersonIDList.split(';')],
        columns=['PaperID', 'PersonID'])
    if reviewer_data is None:
        # extract reviewer data from the articles themselves
        coauthor_articles = coauthors_df.merge(article_data)[[
            'PersonID', 'Abstract'
        ]]
        coauthor_abstracts = coauthor_articles.groupby('PersonID').\
            agg({'Abstract': lambda x: ''.join(x)})
        reviewer_data = pd.DataFrame(list(zip(coauthor_abstracts.index,
                                              coauthor_abstracts.Abstract)),
                                     columns=['PersonID', 'Abstract'])
    else:
        reviewer_data = pd.DataFrame(reviewer_data)
        reviewer_data.PersonID = reviewer_data.PersonID.apply(str)

    if coi_data is not None:
        coi_data = pd.DataFrame(coi_data)

    update_frequency = 1
    cur_progress += int(max_progress / 6.)
    self.update_progress(
        cur_progress,
        max_progress,
        update_frequency=update_frequency,
    )

    # perform the topic modeling (LSA)
    a = prm.compute_affinity(reviewer_data.Abstract, article_data.Abstract)

    cur_progress += int(max_progress / 6.)
    self.update_progress(
        cur_progress,
        max_progress,
        update_frequency=update_frequency,
    )

    # if coi_data is available, add its entries as if they were co-authors
    if coi_data is not None:
        coi_data.PersonID = coi_data.PersonID.apply(str)
        coauthors_df = pd.concat((coauthors_df, coi_data))

    # articles
    article_data2 = article_data.copy()
    article_data2.index = article_data2.PaperID
    article_data2['id'] = range(article_data2.shape[0])
    coi_row = np.array(article_data2.loc[coauthors_df.PaperID].id.tolist())
    # persons
    reviewer_data2 = reviewer_data.copy()
    reviewer_data2.index = reviewer_data2.PersonID
    reviewer_data2['id'] = range(reviewer_data2.shape[0])
    coi_column = np.array(
        reviewer_data2.loc[coauthors_df.PersonID].id.tolist())
    for i, j in zip(coi_row, coi_column):
        a[i, j] = -1000.  # effectively -inf: forbid conflicted pairings

    v, A, d = prm.create_lp_matrices(a, min_rev_art, max_rev_art,
                                     min_art_rev, max_art_rev)
    v = v.flatten()
    d = d.flatten()

    cur_progress += int(max_progress / 6.)
    self.update_progress(
        cur_progress,
        max_progress,
        update_frequency=update_frequency,
    )

    solver = pywraplp.Solver('SolveReviewerAssignment',
                             pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)

    infinity = solver.Infinity()
    n, m = A.shape
    x = [None] * m
    c = [None] * n

    for j in range(m):
        x[j] = solver.NumVar(-infinity, infinity, 'x_%u' % j)

    # state the objective function
    objective = solver.Objective()
    for j in range(m):
        objective.SetCoefficient(x[j], v[j])
    objective.SetMaximization()

    # state the constraints
    for i in range(n):
        c[i] = solver.Constraint(-infinity, d[i])
        # update the status bar
        if np.mod(i, int(n / 10)) == 0:
            cur_progress += 3
            self.update_progress(
                cur_progress,
                max_progress,
                update_frequency=update_frequency,
            )
        for j in A.col[A.row == i]:
            c[i].SetCoefficient(
                x[j],
                A.data[np.logical_and(A.row == i, A.col == j)][0])

    result_status = solver.Solve()
    if result_status != 0:
        print('The final solution might not have converged')

    x_sol = np.array([x_tmp.SolutionValue() for x_tmp in x])
    # alternative: x = prm.linprog_solve(v, ne, d)
    x_sol = (x_sol > 0.5)

    cur_progress += int(max_progress / 6.)
    self.update_progress(
        4 * int(max_progress / 6.),
        max_progress,
        update_frequency=update_frequency,
    )

    b = prm.create_assignment(x_sol, a)

    self.update_progress(
        5 * int(max_progress / 6.),
        max_progress,
        update_frequency=update_frequency,
    )

    assignment_df = article_data[['PaperID', 'Title']].copy()
    assignment_df['Reviewers'] = ''
    assignment_df['ReviewerIDs'] = ''
    for i in range(b.shape[0]):
        paper_reviewers = np.where(b[i, :])[0]
        assignment_df.Reviewers.iloc[i] = ', '.join(
            list(people_data.FullName.iloc[paper_reviewers].copy()))
        # assignment_df.ReviewerIDs.iloc[i] = ', '.join(list(people_data.PersonID.iloc[paper_reviewers].copy()))

    self.update_progress(
        6 * int(max_progress / 6.),
        max_progress,
        update_frequency=update_frequency,
    )

    # make sure titles and reviewer names are plain strings
    assignment_df['Title'] = assignment_df.Title.apply(str)
    assignment_df['Reviewers'] = assignment_df.Reviewers.apply(str)

    return assignment_df.to_csv(None, na_rep='', index=False,
                                encoding='utf-8')
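# A self-contained toy (not from the original source) of the same pywraplp
# pattern used in `calculate_result`: maximize v.x subject to linear
# constraints with GLOP, then threshold the relaxed solution at 0.5 the way
# the code above does. The numbers are hypothetical.
def _toy_glop_example():
    from ortools.linear_solver import pywraplp
    solver = pywraplp.Solver('toy', pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)
    # two candidate matches with affinities 0.9 and 0.4
    x0 = solver.NumVar(0, 1, 'x_0')
    x1 = solver.NumVar(0, 1, 'x_1')
    # at most one match may be selected: x0 + x1 <= 1
    ct = solver.Constraint(-solver.Infinity(), 1)
    ct.SetCoefficient(x0, 1)
    ct.SetCoefficient(x1, 1)
    objective = solver.Objective()
    objective.SetCoefficient(x0, 0.9)
    objective.SetCoefficient(x1, 0.4)
    objective.SetMaximization()
    solver.Solve()
    # threshold the LP relaxation to a boolean assignment, as above
    return [x.SolutionValue() > 0.5 for x in (x0, x1)]  # [True, False]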