def create_dating_schedule(person_df, n_meeting=10):
    """
    Function to create speed dating schedule at CCN 2018 conference

    Parameters
    ==========
    person_df: pandas dataframe contains - PersonID, FullName, Abstract
        (only `PersonID` and `Abstract` are read by this function)
    n_meeting: int, number of meeting we would like to have; it is used as
        both the lower and the upper bound of every constraint handed to
        `create_lp_matrix`

    Output
    ======
    schedule: the value returned by `nest_answer`; per the original
        description, a list of person id and person ids to meet in the
        following format: [PersonID, [PersonID to meet]]
    """
    # linear programming
    # Preprocess every abstract once. The same texts sit on both sides of
    # the affinity matrix, so the second argument is a plain copy of the
    # first instead of a second preprocessing pass.
    abstracts = list(map(preprocess, person_df['Abstract']))
    A = affinity_computation(
        abstracts, list(abstracts),
        n_components=10, min_df=1, max_df=0.8,
        weighting='tfidf', projection='pca'
    )

    # constraints, conflict of interest: put a large negative affinity on
    # the diagonal, i.e. on every (person, same person) pair
    diagonal = np.arange(len(A))
    A[diagonal, diagonal] = -1000

    # for dating at CCN
    v, K, d = create_lp_matrix(
        A,
        min_reviewers_per_paper=n_meeting,
        max_reviewers_per_paper=n_meeting,
        min_papers_per_reviewer=n_meeting,
        max_papers_per_reviewer=n_meeting
    )
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A)

    # Materialize the id column once; row/column positions of `b` are
    # translated back to PersonID through this list.
    person_ids = list(person_df['PersonID'])
    output = [
        [person_ids[i], [person_ids[j] for j in np.nonzero(row)[0]]]
        for i, row in enumerate(b)
    ]

    # make optimal schedule
    schedule = nest_answer(
        output,
        format_answer(color_graph(build_line_graph(output)))
    )
    return schedule
def create_dating_schedule(person_df, n_meeting=10):
    """
    Function to create speed dating schedule at CCN 2018 conference

    Parameters
    ==========
    person_df: pandas dataframe contains - PersonID, FullName, Abstract
        (only `PersonID` and `Abstract` are read by this function)
    n_meeting: int, number of meeting we would like to have; it is used as
        both the lower and the upper bound of every constraint handed to
        `create_lp_matrix`

    Output
    ======
    schedule: the value returned by `nest_answer`; per the original
        description, a list of person id and person ids to meet in the
        following format: [PersonID, [PersonID to meet]]
    """
    # linear programming
    # Preprocess every abstract once. The same texts sit on both sides of
    # the affinity matrix, so the second argument is a plain copy of the
    # first instead of a second preprocessing pass.
    abstracts = list(map(preprocess, person_df['Abstract']))
    A = affinity_computation(
        abstracts, list(abstracts),
        n_components=10, min_df=1, max_df=0.8,
        weighting='tfidf', projection='pca'
    )

    # constraints, conflict of interest: put a large negative affinity on
    # the diagonal, i.e. on every (person, same person) pair
    diagonal = np.arange(len(A))
    A[diagonal, diagonal] = -1000

    # for dating at CCN
    v, K, d = create_lp_matrix(
        A,
        min_reviewers_per_paper=n_meeting,
        max_reviewers_per_paper=n_meeting,
        min_papers_per_reviewer=n_meeting,
        max_papers_per_reviewer=n_meeting
    )
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A)

    # Materialize the id column once; row/column positions of `b` are
    # translated back to PersonID through this list.
    person_ids = list(person_df['PersonID'])
    output = [
        [person_ids[i], [person_ids[j] for j in np.nonzero(row)[0]]]
        for i, row in enumerate(b)
    ]

    # make optimal schedule
    schedule = nest_answer(
        output,
        format_answer(color_graph(build_line_graph(output)))
    )
    return schedule
def assign_articles_to_reviewers(article_df, reviewer_df, people_df):
    """
    Perform reviewer-assignment from dataframe of article, reviewer, and people

    Parameters
    ==========
    article_df: a dataframe that has columns `PaperID`, `Title`, `Abstract`, and `PersonIDList`
        where PersonIDList contains string of semicolon separated list of PersonID
        (blank entries, e.g. from a trailing semicolon, are ignored)
    reviewer_df: a dataframe that has columns `PersonID` and `Abstract`
    people_df: dataframe that has columns `PersonID`, `FullName`

    We assume `PersonID` is an integer

    The helper columns `paper_id` / `person_id` are added to copies only, so
    `article_df` and `reviewer_df` are left untouched.

    Output
    ======
    article_assignment_df: an assigned reviewers dataframe, each row of article will have
        list of reviewers in `ReviewerIDList` column and their name in reviewer_names
        (both stored as semicolon joined strings)
    """
    papers = list(
        (article_df['Title'] + ' ' + article_df['Abstract']).map(preprocess)
    )
    reviewers = list(reviewer_df['Abstract'].map(preprocess))

    # Calculate conflict of interest based on co-authors
    coauthors_df = pd.DataFrame(
        [
            [int(r.PaperID), int(co_author)]
            for _, r in article_df.iterrows()
            for co_author in r.PersonIDList.split(';')
            if co_author.strip()  # int('') would raise on an empty token
        ],
        columns=['PaperID', 'PersonID']
    )

    # Positional ids (row number in each frame) are the row/column indices
    # of the affinity matrix. `assign` returns new frames, so the caller's
    # dataframes do not gain extra columns.
    articles = article_df.assign(paper_id=list(range(len(article_df))))
    reviewers_indexed = reviewer_df.assign(
        person_id=list(range(len(reviewer_df)))
    )
    coi_df = coauthors_df.merge(
        articles[['PaperID', 'paper_id']], on='PaperID'
    ).merge(
        reviewers_indexed[['PersonID', 'person_id']], on='PersonID'
    )[['paper_id', 'person_id']]

    # calculate affinity matrix
    A = affinity_computation(papers, reviewers,
                             n_components=10, min_df=2, max_df=0.8,
                             weighting='tfidf', projection='pca')

    # Trimming: in every row, the entries with the lowest values (up to
    # `n_lowest` of them) are overwritten with 0.
    # NOTE(review): with `n_lowest` or fewer reviewers the whole row becomes
    # 0 - confirm this is intended for small reviewer pools.
    n_lowest = 200
    A_trim = np.array(A)  # work on a copy of the affinity matrix
    for row in A_trim:
        row[np.argsort(row)[:n_lowest]] = 0

    # assign conflict of interest to have high negative cost
    for i, j in zip(coi_df.paper_id.tolist(), coi_df.person_id.tolist()):
        A_trim[i, j] = -1000

    # for CCN case,
    v, K, d = create_lp_matrix(A_trim,
                               min_reviewers_per_paper=6, max_reviewers_per_paper=6,
                               min_papers_per_reviewer=4, max_papers_per_reviewer=6)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A_trim)

    reviewer_ids = list(reviewer_df.PersonID)
    # map reviewer id to reviewer name
    reviewer_name_dict = dict(zip(people_df['PersonID'], people_df['FullName']))

    assignments = []
    for i, row in enumerate(b):
        # column positions of the nonzero entries in this paper's row
        assigned_ids = [reviewer_ids[j] for j in np.nonzero(row)[0]]
        assignments.append([
            i,
            ';'.join(str(rid) for rid in assigned_ids),
            ';'.join(reviewer_name_dict[rid] for rid in assigned_ids),
        ])
    assignments_df = pd.DataFrame(
        assignments,
        columns=['paper_id', 'ReviewerIDList', 'reviewer_names']
    )

    # Attach the assignment to each article and drop the helper column.
    article_assignment_df = articles.merge(
        assignments_df, on='paper_id'
    ).drop('paper_id', axis=1)
    return article_assignment_df
) else: n_trim = int(n_trim) print('Trimming parameter is set to {}'.format(n_trim)) output_filename = arguments.get('output') if output_filename is None: output_filename = 'output_match.csv' # create assignment matrix persons_1 = list(map(preprocess, list(df['abstracts']))) persons_2 = list(map(preprocess, list(df['abstracts']))) A = affinity_computation(persons_1, persons_2, n_components=30, min_df=3, max_df=0.85, weighting='tfidf', projection='pca') A[np.arange(len(A)), np.arange(len(A) )] = -1000 # set diagonal to prevent matching with themselve print('Compute conflicts... (this may take a bit)') cois = compute_conflicts(df) A[cois] = -1000 print('Done computing conflicts!') # trimming affinity matrix to reduce the problem size if n_trim != 0: A_trim = []
columns={'institute_longitude': 'lng', 'institute_latitude': 'lat'} ) D_lat_lng = [] for _, r1 in lat_lng_df.iterrows(): D_lat_lng.append([ calculate_geo_distance(r1.lat, r1.lng, r2.lat, r2.lng) for _, r2 in lat_lng_df.iterrows() ]) D_lat_lng_scale = scaler.fit_transform(D_lat_lng) D_lat_lng_scale = pd.DataFrame(D_lat_lng_scale).fillna(np.nanmean(D_lat_lng_scale)).values # calculate topic distance between statement persons_1 = list(map(preprocess, list(df['Statement']))) persons_2 = list(map(preprocess, list(df['Statement']))) D_statement = - affinity_computation(persons_1, persons_2, n_components=30, min_df=2, max_df=0.8, weighting='tfidf', projection='svd') std_topic = D_statement.std() # clustering D_final = (D_statement) + (10 * std_topic * D_tz) + (std_topic * D_lat_lng_scale) # final distance X_mds = MDS(n_components=30).fit_transform(D_final) clusters_kmean, centers_kmean = cop_kmeans(dataset=X_mds, k=200, cl=cannot_link) output_df = df[selected_cols] output_df['pod_number'] = clusters_kmean # rearrange df_rearrange = [] pod_num = 1 for _, df_tz in output_df.groupby('timezone'): for _, df_pod_num in df_tz.groupby('pod_number'):
lambda x: [x]) + reviewer_b_df['CollaboratorsList'] reviewer_df = pd.concat( (reviewer_a_df, reviewer_b_df)).reset_index(drop=True) # affinity matrix papers = list((submission_df['keywords'] + ' ' + submission_df['Title'] + ' ' + submission_df['Abstract']).map(preprocess)) reviewers_a = list((reviewer_a_df['keywords'] + ' ' + reviewer_a_df['SampleAbstract1'].fillna('') + ' ' + reviewer_a_df['SampleAbstract2'].fillna('')).map(preprocess)) reviewers_b = list((reviewer_b_df['keywords'] + ' ' + reviewer_b_df['SampleAbstract1'].fillna('') + ' ' + reviewer_b_df['SampleAbstract2'].fillna('')).map(preprocess)) A = affinity_computation(papers, reviewers_a + reviewers_b, n_components=15, min_df=2, max_df=0.85, weighting='tfidf', projection='pca') # COIs cois_ids = submission_df.AuthorIds.map( lambda x: create_coi_author_ids(x, reviewer_df)) cois = submission_df.AuthorsList.map( lambda x: create_coi_list(x, reviewer_df)) cois_df = pd.DataFrame(cois + cois_ids, columns=['AuthorsList']) for i, r in cois_df.iterrows(): if len(r['AuthorsList']) > 0: for idx in r['AuthorsList']: A[i, idx] = -1000 # assignment A_a, A_b = A[:, :len(reviewer_a_df)], A[:, len(reviewer_a_df):]