def create_dating_schedule(person_df, n_meeting=10):
    """
    Create a speed-dating schedule (originally for the CCN 2018 conference).

    Parameters
    ==========
    person_df: pandas dataframe with columns ``PersonID``, ``FullName``,
        and ``Abstract``
    n_meeting: int, number of meetings each person should have

    Output
    ======
    schedule: list, list of person id and person ids to meet in the
        following format: [PersonID, [PersonID to meet]], arranged into
        conflict-free rounds
    """
    # Preprocess each abstract once; the affinity is person-vs-person, so the
    # same preprocessed list serves as both sides (previously computed twice).
    persons = list(map(preprocess, list(person_df['Abstract'])))

    A = affinity_computation(persons,
                             persons,
                             n_components=10,
                             min_df=1,
                             max_df=0.8,
                             weighting='tfidf',
                             projection='pca')
    # A strongly negative diagonal prevents a person from matching themselves.
    A[np.arange(len(A)), np.arange(len(A))] = -1000

    # Linear-program formulation: everyone gets exactly n_meeting partners
    # (min == max on both sides of the bipartite constraint matrix).
    v, K, d = create_lp_matrix(A,
                               min_reviewers_per_paper=n_meeting,
                               max_reviewers_per_paper=n_meeting,
                               min_papers_per_reviewer=n_meeting,
                               max_papers_per_reviewer=n_meeting)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A)

    # Build the PersonID list once (was rebuilt on every loop iteration).
    person_ids = list(person_df['PersonID'])
    output = []
    for i in range(len(b)):
        partners = [person_ids[j] for j in np.nonzero(b[i])[0]]
        output.append([person_ids[i], partners])

    # Color the line graph of pairings to arrange non-overlapping rounds.
    schedule = nest_answer(
        output, format_answer(color_graph(build_line_graph(output))))

    return schedule
def create_dating_schedule(person_df, n_meeting=10):
    """
    Create a speed-dating schedule (CCN 2018 conference).

    Parameters
    ==========
    person_df: pandas dataframe with columns ``PersonID``, ``FullName``,
        and ``Abstract``
    n_meeting: int, number of meetings each person should have

    Output
    ======
    schedule: list, list of person id and person ids to meet in the
        following format: [PersonID, [PersonID to meet]], arranged into
        conflict-free rounds
    """
    # Preprocess abstracts a single time; the same list feeds both sides of
    # the person-vs-person affinity computation (was duplicated before).
    persons = list(map(preprocess, list(person_df['Abstract'])))

    A = affinity_computation(persons, persons,
                             n_components=10, min_df=1, max_df=0.8,
                             weighting='tfidf', projection='pca')
    # Block self-matching via a strongly negative diagonal.
    A[np.arange(len(A)), np.arange(len(A))] = -1000

    # Everyone must meet exactly n_meeting partners (min == max bounds).
    v, K, d = create_lp_matrix(
        A,
        min_reviewers_per_paper=n_meeting, max_reviewers_per_paper=n_meeting,
        min_papers_per_reviewer=n_meeting, max_papers_per_reviewer=n_meeting
    )
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A)

    # Hoist the PersonID list out of the loop (was rebuilt per iteration).
    person_ids = list(person_df['PersonID'])
    output = []
    for i in range(len(b)):
        r = [person_ids[j] for j in np.nonzero(b[i])[0]]
        output.append([person_ids[i], r])

    # Arrange the pairings into non-overlapping rounds via graph coloring.
    schedule = nest_answer(output, format_answer(color_graph(build_line_graph(output))))

    return schedule
def assign_articles_to_reviewers(article_df, reviewer_df, people_df):
    """
    Perform reviewer-assignment from dataframes of article, reviewer, and people.

    Parameters
    ==========
    article_df: a dataframe that has columns `PaperID`, `Title`, `Abstract`,
        and `PersonIDList` where `PersonIDList` contains a string of
        semicolon-separated PersonIDs (the paper's co-authors)
    reviewer_df: a dataframe that has columns `PersonID` and `Abstract`
    people_df: a dataframe that has columns `PersonID` and `FullName`

    We assume `PersonID` is an integer.

    NOTE: mutates ``article_df`` and ``reviewer_df`` in place by adding
    positional index columns (`paper_id` / `person_id`).

    Output
    ======
    article_assignment_df: an assigned-reviewers dataframe; each article row
        gains a semicolon-joined `ReviewerIDList` column and the matching
        names in `reviewer_names`
    """
    papers = list(
        (article_df['Title'] + ' ' + article_df['Abstract']).map(preprocess))
    reviewers = list(reviewer_df['Abstract'].map(preprocess))

    # Conflict of interest: a reviewer must never review a paper they
    # co-authored. Explode PersonIDList into (PaperID, PersonID) pairs.
    coauthors_df = pd.DataFrame(
        [[int(r.PaperID), int(co_author)] for _, r in article_df.iterrows()
         for co_author in r.PersonIDList.split(';')],
        columns=['PaperID', 'PersonID'])
    article_df['paper_id'] = list(range(len(article_df)))
    reviewer_df['person_id'] = list(range(len(reviewer_df)))
    # Map external IDs to positional row indices into the affinity matrix.
    coi_df = coauthors_df.merge(article_df[['PaperID', 'paper_id']],
                                on='PaperID').merge(
                                    reviewer_df[['PersonID', 'person_id']],
                                    on='PersonID')[['paper_id', 'person_id']]

    # calculate paper-vs-reviewer affinity matrix
    A = affinity_computation(papers,
                             reviewers,
                             n_components=10,
                             min_df=2,
                             max_df=0.8,
                             weighting='tfidf',
                             projection='pca')

    # Trim: zero out each paper's 200 weakest affinities to shrink the LP.
    # Copy each row first — `A[r, :]` is a view, so assigning through it
    # would silently mutate A itself.
    A_trim = []
    for r in range(len(A)):
        a = A[r, :].copy()
        a[np.argsort(a)[0:200]] = 0
        A_trim.append(a)
    A_trim = np.vstack(A_trim)

    # Conflict-of-interest pairs get a large negative cost so the LP
    # never selects them.
    for i, j in zip(coi_df.paper_id.tolist(), coi_df.person_id.tolist()):
        A_trim[i, j] = -1000

    # LP bounds used for the CCN case: exactly 6 reviewers per paper,
    # 4 to 6 papers per reviewer.
    v, K, d = create_lp_matrix(A_trim,
                               min_reviewers_per_paper=6,
                               max_reviewers_per_paper=6,
                               min_papers_per_reviewer=4,
                               max_papers_per_reviewer=6)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A_trim)

    reviewer_ids = list(reviewer_df.PersonID)
    # map reviewer id to reviewer name
    reviewer_name_dict = {
        r['PersonID']: r['FullName']
        for _, r in people_df.iterrows()
    }
    assignments = []
    for i in range(len(b)):
        assigned = np.nonzero(b[i])[0]  # compute once (was evaluated twice)
        assignments.append([
            i,
            [reviewer_ids[j] for j in assigned],
            [reviewer_name_dict[reviewer_ids[j]] for j in assigned],
        ])
    assignments_df = pd.DataFrame(
        assignments, columns=['paper_id', 'ReviewerIDList', 'reviewer_names'])
    assignments_df['ReviewerIDList'] = assignments_df.ReviewerIDList.map(
        lambda e: ';'.join(str(e_) for e_ in e))
    assignments_df['reviewer_names'] = assignments_df.reviewer_names.map(
        lambda x: ';'.join(x))
    # Join assignments back onto the articles, dropping the helper index.
    article_assignment_df = article_df.merge(assignments_df,
                                             on='paper_id').drop('paper_id',
                                                                 axis=1)
    return article_assignment_df
# Example #4 (score: 0) — the fragment below begins mid-function and is incomplete
        )
    else:
        n_trim = int(n_trim)
        print('Trimming parameter is set to {}'.format(n_trim))

    output_filename = arguments.get('output')
    if output_filename is None:
        output_filename = 'output_match.csv'

    # create assignment matrix
    persons_1 = list(map(preprocess, list(df['abstracts'])))
    persons_2 = list(map(preprocess, list(df['abstracts'])))
    A = affinity_computation(persons_1,
                             persons_2,
                             n_components=30,
                             min_df=3,
                             max_df=0.85,
                             weighting='tfidf',
                             projection='pca')
    A[np.arange(len(A)),
      np.arange(len(A)
                )] = -1000  # set diagonal to prevent matching with themselve

    print('Compute conflicts... (this may take a bit)')
    cois = compute_conflicts(df)
    A[cois] = -1000
    print('Done computing conflicts!')

    # trimming affinity matrix to reduce the problem size
    if n_trim != 0:
        A_trim = []
# Example #5 (score: 0) — the fragment below begins mid-function and is incomplete
        columns={'institute_longitude': 'lng', 'institute_latitude': 'lat'}
    )
    D_lat_lng = []
    for _, r1 in lat_lng_df.iterrows():
        D_lat_lng.append([
            calculate_geo_distance(r1.lat, r1.lng, r2.lat, r2.lng)
                                    for _, r2 in lat_lng_df.iterrows()
        ])
    D_lat_lng_scale = scaler.fit_transform(D_lat_lng)
    D_lat_lng_scale = pd.DataFrame(D_lat_lng_scale).fillna(np.nanmean(D_lat_lng_scale)).values

    # calculate topic distance between statement
    persons_1 = list(map(preprocess, list(df['Statement'])))
    persons_2 = list(map(preprocess, list(df['Statement'])))
    D_statement = - affinity_computation(persons_1, persons_2,
                                        n_components=30, min_df=2, max_df=0.8,
                                        weighting='tfidf', projection='svd')
    std_topic = D_statement.std()

    # clustering
    D_final = (D_statement) + (10 * std_topic * D_tz) + (std_topic * D_lat_lng_scale) # final distance
    X_mds = MDS(n_components=30).fit_transform(D_final)
    clusters_kmean, centers_kmean = cop_kmeans(dataset=X_mds, k=200, cl=cannot_link)
    output_df = df[selected_cols]
    output_df['pod_number'] = clusters_kmean

    # rearrange
    df_rearrange = []
    pod_num = 1
    for _, df_tz in output_df.groupby('timezone'):
        for _, df_pod_num in df_tz.groupby('pod_number'):
# Example #6 (score: 0) — the fragment below begins mid-function and is incomplete
        lambda x: [x]) + reviewer_b_df['CollaboratorsList']
    reviewer_df = pd.concat(
        (reviewer_a_df, reviewer_b_df)).reset_index(drop=True)

    # affinity matrix
    papers = list((submission_df['keywords'] +
                   ' ' + submission_df['Title'] +
                   ' ' + submission_df['Abstract']).map(preprocess))
    reviewers_a = list((reviewer_a_df['keywords'] +
                        ' ' + reviewer_a_df['SampleAbstract1'].fillna('') +
                        ' ' + reviewer_a_df['SampleAbstract2'].fillna('')).map(preprocess))
    reviewers_b = list((reviewer_b_df['keywords'] +
                        ' ' + reviewer_b_df['SampleAbstract1'].fillna('') +
                        ' ' + reviewer_b_df['SampleAbstract2'].fillna('')).map(preprocess))
    A = affinity_computation(papers, reviewers_a + reviewers_b,
                             n_components=15, min_df=2, max_df=0.85,
                             weighting='tfidf', projection='pca')

    # COIs
    cois_ids = submission_df.AuthorIds.map(
        lambda x: create_coi_author_ids(x, reviewer_df))
    cois = submission_df.AuthorsList.map(
        lambda x: create_coi_list(x, reviewer_df))
    cois_df = pd.DataFrame(cois + cois_ids, columns=['AuthorsList'])
    for i, r in cois_df.iterrows():
        if len(r['AuthorsList']) > 0:
            for idx in r['AuthorsList']:
                A[i, idx] = -1000

    # assignment
    A_a, A_b = A[:, :len(reviewer_a_df)], A[:, len(reviewer_a_df):]