def create_dating_schedule(person_df, n_meeting=10):
    """
    Function to create speed dating schedule at CCN 2018 conference

    Parameters
    ==========
    person_df: pandas dataframe contains - PersonID, FullName, Abstract
    n_meeting: int, number of meeting we would like to have

    Output
    ======
    schedule: list, list of person id and person ids to meet in the 
        following format: [PersonID, [PersonID to meet]]
    """
    # preprocess the abstracts once; the affinity here is person-vs-person,
    # so the same list is passed on both sides
    persons = list(map(preprocess, list(person_df['Abstract'])))

    A = affinity_computation(persons,
                             persons,
                             n_components=10,
                             min_df=1,
                             max_df=0.8,
                             weighting='tfidf',
                             projection='pca')
    # conflict of interest: set the diagonal to a large negative affinity
    # so no one is matched with themselves
    A[np.arange(len(A)), np.arange(len(A))] = -1000

    # everyone meets exactly n_meeting people, so all four bounds are equal
    v, K, d = create_lp_matrix(A,
                               min_reviewers_per_paper=n_meeting,
                               max_reviewers_per_paper=n_meeting,
                               min_papers_per_reviewer=n_meeting,
                               max_papers_per_reviewer=n_meeting)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A)

    person_ids = list(person_df['PersonID'])
    output = []
    for i in range(len(b)):
        r = [person_ids[b_] for b_ in np.nonzero(b[i])[0]]
        output.append([person_ids[i], r])

    # schedule the meetings into time slots by coloring the line graph
    # of the match graph
    schedule = nest_answer(
        output, format_answer(color_graph(build_line_graph(output))))

    return schedule
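# --- usage sketch: hypothetical toy data; assumes the paper-reviewer-matcher
# helpers used above (preprocess, affinity_computation, linprog, ...) are
# importable in this module ---
def _example_dating_schedule():
    import pandas as pd
    person_df = pd.DataFrame({
        'PersonID': [1, 2, 3, 4],
        'FullName': ['Ann', 'Ben', 'Cat', 'Dan'],
        'Abstract': ['neural coding of sound',
                     'deep networks for vision',
                     'bayesian models of learning',
                     'motor control and planning'],
    })
    # everyone meets exactly two other attendees; see the docstring
    # above for the output format
    return create_dating_schedule(person_df, n_meeting=2)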
def assign_articles_to_reviewers(article_df, reviewer_df, people_df):
    """
    Perform reviewer-assignment from dataframe of article, reviewer, and people

    Parameters
    ==========
    article_df: a dataframe that has columns `PaperID`, `Title`, `Abstract`, and `PersonIDList`
        where PersonIDList contains string of simicolon separated list of PersonID
    reviewer_df: a dataframe that has columns `PersonID` and `Abstract`
    people_df:  dataframe that has columns `PersonID`, `FullName`

    We assume `PersonID` is an integer

    Output
    ======
    article_assignment_df: an assigned reviewers dataframe, each row of article will have 
        list of reviewers in `ReviewerIDList` column and their name in reviewer_names
    """
    papers = list(
        (article_df['Title'] + ' ' + article_df['Abstract']).map(preprocess))
    reviewers = list(reviewer_df['Abstract'].map(preprocess))

    # Calculate conflict of interest based on co-authors
    coauthors_df = pd.DataFrame(
        [[int(r.PaperID), int(co_author)] for _, r in article_df.iterrows()
         for co_author in r.PersonIDList.split(';')],
        columns=['PaperID', 'PersonID'])
    article_df['paper_id'] = list(range(len(article_df)))
    reviewer_df['person_id'] = list(range(len(reviewer_df)))
    coi_df = coauthors_df.merge(article_df[['PaperID', 'paper_id']],
                                on='PaperID').merge(
                                    reviewer_df[['PersonID', 'person_id']],
                                    on='PersonID')[['paper_id', 'person_id']]

    # calculate affinity matrix
    A = affinity_computation(papers,
                             reviewers,
                             n_components=10,
                             min_df=2,
                             max_df=0.8,
                             weighting='tfidf',
                             projection='pca')

    # sparsify the affinity matrix: for each paper, zero out the 200 lowest
    # affinities so only the strongest paper-reviewer pairs remain
    A_trim = []
    for r in range(len(A)):
        a = A[r, :].copy()  # copy so the original affinity matrix is not mutated
        a[np.argsort(a)[0:200]] = 0
        A_trim.append(a)
    A_trim = np.vstack(A_trim)

    # give conflict-of-interest pairs a large negative cost so they are never assigned
    for i, j in zip(coi_df.paper_id.tolist(), coi_df.person_id.tolist()):
        A_trim[i, j] = -1000

    # CCN constraints: exactly 6 reviewers per paper, 4 to 6 papers per reviewer
    v, K, d = create_lp_matrix(A_trim,
                               min_reviewers_per_paper=6,
                               max_reviewers_per_paper=6,
                               min_papers_per_reviewer=4,
                               max_papers_per_reviewer=6)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A_trim)
    reviewer_ids = list(reviewer_df.PersonID)
    reviewer_name_dict = {
        r['PersonID']: r['FullName']
        for _, r in people_df.iterrows()
    }  # map reviewer id to reviewer name
    assignments = []
    for i in range(len(b)):
        assignments.append([
            i, [reviewer_ids[b_] for b_ in np.nonzero(b[i])[0]],
            [
                reviewer_name_dict[reviewer_ids[b_]]
                for b_ in np.nonzero(b[i])[0]
            ]
        ])
    assignments_df = pd.DataFrame(
        assignments, columns=['paper_id', 'ReviewerIDList', 'reviewer_names'])
    assignments_df['ReviewerIDList'] = assignments_df.ReviewerIDList.map(
        lambda e: ';'.join(str(e_) for e_ in e))
    assignments_df['reviewer_names'] = assignments_df.reviewer_names.map(
        lambda x: ';'.join(x))
    article_assignment_df = article_df.merge(assignments_df,
                                             on='paper_id').drop('paper_id',
                                                                 axis=1)
    return article_assignment_df
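# --- usage sketch: the input frames here are placeholders for the caller's
# real data. Note the hard-coded CCN bounds above (6 reviewers per paper,
# 4-6 papers per reviewer), so the reviewer pool must be large enough to
# satisfy them ---
def _example_reviewer_assignment(article_df, reviewer_df, people_df):
    # article_df: PaperID, Title, Abstract, PersonIDList ('10;11' style)
    # reviewer_df: PersonID, Abstract; people_df: PersonID, FullName
    assignment_df = assign_articles_to_reviewers(article_df, reviewer_df, people_df)
    return assignment_df[['PaperID', 'ReviewerIDList', 'reviewer_names']]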
Example #4
        for r in range(len(A)):
            a = A[r, :]
            a[np.argsort(a)[0:n_trim]] = 0
            A_trim.append(a)
        A_trim = np.vstack(A_trim)
    else:
        A_trim = A

    print('Solving a matching problem...')
    v, K, d = create_lp_matrix(A_trim,
                               min_reviewers_per_paper=n_match,
                               max_reviewers_per_paper=n_match,
                               min_papers_per_reviewer=n_match,
                               max_papers_per_reviewer=n_match)
    x_sol = linprog(v, K, d)['x']
    b = create_assignment(x_sol, A_trim)
    if b.sum() == 0:
        print('It seems the problem did not converge; '
              'try reducing <n_trim>, but not too much!')
    else:
        print('Successfully assigned all the matches!')

    if b.sum() != 0:
        output = []
        user_ids_map = {ri: r['user_id'] for ri, r in df.iterrows()}
        for i in range(len(b)):
            match_ids = [str(user_ids_map[b_]) for b_ in np.nonzero(b[i])[0]]
            output.append({
                'user_id': user_ids_map[i],
                'match_ids': ';'.join(match_ids)
Example #5
        lambda x: create_coi_author_ids(x, reviewer_df))
    cois = submission_df.AuthorsList.map(
        lambda x: create_coi_list(x, reviewer_df))
    cois_df = pd.DataFrame(cois + cois_ids, columns=['AuthorsList'])
    for i, r in cois_df.iterrows():
        if len(r['AuthorsList']) > 0:
            for idx in r['AuthorsList']:
                A[i, idx] = -1000

    # assignment: split the affinity matrix into the two reviewer pools
    # and solve a separate matching problem for each
    A_a, A_b = A[:, :len(reviewer_a_df)], A[:, len(reviewer_a_df):]
    v, K, d = create_lp_matrix(A_a,
                               min_reviewers_per_paper=2, max_reviewers_per_paper=2,
                               min_papers_per_reviewer=10, max_papers_per_reviewer=12)
    x_sol = linprog(v, K, d)['x']
    b_a = create_assignment(x_sol, A_a)

    v, K, d = create_lp_matrix(A_b,
                               min_reviewers_per_paper=2, max_reviewers_per_paper=2,
                               min_papers_per_reviewer=10, max_papers_per_reviewer=12)
    x_sol = linprog(v, K, d)['x']
    b_b = create_assignment(x_sol, A_b)

    reviewer_a_map = {i: r['UserID'] for i, r in reviewer_a_df.iterrows()}
    reviewer_b_map = {i: r['UserID'] for i, r in reviewer_b_df.iterrows()}
    paper_id_map = {i: r['PaperID'] for i, r in submission_df.iterrows()}

    assignments_a_df = create_assignment_dataframe(b_a, reviewer_a_map,
                                                   paper_id_map,
                                                   pool_group='a')
    assignments_b_df = create_assignment_dataframe(b_b, reviewer_b_map,
Example #6
    def calculate_result(self, reviewer_data, article_data, people_data,
                         coi_data,
                         min_rev_art, max_rev_art, min_art_rev, max_art_rev):
        """
        Generates a csv file with the resulting assignment while it updates the status
        of the process using Celery
        """

        cur_progress = 0
        max_progress = 100

        article_data = pd.DataFrame(article_data)
        people_data = pd.DataFrame(people_data)
        coauthors_df = pd.DataFrame([[r.PaperID, co_author]
                                     for _, r in article_data.iterrows()
                                     for co_author in r.PersonIDList.split(';')],
                                    columns=['PaperID', 'PersonID'])

        if reviewer_data is None:
            # extract reviewer data from articles
            coauthor_articles = coauthors_df.merge(article_data)[['PersonID', 'Abstract']]
            coauthor_abstracts = coauthor_articles.groupby('PersonID').\
                                                   agg({'Abstract': lambda x: ''.join(x)})
            reviewer_data = pd.DataFrame(list(zip(coauthor_abstracts.index,
                                                  coauthor_abstracts.Abstract)),
                                         columns=['PersonID', 'Abstract'])
        else:
            reviewer_data = pd.DataFrame(reviewer_data)
            reviewer_data.PersonID = reviewer_data.PersonID.apply(str)

        if coi_data is not None:
            coi_data = pd.DataFrame(coi_data)

        update_frequency = 1
        cur_progress += int(max_progress / 6.)
        self.update_progress(
            cur_progress,
            max_progress,
            update_frequency=update_frequency,
        )


        # this performs the topic modeling (LSA)
        a = prm.compute_affinity(reviewer_data.Abstract, article_data.Abstract)
        cur_progress += int(max_progress / 6.)
        self.update_progress(
            cur_progress,
            max_progress,
            update_frequency=update_frequency,
        )

        # if coi_data available, then add as if they were co-authors
        if coi_data is not None:
            coi_data.PersonID = coi_data.PersonID.apply(str)
            coauthors_df = pd.concat((coauthors_df, coi_data))


        # articles
        article_data2 = article_data.copy()
        article_data2.index = article_data2.PaperID
        article_data2['id'] = range(article_data2.shape[0])
        coi_row = np.array(article_data2.loc[coauthors_df.PaperID].id.tolist())

        # persons
        reviewer_data2 = reviewer_data.copy()
        reviewer_data2.index = reviewer_data2.PersonID
        reviewer_data2['id'] = range(reviewer_data2.shape[0])
        coi_column = np.array(reviewer_data2.loc[coauthors_df.PersonID].id.tolist())

        for i, j in zip(coi_row, coi_column):
            a[i, j] = -1000.  # large negative cost, effectively -np.inf

        v, A, d = prm.create_lp_matrices(a, min_rev_art, max_rev_art,
                                         min_art_rev, max_art_rev)
        v = v.flatten()
        d = d.flatten()

        cur_progress += int(max_progress / 6.)
        self.update_progress(
            cur_progress,
            max_progress,
            update_frequency=update_frequency,
        )

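        # GLOP solves the continuous LP relaxation of the binary assignment
        # problem (maximize v'x subject to A x <= d); the solution is
        # thresholded at 0.5 further below. A appears to be a scipy sparse
        # matrix in COO format, hence the A.row / A.col / A.data access in
        # the constraint loop.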
        solver = pywraplp.Solver('SolveReviewerAssignment',
                                 pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)
        infinity = solver.Infinity()
        n, m = A.shape
        x = [None] * m
        c = [None] * n

        for j in range(m):
            x[j] = solver.NumVar(-infinity, infinity, 'x_%u' % j)

        # state objective function
        objective = solver.Objective()
        for j in range(m):
            objective.SetCoefficient(x[j], v[j])
        objective.SetMaximization()

        # state the constraints
        for i in range(n):
            c[i] = solver.Constraint(-infinity, d[i])

            # update status bar
            if np.mod(i, int(n/10)) == 0:
                cur_progress += 3
                self.update_progress(
                    cur_progress,
                    max_progress,
                    update_frequency=update_frequency,
                )

            for j in A.col[A.row == i]:
                c[i].SetCoefficient(
                    x[j], A.data[np.logical_and(A.row == i, A.col == j)][0])

        result_status = solver.Solve()
        if result_status != pywraplp.Solver.OPTIMAL:
            print('The final solution might not have converged')

        x_sol = np.array([x_tmp.SolutionValue() for x_tmp in x])

        # x = prm.linprog_solve(v, ne, d)
        x_sol = (x_sol > 0.5)  # round the LP relaxation to a binary assignment

        cur_progress += int(max_progress / 6.)
        self.update_progress(
            4 * int(max_progress / 6.),
            max_progress,
            update_frequency=update_frequency,
        )

        b = prm.create_assignment(x_sol, a)
        self.update_progress(
            5 * int(max_progress / 6.),
            max_progress,
            update_frequency=update_frequency,
        )

        assignment_df = article_data[['PaperID', 'Title']].copy()  # copy to avoid SettingWithCopyWarning
        assignment_df['Reviewers'] = ''
        assignment_df['ReviewerIDs'] = ''
        for i in range(b.shape[0]):
            paper_reviewers = np.where(b[i, :])[0]
            assignment_df.Reviewers.iloc[i] = ', '.join(list(people_data.FullName.iloc[paper_reviewers].copy()))
            # assignment_df.ReviewerIDs.iloc[i] = ', '.join(list(people_data.PersonID.iloc[paper_reviewers].copy()))
        self.update_progress(
            6 * int(max_progress / 6.),
            max_progress,
            update_frequency=update_frequency,
        )

        # make sure the text columns are unicode before writing the CSV
        # (the original applies discarded their results, which was a no-op)
        assignment_df['Title'] = assignment_df.Title.apply(unicode)
        assignment_df['Reviewers'] = assignment_df.Reviewers.apply(unicode)

        # , 'result': assignment_df.to_csv(None, na_rep='', index=False)
        # return {'task': {'status': 'SUCCESS'}}
        return assignment_df.to_csv(None, na_rep='', index=False, encoding='utf-8')
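# A minimal, self-contained sketch of the pywraplp pattern used above
# (maximize v'x subject to K x <= d); the variable names and the tiny LP
# here are illustrative, not part of paper-reviewer-matcher:
from ortools.linear_solver import pywraplp

def solve_small_lp():
    solver = pywraplp.Solver('SmallLP', pywraplp.Solver.GLOP_LINEAR_PROGRAMMING)
    x = solver.NumVar(0.0, 1.0, 'x')
    y = solver.NumVar(0.0, 1.0, 'y')
    # one capacity-style constraint: x + y <= 1.5
    c = solver.Constraint(-solver.Infinity(), 1.5)
    c.SetCoefficient(x, 1.0)
    c.SetCoefficient(y, 1.0)
    # objective: maximize 2x + y
    objective = solver.Objective()
    objective.SetCoefficient(x, 2.0)
    objective.SetCoefficient(y, 1.0)
    objective.SetMaximization()
    assert solver.Solve() == pywraplp.Solver.OPTIMAL
    return x.SolutionValue(), y.SolutionValue()  # -> (1.0, 0.5)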