def random_algorithm(user, candidate_news, num_timestep=1000, top_N=1):
    '''
    Random baseline: show one randomly selected article per timestep.

    Args:
        user: user representation, passed through to `user_interaction`.
        candidate_news: dict mapping article pk -> article dict. Each
            article dict must carry an "article_id" entry, which is used
            to drop the article from the remaining pool.
        num_timestep: number of articles to show.
        top_N: forwarded to `random_select`; only the first pk it
            returns is actually shown.

    Returns:
        dict with "clicked" (articles for which `user_interaction`
        returned a truthy value) and "shown" (every article shown, in
        order).
    '''
    history = {"clicked": [], "shown": []}
    remaining_pks = set(candidate_news.keys())

    for _ in range(num_timestep):
        picked_pks = random_select(remaining_pks, top_N)
        article = candidate_news[picked_pks[0]]

        if user_interaction(user, article, ranked=False):
            history["clicked"].append(article)
        history["shown"].append(article)

        # An article is never shown twice.
        # NOTE(review): removal goes through "article_id", so this
        # presumably relies on article_id == pk; confirm against the data.
        remaining_pks.remove(article["article_id"])

    return history
def oracle_algorithm(user, candidate_news, num_timestep=1000, top_N=1):
    '''
    Oracle baseline: show the articles returned by `highest`, in order.

    Args:
        user: user representation, passed to `highest` and
            `user_interaction`.
        candidate_news: dict mapping article pk -> article dict (each
            with an "article_id" entry).
        num_timestep: number of articles to show; also passed to
            `highest` as the number of pks requested.
        top_N: accepted for signature parity with the other algorithms;
            not used in this function.

    Returns:
        dict with "clicked" (articles the user interacted with) and
        "shown" (every article shown, in order).
    '''
    history = {"clicked": [], "shown": []}
    remaining_pks = set(candidate_news.keys())

    # The whole ranking is computed once up front.
    # Per the original author's note, `highest` returns the pks sorted in
    # descending order; TODO confirm against its definition.
    ranked_pks = highest(user, candidate_news, num_timestep)

    for step in range(num_timestep):
        article = candidate_news[ranked_pks[step]]

        if user_interaction(user, article, ranked=False):
            history["clicked"].append(article)
        history["shown"].append(article)

        # Pool bookkeeping only; the pool is not consulted for selection.
        remaining_pks.remove(article["article_id"])

    return history
def random_bootstrap_v2(user_vector, issue_mapping, candidate_news, budget,
                        seed=42):
    '''
    Bootstrap by showing `budget` articles drawn uniformly at random.

    Args:
        user_vector: user representation, passed to `user_interaction`.
        issue_mapping: not used by this function.
        candidate_news: dict mapping article pk -> article dict.
        budget: number of articles to sample.
        seed: not used by this function; shuffling draws from the global
            NumPy random state.

    Returns:
        (pos_pks, neg_pks): pks for which `user_interaction` returned a
        truthy / falsy value respectively, in the order they were shown.
    '''
    shuffled_pks = list(candidate_news)
    np.random.shuffle(shuffled_pks)

    pos_pks, neg_pks = [], []
    for pk in shuffled_pks[:budget]:
        clicked = user_interaction(user_vector, candidate_news[pk])
        (pos_pks if clicked else neg_pks).append(pk)

    return pos_pks, neg_pks
def cbnf_algorithm(user_vector, candidate_news, ran_idx, partisan_weights,
                   randomness=0.0, bootstrap=700, num_rec=1000):
    '''
    Content-based recommender driven by an online SGD classifier.

    The classifier is first fit on a bootstrap sample obtained from
    `random_bootstrap`, then for `num_rec` rounds one article is shown,
    the user's reaction is observed, and the classifier is updated with
    `partial_fit` on that single example.

    Args:
        user_vector: user representation, passed to `random_bootstrap`
            and `user_interaction`.
        candidate_news: dict mapping article pk -> article dict. Articles
            are read for their "feature_vector" and
            'source_partisan_score' entries.
        ran_idx: seed for the global NumPy random state.
        partisan_weights: mapping from an article's
            'source_partisan_score' to its sample weight.
        randomness: probability, per round, of showing a uniformly random
            unlabeled article instead of the top-scored one.
        bootstrap: budget passed to `random_bootstrap`.
        num_rec: number of recommendation rounds.

    Returns:
        dict with "bootstrap_pos" / "bootstrap_neg" (copies of the
        bootstrap pk lists), "clicked" and "shown" (article dicts, in
        order) and 'clf' (the fitted classifier).
    '''
    np.random.seed(ran_idx)

    issue_mapping = get_issue_mapping(candidate_news)
    pos_pks, neg_pks = random_bootstrap(user_vector, issue_mapping,
                                        candidate_news, bootstrap)

    # Whatever the bootstrap did not consume is still unlabeled.
    unlabeled_pool = set(candidate_news.keys())
    for labeled_pks in (pos_pks, neg_pks):
        unlabeled_pool.difference_update(labeled_pks)

    history = {
        "bootstrap_pos": pos_pks.copy(),
        "bootstrap_neg": neg_pks.copy(),
        "clicked": [],
        "shown": [],
    }

    x_train, y_train, w_train = build_train(candidate_news, pos_pks, neg_pks,
                                            partisan_weights)
    x_test, x_test_pk = build_test(candidate_news, unlabeled_pool)

    balanced = compute_class_weight('balanced',
                                    classes=np.unique(y_train),
                                    y=y_train)
    # NOTE(review): recent scikit-learn releases renamed loss="log" to
    # "log_loss"; confirm the pinned version still accepts this spelling.
    cls = SGDClassifier(loss="log",
                        class_weight={0: balanced[0], 1: balanced[1]})
    cls.fit(x_train, y_train, sample_weight=w_train)

    # Start to recommend.
    # NOTE(review): nothing here guards against the unlabeled pool running
    # out before num_rec rounds have been played.
    for _ in range(num_rec):
        explore = np.random.rand() < randomness
        if explore:
            top_news_idx = np.random.randint(0, len(x_test_pk))
        else:
            # Column 1 of predict_proba; last index of the ascending
            # argsort is the highest-scored row.
            click_prob = cls.predict_proba(x_test)[:, 1]
            top_news_idx = np.argsort(click_prob)[-1]

        top_news_pk = x_test_pk[top_news_idx]
        top_news = candidate_news[top_news_pk]
        unlabeled_pool.remove(top_news_pk)

        clicked = user_interaction(user_vector, top_news, ranked=True)
        features = np.array([top_news["feature_vector"]])
        if clicked:
            label = 1
            pos_pks.append(top_news_pk)
            history["clicked"].append(top_news)
        else:
            label = 0
            neg_pks.append(top_news_pk)

        # Drop the shown article from the test matrix and its parallel
        # pk list so the two stay aligned.
        x_test = np.delete(x_test, top_news_idx, 0)
        x_test_pk.pop(top_news_idx)

        weight = partisan_weights[top_news['source_partisan_score']]
        cls.partial_fit(features, [label], sample_weight=[weight])
        history["shown"].append(top_news)

    history['clf'] = cls
    return history
def random_bootstrap(user_vector, issue_mapping, candidate_news, budget,
                     seed=42):
    '''
    Bootstrap by showing an (almost) equal number of random articles from
    each of a fixed list of topics.

    Every topic but the last receives `budget // n_topics` articles; the
    last topic receives the remainder so the total equals `budget`.

    Args:
        user_vector: user representation, passed to `user_interaction`.
        issue_mapping: dict topic -> dict lean -> iterable of article pks.
            Must contain every topic in the fixed list below.
        candidate_news: dict mapping article pk -> article dict.
        budget: total number of articles to show.
        seed: not used by this function; shuffling draws from the global
            NumPy random state.

    Returns:
        (pos_pks, neg_pks): pks for which `user_interaction` returned a
        truthy / falsy value respectively, in the order they were shown.
    '''
    chosen_topic = [
        "abortion",
        "environment",
        "guns",
        "health care",
        "immigration",
        "LGBTQ",
        "racism",
        "taxes",
        "technology",
        "trade",
        "trump impeachment",
        "us military",
        "us 2020 election",
        "welfare",
    ]

    n_topics = len(chosen_topic)
    size_list = [budget // n_topics] * (n_topics - 1)
    size_list.append(budget - np.sum(size_list))

    # Flatten the per-lean pk lists into one fresh list per topic, so the
    # shuffling and popping below never touches `issue_mapping` itself.
    topic_mapping = {
        topic: [pk for pks in by_lean.values() for pk in pks]
        for topic, by_lean in issue_mapping.items()
    }

    pos_pks, neg_pks = [], []
    already_shown = set()
    for quota, topic in zip(size_list, chosen_topic):
        topic_pks = topic_mapping[topic]
        np.random.shuffle(topic_pks)

        taken = 0
        while taken < quota:
            pk = topic_pks.pop()
            if pk in already_shown:
                # Already drawn earlier; does not count toward the quota.
                continue
            if user_interaction(user_vector, candidate_news[pk]):
                pos_pks.append(pk)
            else:
                neg_pks.append(pk)
            already_shown.add(pk)
            taken += 1

    return pos_pks, neg_pks
def cf_algorithm(users, candidate_news, random_seed, randomness,
                 hidden_dim=40, bootstrap_size=700, num_timestep=1000):
    '''
    Collaborative-filtering recommender based on non-negative matrix
    factorization of the user x article click matrix.

    Each user is first bootstrapped with `random_bootstrap`. Then, for
    `num_timestep` rounds, the click matrix is factorized and every user
    is shown one article they have not seen yet: with probability
    `randomness` a random one, otherwise the unseen article with the
    highest reconstructed score.

    A user who has already seen every candidate article is skipped for
    that round, so no article is ever recorded twice for the same user.

    Args:
        users: dict keyed by consecutive integer indices 0..len(users)-1;
            each value is a dict with 'vec' (passed to
            `random_bootstrap` / `user_interaction`) and 'ptt' (copied
            into the history as 'prototype').
        candidate_news: dict mapping article pk -> article dict.
        random_seed: `random_state` for the factorization.
        randomness: per-user, per-round probability of a random pick.
        hidden_dim: number of NMF components.
        bootstrap_size: budget passed to `random_bootstrap`.
        num_timestep: number of recommendation rounds.

    Returns:
        dict user index -> {'prototype', 'shown', 'clicked'}, where
        'shown' and 'clicked' are lists of article dicts in order.
        Bootstrap articles are not included in these lists.
    '''
    issue_mapping = get_issue_mapping(candidate_news)

    # Two-way mapping between an article pk and its matrix column.
    col2pk = dict(enumerate(candidate_news.keys()))
    pk2col = {pk: col for col, pk in col2pk.items()}

    history = {}
    for idx in range(len(users)):
        history[idx] = {
            'prototype': users[idx]['ptt'],
            'shown': [],
            'clicked': [],
        }

    matrix = np.zeros((len(users), len(candidate_news)))

    # Despite the name, this tracks every article a user has been SHOWN
    # (clicked or not), so it can be used to filter out repeats.
    users_clicked_pool = {}
    for idx, user in sorted(users.items()):
        pos_pks, neg_pks = random_bootstrap(user['vec'], issue_mapping,
                                            candidate_news, bootstrap_size)
        users_clicked_pool[idx] = set(pos_pks) | set(neg_pks)
        for pk in pos_pks:
            matrix[idx, pk2col[pk]] = 1

    # Start to recommend.
    for _ in range(num_timestep):
        W, H, _ = non_negative_factorization(matrix,
                                             n_components=hidden_dim,
                                             init='random',
                                             random_state=random_seed,
                                             max_iter=250)
        new_matrix = np.matmul(W, H)

        for idx, row in enumerate(new_matrix):
            user_vector = users[idx]['vec']
            seen = users_clicked_pool[idx]

            if np.random.rand() < randomness:
                # Exploration: walk the articles in random order.
                ordered_pks = list(candidate_news.keys())
                np.random.shuffle(ordered_pks)
            else:
                # Exploitation: walk the articles by descending score.
                ordered_pks = (col2pk[col] for col in np.argsort(row)[::-1])

            for pk in ordered_pks:
                if pk not in seen:
                    candidate_pk = pk
                    break
            else:
                # Every article has already been shown to this user.
                # Skip instead of reusing a stale pick from a previous
                # user or round.
                continue

            # Remove from the pool of still-available articles.
            seen.add(candidate_pk)
            top_news = candidate_news[candidate_pk]
            flag = user_interaction(user_vector, top_news, ranked=True)

            top_col_idx = pk2col[candidate_pk]
            if flag:
                matrix[idx, top_col_idx] = 1
                history[idx]['clicked'].append(top_news)
            else:
                # Invariant: an unseen article cannot have a click yet.
                assert matrix[idx, top_col_idx] == 0
            history[idx]['shown'].append(top_news)

    return history