def create_history(traffic_cat, question_text_df):
    """Create the history dataframe of filters used, with their frequency.

    Args:
        traffic_cat: traffic table [SessionId answers_selected Items_ProductId].
            Every entry of the "answers_selected" column must expose
            ``keys()``; those keys are the filter (question) ids that were
            used.
        question_text_df: table to link questionId to text
            [PropertyDefinition PropertyDefinitionId]

    Returns:
        df_history: history dataframe of filters used
            [questionId text frequency]. ``frequency`` is the number of
            times the filter was used divided by the total number of uses
            of all filters that have a text equivalent. Filters without a
            text equivalent are left out.
    """
    from collections import Counter

    # Compute the list of all the filters used in history
    list_filters_used = [
        k for t in traffic_cat["answers_selected"] for k in t.keys()
    ]
    # One pass over the history instead of one list.count() per filter.
    usage_counts = Counter(list_filters_used)

    rows = []
    total_freq = 0
    for f in set(list_filters_used):
        question_text = question_id_to_text(f, question_text_df)
        if question_text != 'No text equivalent for question':
            freq = usage_counts[f]
            total_freq += freq
            rows.append([f, question_text, freq])

    # Build the frame once rather than growing it row by row.
    df_history = pd.DataFrame(rows,
                              columns=["questionId", "text", "frequency"])
    # Nothing to normalise (and nothing to divide by) when no filter was kept.
    if total_freq:
        df_history["frequency"] = df_history["frequency"] / total_freq
    return df_history
def random_baseline(product_set, traffic_set, purchased_set, question_text_df,
                    answer_text_df, threshold, y, answers_y):
    """Random baseline algorithm.

    Note:
        At each timestep sample randomly one question among the remaining
        questions.

    Args:
        product_set: the initial set product available (catalog)
        traffic_set: the initial traffic dataset
        purchased_set: the initial purchased dataset
        question_text_df: dataframe containing the correspondence between
            questionIds and string.
        answer_text_df: dataframe containing the correspondence between
            answerIds and string.
        threshold: stopping criteria of the algorithm; the loop stops as soon
            as fewer than ``threshold`` distinct products remain (or no
            question is left).
        y: target product
        answers_y: sampled answers for target product, indexed by the
            integer question id.

    Returns:
        final_question_list: final list of asked questionsIds
        product_set: final set of selected products
        y: target product
        final_question_text_list: final list of asked question (string)
        answer_text_list: final list of given answers (as string)
    """
    question_set = set(product_set["PropertyDefinitionId"].values)
    final_question_list = []
    final_question_text_list = []
    answer_text_list = []

    distinct_products = len(product_set.ProductId.unique())  # faster
    while not (distinct_products < threshold or len(question_set) == 0):
        next_question = np.random.choice(np.asarray(list(question_set)),
                                         size=1)[0]
        next_question = int(next_question)
        print("RDM: Next question is filter : {}".format(next_question))
        question_text = build_answers_utils.question_id_to_text(
            next_question, question_text_df)
        print("RDM: Question is: {}".format(question_text))
        final_question_list.append(next_question)
        final_question_text_list.append(question_text)

        answer = answers_y[next_question]
        answer_text = build_answers_utils.answer_id_to_text(
            answer, next_question, answer_text_df)
        print("RDM: Answer given was: {}".format(answer))
        print("RDM: Answer was: {}".format(answer_text))
        # Record the answer text; without this the returned
        # answer_text_list was always empty.
        answer_text_list.append(answer_text)

        product_set, traffic_set, purchased_set = algo_utils.select_subset(
            question=next_question,
            answer=answer,
            product_set=product_set,
            traffic_set=traffic_set,
            purchased_set=purchased_set)

        # Only questions still present in the reduced catalog and not yet
        # asked remain candidates.
        question_set_new = set(product_set["PropertyDefinitionId"].values)
        question_set = question_set_new.difference(final_question_list)
        distinct_products = len(product_set.ProductId.unique())
        print('RDM: There are still {} products to choose from'.format(
            distinct_products))
    return (final_question_list, product_set, y, final_question_text_list,
            answer_text_list)
def __init__(self, product_set, traffic_set, purchased_set, question_text_df,
             answer_text_df, threshold, filters_def_dict, type_filters):
    """Build the question/answer window and display the first question.

    Args:
        product_set: product table; the columns "PropertyDefinitionId",
            "answer" and "ProductId" are read here.
        traffic_set: traffic table, stored for later subset selection.
        purchased_set: purchased table, stored for later subset selection.
        question_text_df: table to link questionId to text.
        answer_text_df: table to link answerId to text.
        threshold: stored as ``self.threshold`` (not read in this method).
        filters_def_dict: dict keyed by questionId; used to size the model
            and to encode the state.
        type_filters: stored as ``self.type_filters`` (not read in this
            method).
    """
    self.use = 'dagger'
    # load the trained model if necessary
    if self.use == 'dagger':
        run = 'default'
        model_dir = '../training_dagger/{}'.format(run)
        checkpoint_model = model_dir + '/cp.ckpt'
        print('Loading the latest model from {}'.format(checkpoint_model))
        self.length_state = len(
            dagger_utils.get_onehot_state({}, filters_def_dict))
        self.number_filters = len(filters_def_dict.keys())
        self.model = create_model(self.number_filters, self.length_state)
        self.model.load_weights(checkpoint_model)
    # Answers given so far, keyed by question id.
    self.state = {}

    # include all necessary data
    self.product_set = product_set
    self.traffic_set = traffic_set
    self.purchased_set = purchased_set
    self.question_text_df = question_text_df
    self.answer_text_df = answer_text_df
    self.threshold = threshold
    self.filters_def_dict = filters_def_dict
    self.type_filters = type_filters
    self.final_question_list = []
    self.question_set = set(algo_utils.get_questions(self.product_set))

    self.root = Tk()
    self.root.title("User Interface - Max_MI algo Test")
    self.mainframe = ttk.Frame(self.root)
    self.mainframe.grid(column=0, row=0, sticky=(N, W, E, S))
    self.root.columnconfigure(0, weight=2)
    self.root.rowconfigure(0, weight=1)

    # title of the interface - questions
    # NOTE: grid() returns None, so each widget is created first and laid
    # out in a second statement; otherwise the attribute would hold None.
    self.title = StringVar()
    self.title.set("Question 1")
    self.titleLabel = ttk.Label(self.mainframe,
                                textvariable=self.title,
                                font=("Helvetica", 18))
    self.titleLabel.grid(column=2, row=1, columnspan=3, sticky=(W, E))

    # title of the question
    self.question = IntVar()
    # get first question
    if self.use == 'maxMI':
        # NOTE(review): df_history is not defined in this method; presumably
        # a module-level name - confirm before enabling this branch.
        self.next_question = opt_step(self.question_set,
                                      self.product_set,
                                      self.traffic_set,
                                      self.purchased_set,
                                      a_hist=1,
                                      df_history=df_history)
    else:
        self.next_question = dagger_utils.dagger_one_step(
            self.model, self.state, self.number_filters,
            self.filters_def_dict)
    self.question.set(self.next_question)
    self.question_text = StringVar()
    self.question_text.set(
        question_id_to_text(self.question.get(), question_text_df))
    self.questionLabel = ttk.Label(self.mainframe,
                                   textvariable=self.question_text)
    self.questionLabel.grid(column=2, row=4, columnspan=3, sticky=(W, E))

    # multiple choice list of answers
    self.answer_set = self.product_set.loc[
        self.product_set["PropertyDefinitionId"] == int(self.question.get(
        )), "answer"].drop_duplicates().values
    print("answer set: {}".format(self.answer_set))
    print("answer set: {}".format(type(self.answer_set)))
    print('int(self.question.get()): {}'.format(int(self.question.get())))
    self.text_answers = build_answers_utils.answer_id_to_text(
        self.answer_set, int(self.question.get()), self.answer_text_df)

    # Define the scroll bar for the question list
    self.yScroll = Scrollbar(self.mainframe, orient=VERTICAL)  # scroll bar
    self.yScroll.grid(row=6, column=1, sticky=N + S)
    # Define the list box
    listbox = Listbox(self.mainframe,
                      yscrollcommand=self.yScroll.set,
                      selectmode='multiple')
    for var in self.text_answers:
        listbox.insert(END, var)
    listbox.select_set(0)
    # Without a command the scrollbar only mirrors the list position and
    # cannot drive it.
    self.yScroll.config(command=listbox.yview)
    self.answerList = listbox
    self.answerList.grid(column=2, row=6, columnspan=5, sticky=W)

    # Labels
    self.nb_product_left = len(self.product_set["ProductId"].unique())
    self.nb_question_asked = 1
    self.product_left = StringVar()
    self.product_left.set('Nb products left {}'.format(
        self.nb_product_left))
    self.question_asked = StringVar()
    self.question_asked.set('Nb question asked {}'.format(
        self.nb_question_asked))
    self.productLeftLabel = ttk.Label(self.mainframe,
                                      textvariable=self.product_left)
    self.productLeftLabel.grid(column=2,
                               row=16,
                               columnspan=3,
                               sticky=(W, E))
    self.questionAskedLabel = ttk.Label(self.mainframe,
                                        textvariable=self.question_asked)
    self.questionAskedLabel.grid(column=2,
                                 row=17,
                                 columnspan=3,
                                 sticky=(W, E))
    self.final_products = StringVar()

    # Main button Next question
    self.NextButton = ttk.Button(self.mainframe,
                                 text="Next",
                                 command=self.next)
    self.NextButton.grid(column=7, row=6, sticky=W)
def next(self):
    """Handle a press on the "Next" button.

    What does it do?
        1. record the selected answer(s) ('idk' when nothing is selected)
           and restrict the product/traffic/purchased sets accordingly
        2. modify the text of the question
        3. modify the list of the answers
        4. update nb product left
        5. update nb question asked

    Returns:
        1 when the final product window is shown (fewer products than
        ``self.threshold`` remain, or the next question has a single
        possible answer); None otherwise.
    """
    from tkinter import TclError

    # Update answer as answer selected. If no answer given, then consider as 'idk'
    id_values = [
        self.answer_set[idx] for idx in self.answerList.curselection()
    ]
    if not id_values:
        values = ['idk']
        print("values: {}".format(id_values))
    else:
        values = id_values
        print(self.answer_set)
        print(self.answerList.curselection())
        print("values: {}".format(values))
    self.state[self.next_question] = list(values)
    print(self.state)
    print("self.question.get(): {}".format(self.question.get()))

    # Updating product_set, traffic_set, purchased_set, answer_set and question set
    self.product_set, self.traffic_set, self.purchased_set = algo_utils.select_subset(
        question=self.question.get(),
        answer=values,
        product_set=self.product_set,
        traffic_set=self.traffic_set,
        purchased_set=self.purchased_set)
    self.question_set = set(algo_utils.get_questions(self.product_set))
    self.final_question_list.append(int(self.question.get()))
    print("Length Product set: {}".format(len(self.product_set)))
    question_set_new = set(algo_utils.get_questions(self.product_set))
    print("Length Question set new: {}".format(len(question_set_new)))
    print("Length Final question list: {}".format(
        len(self.final_question_list)))
    # Questions already asked are no longer candidates.
    self.question_set = question_set_new.difference(
        self.final_question_list)
    print("Question set: {}".format(self.question_set))

    # Getting next question from our algo's opt_step
    if self.use == 'maxMI':
        # NOTE(review): df_history is not defined in this method; presumably
        # a module-level name - confirm before enabling this branch.
        self.next_question = opt_step(self.question_set,
                                      self.product_set,
                                      self.traffic_set,
                                      self.purchased_set,
                                      a_hist=1,
                                      df_history=df_history)
    else:
        self.next_question = dagger_utils.dagger_one_step(
            self.model, self.state, self.number_filters,
            self.filters_def_dict)
    print("Next question: {}".format(self.next_question))
    next_question_text = question_id_to_text(self.next_question,
                                             self.question_text_df)

    # Updating number of questions asked
    self.nb_question_asked += 1
    self.question_asked.set('Nb question asked {}'.format(
        self.nb_question_asked))
    self.title.set("Question {}".format(self.nb_question_asked))

    # Updating number of products left
    self.nb_product_left = len(self.product_set["ProductId"].unique())
    self.product_left.set('Nb products left {}'.format(
        self.nb_product_left))

    # Updating question asked and question set
    self.question.set(self.next_question)
    self.question_text.set(next_question_text)

    # Getting the answers
    self.answer_set = self.product_set.loc[
        self.product_set["PropertyDefinitionId"] == int(self.question.get(
        )), "answer"].drop_duplicates().values
    self.text_answers = build_answers_utils.answer_id_to_text(
        self.answer_set, int(self.question.get()), self.answer_text_df)

    # If number of products lower than threshold, display final set of products
    print("Number products left: {}".format(
        len(self.product_set["ProductId"].drop_duplicates())))
    if (len(self.product_set["ProductId"].unique()) <
            self.threshold) or (len(self.text_answers) == 1):
        print("Threshold reached")
        win = Toplevel(self.root)
        win.title('Here is what we can offer you!')
        self.title.set("---- Your final Product Set ----")
        # grid() returns None, so keep the widget and lay it out separately.
        self.titleLabel = ttk.Label(win,
                                    textvariable=self.title,
                                    font=("Helvetica", 18))
        self.titleLabel.grid(column=2, row=1, columnspan=10, sticky=(W, E))
        self.final_productsLabel = ttk.Label(
            win,
            text="\n".join(map(str, self.product_set['ProductId'].unique())))
        self.final_productsLabel.grid(column=2,
                                      row=4,
                                      columnspan=3,
                                      sticky=(W, E))
        self.quit()
        return 1

    # Getting new answer list
    previous_listbox = self.answerList
    try:
        # clears selected answers IF user selected an answer
        previous_listbox.selection_clear(0, 'end')
        print("Answers cleared")
    except TclError:
        # Best effort only: the old list is discarded just below.
        pass
    listbox = Listbox(self.mainframe,
                      yscrollcommand=self.yScroll.set,
                      selectmode='multiple')
    for var in self.text_answers:
        listbox.insert(END, var)
    listbox.select_set(0)  # sets the first element
    # Point the scrollbar at the list that is now displayed.
    self.yScroll.config(command=listbox.yview)
    self.answerList = listbox
    self.answerList.grid(column=2, row=6, columnspan=5, sticky=W)
    # The replaced list would otherwise stay alive underneath the new one,
    # accumulating one hidden widget per question.
    previous_listbox.destroy()

    # Answer ids are kept as floats for the next call's lookup by
    # selection index.
    self.answer_set = self.product_set.loc[
        self.product_set["PropertyDefinitionId"] == int(
            float(self.question.get())),
        "answer"].drop_duplicates().values.astype(float)
def max_info_algorithm(product_set,
                       traffic_set,
                       purchased_set,
                       question_text_df,
                       answer_text_df,
                       threshold,
                       y,
                       answers_y,
                       a_hist=1,
                       df_history=0,
                       first_questions=None):
    """Maximal mutual information algorithm to select the questions to ask.

    Args:
        product_set: product table [ProductId, BrandId, ProductTypeId,
            PropertyValue, PropertyDefinitionId,
            PropertyDefinitionOptionId, answer]
        traffic_set: traffic table [SessionId, answers_selected,
            Items_ProductId]
        purchased_set: purchased table [ProductId, UserId, OrderId,
            SessionId, Items_ProductId, Items_ItemCount]
        question_text_df: table to link questionId to text
            [PropertyDefinition, PropertyDefinitionId]
        answer_text_df: table to link answerId to text
            [answer_id, question_id, answer_text]
        threshold: max length of final set of products
        y: product selected for the algorithm
        answers_y: dict of question: np.array(answers)
        a_hist (default = 1): parameter to determine the importance of
            history filters; passed through to ``opt_step``
        df_history (default = 0): history table; passed through to
            ``opt_step``
        first_questions (default = None): optimization step, precomputed
            first questions; three are computed here when None is given

    Returns:
        final_question_list: sequence of questionId to ask
        product_set: final product list
        y: product chosen as input of algo
        final_question_text_list: sequence of questionText to ask
        answer_text_list: answers for each final question
    """
    question_set = set(algo_utils.get_questions(product_set))
    final_question_list = []
    final_question_text_list = []
    answer_text_list = []
    # Keep this an int from the start: the greedy loop below compares it to
    # the threshold even when no precomputed question was asked
    # (e.g. first_questions=[]).
    distinct_products = len(product_set.ProductId.unique())
    print("There are {} questions we can ask".format(len(question_set)))
    print("There are {} possible products to choose from".format(
        distinct_products))

    # Compute the first 3 optimized questions for IDK answers (speed-up)
    if first_questions is None:
        first_questions = []
        first_question_set = question_set
        n_first_q = 3
        print("Optimization: computing first {} questions".format(n_first_q))
        for _ in range(n_first_q):
            first_question = opt_step(first_question_set, product_set,
                                      traffic_set, purchased_set, a_hist,
                                      df_history)
            first_questions.append(first_question)
            first_question_set = first_question_set.difference(
                set(first_questions))

    # Given we have the first 3 best questions for IDK answer
    # we can use them until we receive a different answer
    n_first_q = len(first_questions)
    idk = True
    i = 0
    while (idk and i < n_first_q):
        next_question = first_questions[i]
        i += 1
        print("Next question is filter : {}".format(next_question))
        question_text = question_id_to_text(next_question, question_text_df)
        print("Question is: {}".format(question_text))
        final_question_list.append(int(next_question))
        final_question_text_list.append(question_text)
        answer = answers_y.get(next_question)
        # The precomputed questions are only valid while every answer so
        # far was "idk".
        if not answer == ["idk"]:
            idk = False
        answer_text = answer_id_to_text(answer, next_question,
                                        answer_text_df)
        print("Answer given was: {}".format(answer))
        print("Answer was: {}".format(answer_text))
        answer_text_list.append(answer_text)
        product_set, traffic_set, purchased_set = algo_utils.select_subset(
            question=next_question,
            answer=answer,
            product_set=product_set,
            traffic_set=traffic_set,
            purchased_set=purchased_set)
        question_set_new = set(product_set["PropertyDefinitionId"].values)
        question_set = question_set_new.difference(final_question_list)
        distinct_products = len(product_set.ProductId.unique())  # faster
        print("There are {} more questions we can ask".format(
            len(question_set)))
        print("There are {} possible products to choose from".format(
            distinct_products))

    # Perform greedy step until the subset of products is smaller than a certain threshold
    while not (distinct_products < threshold or len(question_set) == 0):
        next_question = opt_step(question_set, product_set, traffic_set,
                                 purchased_set, a_hist, df_history)
        print("Next question is filter : {}".format(next_question))
        question_text = question_id_to_text(next_question, question_text_df)
        print("Question is: {}".format(question_text))
        final_question_list.append(int(next_question))
        final_question_text_list.append(question_text)
        answer = answers_y.get(next_question)
        answer_text = answer_id_to_text(answer, next_question,
                                        answer_text_df)
        print("Answer given was: {}".format(answer))
        print("Answer was: {}".format(answer_text))
        answer_text_list.append(answer_text)
        product_set, traffic_set, purchased_set = algo_utils.select_subset(
            question=next_question,
            answer=answer,
            product_set=product_set,
            traffic_set=traffic_set,
            purchased_set=purchased_set)
        question_set_new = set(product_set["PropertyDefinitionId"].values)
        question_set = question_set_new.difference(final_question_list)
        distinct_products = len(product_set.ProductId.unique())  # faster
        print("There are {} more questions we can ask".format(
            len(question_set)))
        print("There are {} possible products to choose from".format(
            distinct_products))
    return (final_question_list, product_set, y, final_question_text_list,
            answer_text_list)
def dagger_get_questions(y,
                         answers_y,
                         model,
                         question_text_df,
                         answer_text_df,
                         filters_def_dict,
                         products_cat,
                         number_filters,
                         threshold=50):
    """Return the list of questions for one sampled user with a trained
    instance of dagger.

    Note:
        You have to first train the model and initialize it.

    Args:
        y: target productID for the sampled user
        answers_y: sampled answers for this product, looked up with
            ``float(questionId)``.
        model: trained model; called through ``model.predict`` with the
            inputs 'main_input' and 'mask_input'
        question_text_df: table to link questionId to text
            [PropertyDefinition, PropertyDefinitionId]
        answer_text_df: table to link answerId to text
            [answer_id, question_id, answer_text]
        filters_def_dict: dict where key is questionId, value is array of
            all possible (modified) answers
        products_cat: extract of product catalog for category 6
        number_filters: number of available questions
        threshold (default = 50): stop asking as soon as fewer than this
            many distinct products remain

    Returns:
        final_question_list: sequence of questionId to ask
        product_set: final product list
        y: product chosen as input of algo
        final_question_text_list: sequence of questionText to ask
        answer_text_list: answers for each final question
    """
    final_question_list = []
    final_question_text_list = []
    answer_text_list = []
    # Initial state
    state = {}
    # Defined up front so the final print/return cannot hit an unbound name
    # when the model offers no question on the very first iteration.
    # NOTE(review): assumes the unfiltered catalog is the right "no answer
    # yet" product set - confirm against get_products.
    product_set = products_cat
    # The model's output index maps onto the sorted question ids; the dict
    # is not modified in the loop, so sort once.
    sorted_questions = sorted(filters_def_dict.keys())

    # Loop until # products in products set < threshold
    while True:
        # Get list of questions already asked
        question_asked = state.keys()
        # Convert to one-hot
        one_ind_questions_asked = get_index_question(question_asked,
                                                     filters_def_dict)
        # Create the mask before the softmax layer (cannot ask twice the same question)
        mask = np.ones(number_filters)
        for q in one_ind_questions_asked:
            # If question was already asked, set corresponding mask value to 0
            mask[q] = 0
        # Get one hot state encoding
        onehot_state = get_onehot_state(state, filters_def_dict)
        onehot_state = np.reshape(onehot_state, (1, -1))
        mask = np.reshape(mask, (1, -1))
        # Get predicted question from model for current state
        # Predict the one-hot label
        probas = model.predict({
            'main_input': onehot_state,
            'mask_input': mask
        })[0]
        # if all the questions have already been asked
        # i.e. all have proba 0 then break
        # even if not reached threshold
        if np.sum(probas) == 0:
            break
        onehot_prediction = np.argmax(probas)
        # Get the number of predicted next question
        q_pred = sorted_questions[onehot_prediction]
        question_text = question_id_to_text(q_pred, question_text_df)
        final_question_list.append(int(float(q_pred)))
        final_question_text_list.append(question_text)
        print("DAGGER: Question is: {}".format(question_text))
        # Update (answer) state according to that prediction
        answers_to_pred = answers_y.get(float(q_pred))
        answer_text = answer_id_to_text(answers_to_pred, q_pred,
                                        answer_text_df)
        print("DAGGER: Answer given was: id:{} text: {}".format(
            answers_to_pred, answer_text))
        answer_text_list.append(answer_text)
        state[q_pred] = list(answers_to_pred)
        product_set, _, _ = get_products(state, products_cat, [], [])
        if len(np.unique(product_set['ProductId'])) < threshold:
            break
    print('DAGGER: Return {} products.'.format(
        len(np.unique(product_set['ProductId']))))
    return (final_question_list, product_set, y, final_question_text_list,
            answer_text_list)