コード例 #1
0
def create_history(traffic_cat, question_text_df):
    """Build the history table of the filters used, with their frequency.

    Args:
        traffic_cat: traffic table
            [SessionId, answers_selected, Items_ProductId]; every entry of
            ``answers_selected`` must expose ``.keys()`` yielding the ids of
            the questions (filters) used.
        question_text_df: table to link questionId to text
            [PropertyDefinition, PropertyDefinitionId]

    Returns:
        df_history: history dataframe of filters used
            [questionId, text, frequency]. ``frequency`` is the number of
            uses of a filter divided by the total number of uses of all the
            filters that have a text equivalent.
    """
    # Local import so the block does not depend on the file's import header.
    from collections import Counter

    # Flat list of every filter used in history (one entry per use).
    list_filters_used = [
        k for t in traffic_cat["answers_selected"] for k in t.keys()
    ]
    # Count all filters in one pass instead of calling list.count() once per
    # distinct filter, which was quadratic in the history size.
    filter_counts = Counter(list_filters_used)
    unique_filters = set(list_filters_used)

    df_history = pd.DataFrame(columns=["questionId", "text", "frequency"])
    total_freq = 0
    for f in unique_filters:
        question_text = question_id_to_text(f, question_text_df)
        # Filters without a text equivalent are left out of the history.
        if question_text == 'No text equivalent for question':
            continue
        freq = filter_counts[f]
        total_freq += freq
        df_history.loc[len(df_history)] = [f, question_text, freq]
    # Turn the raw counts into relative frequencies.
    df_history["frequency"] = df_history["frequency"] / total_freq
    return df_history
コード例 #2
0
def random_baseline(product_set, traffic_set, purchased_set, question_text_df,
                    answer_text_df, threshold, y, answers_y):
    """Random baseline algorithm.

    Note:
        At each timestep one question is sampled uniformly at random among
        the remaining questions.

    Args:
        product_set: the initial set of products available (catalog)
        traffic_set: the initial traffic dataset
        purchased_set: the initial purchased dataset
        question_text_df: dataframe containing the correspondence between
            questionIds and strings.
        answer_text_df: dataframe containing the correspondence between
            answerIds and strings.
        threshold: stopping criterion; the loop ends as soon as fewer than
            ``threshold`` distinct products remain (or no question is left)
        y: target product (returned unchanged)
        answers_y: sampled answers for the target product, indexed by
            integer question id

    Returns:
        final_question_list: final list of asked questionIds
        product_set: final set of selected products
        y: target product
        final_question_text_list: final list of asked questions (string)
        answer_text_list: final list of given answers (as string)
    """
    question_set = set(product_set["PropertyDefinitionId"].values)
    final_question_list = []
    final_question_text_list = []
    answer_text_list = []
    distinct_products = len(product_set.ProductId.unique())  # faster
    while not (distinct_products < threshold or len(question_set) == 0):
        # Draw the next question; cast once so that a plain int is used
        # everywhere below (list entries, dict lookup, subset selection).
        next_question = int(
            np.random.choice(np.asarray(list(question_set)), size=1)[0])
        print("RDM: Next question is filter : {}".format(next_question))
        question_text = build_answers_utils.question_id_to_text(
            next_question, question_text_df)
        print("RDM: Question is: {}".format(question_text))
        final_question_list.append(next_question)
        final_question_text_list.append(question_text)

        answer = answers_y[next_question]
        answer_text = build_answers_utils.answer_id_to_text(
            answer, next_question, answer_text_df)
        print("RDM: Answer given was: {}".format(answer))
        print("RDM: Answer was: {}".format(answer_text))
        # Record the answer text: it was previously computed but never
        # stored, so the returned answer_text_list was always empty.
        answer_text_list.append(answer_text)

        product_set, traffic_set, purchased_set = algo_utils.select_subset(
            question=next_question,
            answer=answer,
            product_set=product_set,
            traffic_set=traffic_set,
            purchased_set=purchased_set)
        # Only questions still present in the reduced catalog and not yet
        # asked remain candidates.
        question_set_new = set(product_set["PropertyDefinitionId"].values)
        question_set = question_set_new.difference(final_question_list)
        distinct_products = len(product_set.ProductId.unique())
        print('RDM: There are still {} products to choose from'.format(
            distinct_products))
    return final_question_list, product_set, y, final_question_text_list, answer_text_list
コード例 #3
0
    def __init__(self, product_set, traffic_set, purchased_set,
                 question_text_df, answer_text_df, threshold, filters_def_dict,
                 type_filters):
        """Build the question/answer window and display the first question.

        Args:
            product_set: product table; the columns "PropertyDefinitionId",
                "answer" and "ProductId" are read here.
            traffic_set: traffic table, stored on the instance.
            purchased_set: purchase table, stored on the instance.
            question_text_df: table used to turn a question id into its text.
            answer_text_df: table used to turn answer ids into their text.
            threshold: stored as ``self.threshold``.
            filters_def_dict: dict keyed by question id; used to size the
                model input and to choose the next question.
            type_filters: stored as ``self.type_filters``.
        """
        self.use = 'dagger'
        # load the trained model if necessary
        if self.use == 'dagger':
            run = 'default'
            model_dir = '../training_dagger/{}'.format(run)
            checkpoint_model = model_dir + '/cp.ckpt'
            print('Loading the latest model from {}'.format(checkpoint_model))
            # The state length is taken from the encoding of an empty state.
            self.length_state = len(
                dagger_utils.get_onehot_state({}, filters_def_dict))
            self.number_filters = len(filters_def_dict.keys())
            self.model = create_model(self.number_filters, self.length_state)
            self.model.load_weights(checkpoint_model)
            self.state = {}

        # include all necessary data
        self.product_set = product_set
        self.traffic_set = traffic_set
        self.purchased_set = purchased_set
        self.question_text_df = question_text_df
        self.answer_text_df = answer_text_df
        self.threshold = threshold
        self.filters_def_dict = filters_def_dict
        self.type_filters = type_filters
        self.final_question_list = []
        self.question_set = set(algo_utils.get_questions(self.product_set))

        self.root = Tk()
        self.root.title("User Interface - Max_MI algo Test")
        self.mainframe = ttk.Frame(self.root)
        self.mainframe.grid(column=0, row=0, sticky=(N, W, E, S))
        self.root.columnconfigure(0, weight=2)
        self.root.rowconfigure(0, weight=1)

        # title of the interface - questions
        # NOTE: widgets are created first and placed with .grid() afterwards;
        # .grid() returns None, so chaining it onto the constructor would
        # store None in the attribute instead of the widget.
        self.title = StringVar()
        self.title.set("Question 1")
        self.titleLabel = ttk.Label(self.mainframe,
                                    textvariable=self.title,
                                    font=("Helvetica", 18))
        self.titleLabel.grid(column=2, row=1, columnspan=3, sticky=(W, E))

        # title of the question
        self.question = IntVar()

        # get first question
        if self.use == 'maxMI':
            # NOTE(review): df_history is not defined in this method; this
            # branch presumably relies on a module-level name - confirm
            # before enabling 'maxMI'.
            self.next_question = opt_step(self.question_set,
                                          self.product_set,
                                          self.traffic_set,
                                          self.purchased_set,
                                          a_hist=1,
                                          df_history=df_history)
        else:
            self.next_question = dagger_utils.dagger_one_step(
                self.model, self.state, self.number_filters,
                self.filters_def_dict)

        self.question.set(self.next_question)
        self.question_text = StringVar()
        self.question_text.set(
            question_id_to_text(self.question.get(), question_text_df))
        self.questionLabel = ttk.Label(self.mainframe,
                                       textvariable=self.question_text)
        self.questionLabel.grid(column=2, row=4, columnspan=3, sticky=(W, E))

        # multiple choice list of answers: distinct answers available in the
        # catalog for the current question
        self.answer_set = self.product_set.loc[
            self.product_set["PropertyDefinitionId"] ==
            int(self.question.get()), "answer"].drop_duplicates().values
        print("answer set: {}".format(self.answer_set))
        print("answer set: {}".format(type(self.answer_set)))
        print('int(self.question.get()): {}'.format(int(self.question.get())))
        self.text_answers = build_answers_utils.answer_id_to_text(
            self.answer_set, int(self.question.get()), self.answer_text_df)

        # Define the scroll bar for the question list
        self.yScroll = Scrollbar(self.mainframe, orient=VERTICAL)  # scroll bar
        self.yScroll.grid(row=6, column=1, sticky=N + S)

        # Define the list box
        listbox = Listbox(self.mainframe,
                          yscrollcommand=self.yScroll.set,
                          selectmode='multiple')
        for var in self.text_answers:
            listbox.insert(END, var)
        listbox.select_set(0)
        # Link the scrollbar back to the list so dragging it scrolls the
        # answers (only the listbox -> scrollbar direction was connected).
        self.yScroll.config(command=listbox.yview)
        self.answerList = listbox
        self.answerList.grid(column=2, row=6, columnspan=5, sticky=W)

        # Labels
        self.nb_product_left = len(self.product_set["ProductId"].unique())
        self.nb_question_asked = 1
        self.product_left = StringVar()
        self.product_left.set('Nb products left {}'.format(
            self.nb_product_left))
        self.question_asked = StringVar()
        self.question_asked.set('Nb question asked {}'.format(
            self.nb_question_asked))
        self.productLeftLabel = ttk.Label(self.mainframe,
                                          textvariable=self.product_left)
        self.productLeftLabel.grid(column=2,
                                   row=16,
                                   columnspan=3,
                                   sticky=(W, E))
        self.questionAskedLabel = ttk.Label(self.mainframe,
                                            textvariable=self.question_asked)
        self.questionAskedLabel.grid(column=2,
                                     row=17,
                                     columnspan=3,
                                     sticky=(W, E))

        self.final_products = StringVar()

        # Main button Next question
        self.NextButton = ttk.Button(self.mainframe,
                                     text="Next",
                                     command=self.next)
        self.NextButton.grid(column=7, row=6, sticky=W)
コード例 #4
0
    def next(self):
        """Handle a click on the "Next" button.

        What does it do?
            1. record the selected answer(s) ('idk' when nothing is selected)
               and restrict the product/traffic/purchased sets accordingly
            2. choose the next question and update its text
            3. update nb question asked and nb product left
            4. either display the final product set (threshold reached or a
               single possible answer left) or rebuild the list of answers

        Returns:
            1 when the final product set is displayed, None otherwise.
        """
        # Update answer as answer selected. If no answer given, then consider as 'idk'
        selected_indices = self.answerList.curselection()
        id_values = [self.answer_set[idx] for idx in selected_indices]
        if not id_values:
            values = ['idk']
            print("values: {}".format(id_values))
        else:
            values = id_values
        print(self.answer_set)
        print(selected_indices)
        print("values: {}".format(values))
        self.state[self.next_question] = list(values)
        print(self.state)
        print("self.question.get(): {}".format(self.question.get()))
        # Updating product_set, traffic_set, purchased_set, answer_set and question set
        self.product_set, self.traffic_set, self.purchased_set = algo_utils.select_subset(
            question=self.question.get(),
            answer=values,
            product_set=self.product_set,
            traffic_set=self.traffic_set,
            purchased_set=self.purchased_set)

        self.final_question_list.append(int(self.question.get()))
        print("Length Product set: {}".format(len(self.product_set)))
        # Remaining candidates: questions still present in the reduced
        # catalog that have not been asked yet.
        question_set_new = set(algo_utils.get_questions(self.product_set))
        print("Length Question set new: {}".format(len(question_set_new)))
        print("Length Final question list: {}".format(
            len(self.final_question_list)))
        self.question_set = question_set_new.difference(
            self.final_question_list)
        print("Question set: {}".format(self.question_set))

        # Getting next question from our algo's opt_step
        if self.use == 'maxMI':
            # NOTE(review): df_history is not defined in this method; this
            # branch presumably relies on a module-level name - confirm.
            self.next_question = opt_step(self.question_set,
                                          self.product_set,
                                          self.traffic_set,
                                          self.purchased_set,
                                          a_hist=1,
                                          df_history=df_history)
        else:
            self.next_question = dagger_utils.dagger_one_step(
                self.model, self.state, self.number_filters,
                self.filters_def_dict)

        print("Next question: {}".format(self.next_question))
        next_question_text = question_id_to_text(self.next_question,
                                                 self.question_text_df)

        # Updating number of questions asked
        self.nb_question_asked += 1
        self.question_asked.set('Nb question asked {}'.format(
            self.nb_question_asked))
        self.title.set("Question {}".format(self.nb_question_asked))

        # Updating number of products left
        self.nb_product_left = len(self.product_set["ProductId"].unique())
        self.product_left.set('Nb products left {}'.format(
            self.nb_product_left))

        # Updating question asked and question set
        self.question.set(self.next_question)
        self.question_text.set(next_question_text)

        # Getting the answers
        self.answer_set = self.product_set.loc[
            self.product_set["PropertyDefinitionId"] ==
            int(self.question.get()), "answer"].drop_duplicates().values
        self.text_answers = build_answers_utils.answer_id_to_text(
            self.answer_set, int(self.question.get()), self.answer_text_df)

        # If number of products lower than threshold, display final set of products
        print("Number products left: {}".format(self.nb_product_left))
        if (self.nb_product_left < self.threshold) or (len(
                self.text_answers) == 1):
            print("Threshold reached")
            win = Toplevel(self.root)
            win.title('Here is what we can offer you!')
            self.title.set("----   Your final Product Set   ----")
            # Create the widget first, then place it: .grid() returns None,
            # so chaining it would store None in the attribute.
            self.titleLabel = ttk.Label(win,
                                        textvariable=self.title,
                                        font=("Helvetica", 18))
            self.titleLabel.grid(column=2,
                                 row=1,
                                 columnspan=10,
                                 sticky=(W, E))
            self.final_productsLabel = ttk.Label(
                win,
                text="\n".join(map(str,
                                   self.product_set['ProductId'].unique())))
            self.final_productsLabel.grid(column=2,
                                          row=4,
                                          columnspan=3,
                                          sticky=(W, E))
            # NOTE(review): quit() is expected to be provided by the
            # enclosing class (not visible here).
            self.quit()
            return 1

        # Getting new answer list
        previous_listbox = self.answerList
        try:
            previous_listbox.selection_clear(
                0, 'end')  # clears selected answers IF user selected an answer
            print("Answers cleared")
        except Exception as err:
            # Best effort only: failing to clear the old selection must not
            # prevent the next question from being displayed.
            print("Could not clear previous answers: {}".format(err))

        listbox = Listbox(self.mainframe,
                          yscrollcommand=self.yScroll.set,
                          selectmode='multiple')
        for var in self.text_answers:
            listbox.insert(END, var)
        listbox.select_set(0)  # sets the first element
        # Let the scrollbar drive the new list of answers.
        self.yScroll.config(command=listbox.yview)
        self.answerList = listbox
        self.answerList.grid(column=2, row=6, columnspan=5, sticky=W)
        # The old list box is now covered by the new one; destroy it instead
        # of leaking one widget per question.
        previous_listbox.destroy()
        # Answer ids are kept as floats for the next selection.
        # NOTE(review): this cast fails if an answer id is not numeric -
        # confirm against the catalog's "answer" column.
        self.answer_set = self.answer_set.astype(float)
コード例 #5
0
def max_info_algorithm(product_set,
                       traffic_set,
                       purchased_set,
                       question_text_df,
                       answer_text_df,
                       threshold,
                       y,
                       answers_y,
                       a_hist=1,
                       df_history=0,
                       first_questions=None):
    """Maximum mutual information algorithm to select the best subset of questions to ask.

    Args:
        product_set: product table [ProductId, BrandId,
                                    ProductTypeId, PropertyValue,
                                    PropertyDefinitionId,
                                    PropertyDefinitionOptionId, answer]
        traffic_set: traffic table [SessionId, answers_selected,
                                    Items_ProductId]
        purchased_set: purchased table [ProductId, UserId,
                                        OrderId, SessionId,
                                        Items_ProductId, Items_ItemCount]
        question_text_df: table to link questionId to text [PropertyDefinition,
                                                            PropertyDefinitionId]
        answer_text_df: table to link answerId to text [answer_id,
                                                        question_id,
                                                        answer_text]
        threshold: max length of final set of products
        y: product selected for the algorithm (returned unchanged)
        answers_y: dict of question: np.array(answers)
        a_hist (default = 1): parameter to determine the importance of history filters
        df_history (default = 0): history table [ProductId, text, frequency]
        first_questions (default = None): optimization step, precomputed first
            questions; when None, the first 3 questions are computed here

    Returns:
        final_question_list: sequence of questionId to ask
        product_set: final product list
        y: product chosen as input of algo
        final_question_text_list:  sequence of questionText to ask
        answer_text_list: answers for each final question
    """
    question_set = set(algo_utils.get_questions(product_set))
    final_question_list = []
    final_question_text_list = []
    answer_text_list = []
    # Keep the *number* of distinct products from the start. It used to be
    # the array of ids until the first loop ran, which made the comparison
    # with `threshold` in the greedy loop ambiguous whenever the caller
    # passed an empty `first_questions`.
    distinct_products = len(product_set.ProductId.unique())
    print("There are {} questions we can ask".format(len(question_set)))
    print("There are {} possible products to choose from".format(
        distinct_products))

    # Compute the first 3 optimized questions for IDK answers (speed-up)
    if first_questions is None:
        first_questions = []
        first_question_set = question_set
        n_first_q = 3
        print("Optimization: computing first {} questions".format(n_first_q))
        for _ in range(n_first_q):
            first_question = opt_step(first_question_set, product_set,
                                      traffic_set, purchased_set, a_hist,
                                      df_history)
            first_questions.append(first_question)
            first_question_set = first_question_set.difference(
                set(first_questions))

    # Given we have the first best questions for IDK answer
    # we can use them until we receive a different answer
    for next_question in first_questions:
        print("Next question is filter : {}".format(next_question))
        question_text = question_id_to_text(next_question, question_text_df)
        print("Question is: {}".format(question_text))
        final_question_list.append(int(next_question))
        final_question_text_list.append(question_text)
        answer = answers_y.get(next_question)
        # The precomputed questions are only valid while the user keeps
        # answering "idk"; the first real answer ends this phase (after the
        # answer itself has been applied below).
        got_real_answer = not answer == ["idk"]
        answer_text = answer_id_to_text(answer, next_question, answer_text_df)
        print("Answer given was: {}".format(answer))
        print("Answer was: {}".format(answer_text))
        answer_text_list.append(answer_text)
        product_set, traffic_set, purchased_set = algo_utils.select_subset(
            question=next_question,
            answer=answer,
            product_set=product_set,
            traffic_set=traffic_set,
            purchased_set=purchased_set)
        question_set_new = set(product_set["PropertyDefinitionId"].values)
        question_set = question_set_new.difference(final_question_list)
        distinct_products = len(product_set.ProductId.unique())  # faster
        print("There are {} more questions we can ask".format(
            len(question_set)))
        print("There are {} possible products to choose from".format(
            distinct_products))
        if got_real_answer:
            break

    # Perform greedy step until the subset of products is smaller than a certain threshold
    while not (distinct_products < threshold or len(question_set) == 0):
        next_question = opt_step(question_set, product_set, traffic_set,
                                 purchased_set, a_hist, df_history)
        print("Next question is filter : {}".format(next_question))
        question_text = question_id_to_text(next_question, question_text_df)
        print("Question is: {}".format(question_text))
        final_question_list.append(int(next_question))
        final_question_text_list.append(question_text)
        answer = answers_y.get(next_question)
        answer_text = answer_id_to_text(answer, next_question, answer_text_df)
        print("Answer given was: {}".format(answer))
        print("Answer was: {}".format(answer_text))
        answer_text_list.append(answer_text)
        product_set, traffic_set, purchased_set = algo_utils.select_subset(
            question=next_question,
            answer=answer,
            product_set=product_set,
            traffic_set=traffic_set,
            purchased_set=purchased_set)
        question_set_new = set(product_set["PropertyDefinitionId"].values)
        question_set = question_set_new.difference(final_question_list)
        distinct_products = len(product_set.ProductId.unique())  # faster
        print("There are {} more questions we can ask".format(
            len(question_set)))
        print("There are {} possible products to choose from".format(
            distinct_products))
    return final_question_list, product_set, y, final_question_text_list, answer_text_list
コード例 #6
0
def dagger_get_questions(y, answers_y, model, question_text_df, answer_text_df,
                         filters_def_dict, products_cat, number_filters,
                         threshold=50):
    """Return the list of questions for one sampled user with one trained
    instance of dagger.

    Note:
        The model has to be trained and initialized first.

    Args:
        y: target productID for the sampled user
        answers_y: sampled answers for this product.
        model: trained model
        question_text_df: table to link questionId to text
                         [PropertyDefinition, PropertyDefinitionId]
        answer_text_df: table to link answerId to text
                        [answer_id, question_id, answer_text]
        filters_def_dict: dict where key is questionId value
                          is array of all possible (modified) answers
        products_cat: extract of product catalog for category 6
        number_filters: number of available questions
        threshold (default = 50): stop asking as soon as fewer than
            ``threshold`` distinct products remain

    Returns:
        final_question_list: sequence of questionId to ask
        product_set: final product list
        y: product chosen as input of algo
        final_question_text_list:  sequence of questionText to ask
        answer_text_list: answers for each final question
    """
    final_question_list = []
    final_question_text_list = []
    answer_text_list = []
    # The model's output index i corresponds to the i-th question id in
    # sorted order; the mapping does not change during the loop.
    sorted_question_ids = sorted(filters_def_dict.keys())
    # Initial state
    state = {}
    # Defined up front so the function can still return if the loop stops
    # before any question is asked.
    # NOTE(review): assumes the unfiltered catalog is the right product set
    # for an empty state - confirm against get_products.
    product_set = products_cat
    # Loop until # products in products set < threshold
    while True:
        # Get list of questions already asked
        question_asked = state.keys()
        # Convert to one-hot
        one_ind_questions_asked = get_index_question(question_asked,
                                                     filters_def_dict)
        # Create the mask before the softmax layer (cannot ask twice the same question)
        mask = np.ones(number_filters)
        for q in one_ind_questions_asked:
            # If question was already asked, set corresponding mask value to 0
            mask[q] = 0
        # Get one hot state encoding
        onehot_state = get_onehot_state(state, filters_def_dict)
        onehot_state = np.reshape(onehot_state, (1, -1))
        mask = np.reshape(mask, (1, -1))
        # Get predicted question from model for current state
        # Predict the one-hot label
        probas = model.predict({
            'main_input': onehot_state,
            'mask_input': mask
        })[0]
        # if all the questions have already been asked
        # i.e. all have proba 0 then break
        # even if not reached threshold
        if np.sum(probas) == 0:
            break
        onehot_prediction = np.argmax(probas)
        # Get the id of the predicted next question
        q_pred = sorted_question_ids[onehot_prediction]
        question_text = question_id_to_text(q_pred, question_text_df)
        final_question_list.append(int(float(q_pred)))
        final_question_text_list.append(question_text)
        print("DAGGER: Question is: {}".format(question_text))
        # Update (answer) state according to that prediction
        answers_to_pred = answers_y.get(float(q_pred))
        answer_text = answer_id_to_text(answers_to_pred, q_pred,
                                        answer_text_df)
        print("DAGGER: Answer given was: id:{} text: {}".format(
            answers_to_pred, answer_text))
        answer_text_list.append(answer_text)
        state[q_pred] = list(answers_to_pred)
        product_set, _, _ = get_products(state, products_cat, [], [])
        if len(np.unique(product_set['ProductId'])) < threshold:
            break
    print('DAGGER: Return {} products.'.format(
        len(np.unique(product_set['ProductId']))))
    return final_question_list, product_set, y, final_question_text_list, answer_text_list