Example #1
def sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c):

    distance_list = [] #list of tuples, (p_id, q_id, distance), sorted on tpl[2]

    for quest_id in quests_ids:
        question_representation = MyUtils_dbs.search_in_alltables_db(testquests_rep_c, "SELECT * FROM ",
                                                             "WHERE id = '" + str(quest_id) + "'")[0]
        logging.debug("Question representation: %s", question_representation)
        question_tuple = MyUtils.quest_lstonamedtuple(question_representation, offset=1)
        pq_dist = CD.compute_dist_pq(product_tuple, question_tuple)
        distance_list.append((product_tuple.id, quest_id, pq_dist))

    distance_list_sorted = sorted(distance_list, key=lambda tpl : tpl[2])
    return distance_list_sorted
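A minimal call sketch for sort_product_candidates, assuming the project's MyUtils and MyUtils_dbs helpers are importable; the database file names and the product/question ids below are hypothetical placeholders.

import sqlite3

# Hypothetical database paths and ids, for illustration only.
testprods_rep_c = sqlite3.connect("testprods_representations.db").cursor()
testquests_rep_c = sqlite3.connect("testquests_representations.db").cursor()

# Build the product named tuple from its stored representation, as done elsewhere in the project.
product_row = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                 "WHERE id = '" + "B00EXAMPLE" + "'")[0]
product_tuple = MyUtils.prodls_tonamedtuple(product_row, offset=1)

ranked = sort_product_candidates(product_tuple,
                                 quests_ids=["B00EXAMPLE@1450000000.0@0"],
                                 testprods_rep_c=testprods_rep_c,
                                 testquests_rep_c=testquests_rep_c)
print(ranked[:5])  # the closest (product id, question id, distance) triples come first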
Example #2
def get_instance_encoded_dictionary(prod_id, question_id, ps_db_c, qs_db_c,
                                    d2v_model):

    product_row = MyUtils_dbs.search_in_alltables_db(
        ps_db_c, "SELECT * FROM ", "WHERE id = '" + prod_id + "'")
    question_row = MyUtils_dbs.search_in_alltables_db(
        qs_db_c, "SELECT * FROM ", "WHERE id = '" + str(question_id) + "'")
    prod_tuple = MyUtils.prodls_tonamedtuple(product_row[0])
    q_tuple = MyUtils.quest_lstonamedtuple(question_row[0])

    instance_x = {}
    instance_x["p_descvec"] = MyUtils_strings.fromstring_toarray(
        prod_tuple.descvec)
    instance_x["p_titlevec"] = MyUtils_strings.fromstring_toarray(
        prod_tuple.titlevec)
    instance_x["p_kwsVectors"] = MyUtils_strings.fromlls_toarrays(
        prod_tuple.kwsVectors)
    #logging.debug("instance_x['p_kwsVectors'].shape : %s", np.array(instance_x["p_kwsVectors"]).shape)
    instance_x["p_mdcategories"] = MyUtils_strings.categories_to_vecs_lls(
        MyUtils_strings.fromlls_toarrays(prod_tuple.mdcategories), d2v_model)
    if len(np.array(instance_x["p_mdcategories"]).shape) >= 3:
        logging.debug("instance_x['p_mdcategories'].shape : %s",
                      np.array(instance_x["p_mdcategories"]).shape)
        instance_x["p_mdcategories"] = instance_x["p_mdcategories"][0]

    instance_x["q_questionVec"] = MyUtils_strings.fromstring_toarray(
        q_tuple.questionVec)
    instance_x["q_questionType"] = q_tuple.questionType
    instance_x["q_kwsVectors"] = MyUtils_strings.fromlls_toarrays(
        q_tuple.kwsVectors)

    #label: 1 if the question refers to this product (a question id begins with the product id), else 0
    instance_y = 1 if q_tuple.id[0:10] in prod_id else 0
    instance = namedtuple('instance', 'x y')
    inst = instance(x=instance_x, y=instance_y)

    return inst
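A usage sketch for get_instance_encoded_dictionary, assuming the numeric-encoding databases already exist and that d2v_model is a loaded doc2vec-style model (here gensim's Doc2Vec, which is an assumption); all paths and ids are placeholders.

import sqlite3
from gensim.models import Doc2Vec  # assumption: d2v_model is a gensim Doc2Vec model

# Hypothetical paths and ids, for illustration only.
ps_db_c = sqlite3.connect("prods_numencoding_test.db").cursor()
qs_db_c = sqlite3.connect("quests_numencoding_test.db").cursor()
d2v_model = Doc2Vec.load("categories_doc2vec.model")

inst = get_instance_encoded_dictionary(prod_id="B00EXAMPLE",
                                       question_id="B00EXAMPLE@1450000000.0@0",
                                       ps_db_c=ps_db_c, qs_db_c=qs_db_c,
                                       d2v_model=d2v_model)
print(inst.y)               # expected 1, since the question id starts with the product id
print(list(inst.x.keys()))  # the encoded feature vectors of the (product, question) pair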
Example #3
def register_matches(product_featureflags, quest_featureflags, dataset_type,
                     use_existing_file):
    allmatches_filepath = F.PRODSWITHQUESTS_IDS_ALL + dataset_type
    if use_existing_file:
        if os.path.exists(allmatches_filepath):
            if os.path.getsize(allmatches_filepath) > 0:
                logging.info(
                    "The P-Q matches for the requested dataset were already found. They are located in the file: %s",
                    allmatches_filepath)
                last_prod_id = "x"
                allmatches_file = open(file=allmatches_filepath,
                                       mode="r",
                                       newline='')
                reader = csv.reader(allmatches_file,
                                    delimiter='_',
                                    quotechar='"')
                next(reader)  #skip header
                count_ps_withmatches = 0
                for p_ls in reader:
                    prod_id = p_ls[0]
                    if prod_id != last_prod_id:
                        count_ps_withmatches += 1
                        last_prod_id = prod_id
                allmatches_file.close()
                return count_ps_withmatches

    if dataset_type == MyUtils_flags.FLAG_VALID:
        ps_db_filepath = F.PRODS_NUMENCODING_DB_VALID
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_VALID
        prods_filepath = F.PRODUCTS_FINAL_VALID
        quests_filepath = F.QUESTIONS_FINAL_VALID
    elif dataset_type == MyUtils_flags.FLAG_TEST:
        ps_db_filepath = F.PRODS_NUMENCODING_DB_TEST
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_TEST
        prods_filepath = F.PRODUCTS_FINAL_TEST
        quests_filepath = F.QUESTIONS_FINAL_TEST
    else:  #"train"
        ps_db_filepath = F.PRODS_NUMENCODING_DB_TRAIN
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_TRAIN
        prods_filepath = F.PRODUCTS_FINAL_TRAIN
        quests_filepath = F.QUESTIONS_FINAL_TRAIN

    MyUtils.init_logging("RegisterMatches.log", logging.INFO)
    start = time()
    #opening in "w" mode truncates the output file between runs
    ids_outfile = open(F.PRODSWITHQUESTS_IDS, "w")
    ids_outfile.write("id_questionsAsked\n")

    #connecting with the products, to filter them, based on the features we chose to include
    ps_db_conn = sqlite3.connect(ps_db_filepath)
    ps_db_cursor = ps_db_conn.cursor()
    # connecting with the questions, to filter them, based on the features we chose to include
    qs_db_conn = sqlite3.connect(qs_db_filepath)
    qs_db_cursor = qs_db_conn.cursor()

    prods_filehandler = open(prods_filepath, "r", newline='')
    quests_filehandler = open(quests_filepath, "r", newline='')
    reader_1 = csv.reader(prods_filehandler, delimiter='_', quotechar='"')
    reader_2 = csv.reader(quests_filehandler, delimiter='_', quotechar='"')

    num_prods_withmatches = 0
    num_products_reviewed = 0
    num_questions_reviewed = 0
    last_prod_id = "x"
    questionsasked_ids_ls = []
    ### init: skip the header lines in both files, then read the first data rows
    next(reader_1)
    next(reader_2)
    next(reader_1)
    next(reader_2)
    p_ls = next(reader_1)
    q_ls = next(reader_2)
    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
    q_prod = (quest_t.id)[0:10]
    #loop:
    while True:
        try:
            match = False
            while not match:
                while q_prod > prod_t.id or (len(q_prod) > len(prod_t.id)):
                    logging.debug("%s < %s", prod_t.id, q_prod)
                    p_ls = next(reader_1)  #advance product
                    num_products_reviewed += 1
                    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)

                while q_prod < prod_t.id or (len(q_prod) < len(prod_t.id)):
                    logging.debug("%s > %s", prod_t.id, q_prod)
                    q_ls = next(reader_2)  #advance question
                    num_questions_reviewed += 1
                    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]

                if q_prod == prod_t.id:
                    match = True
                    #barrier: feature filtering on products and questions; DB lookup:
                    if featurefilter_prod(prod_t.id, product_featureflags, ps_db_cursor) == True and \
                       featurefilter_quest(quest_t.id, quest_featureflags, qs_db_cursor) == True:
                        logging.info("Match: product: %s , \t question: %s",
                                     prod_t.id, quest_t.id)
                        #positive_qs_ids_file.write(str(quest_t.id) + "\n")#store the question id (positive example)
                        if len(prod_t.id) > 5:
                            if prod_t.id != last_prod_id:
                                if len(last_prod_id) > 5:
                                    #write out the previous product and its questions
                                    ids_outfile.write(str(last_prod_id) + "_" +
                                                      str(questionsasked_ids_ls) + "\n")
                                questionsasked_ids_ls = []  #reset, and then append
                                questionsasked_ids_ls.append(quest_t.id)
                                last_prod_id = prod_t.id
                                num_prods_withmatches += 1  #i.e. the number of products that have questions
                            else:
                                logging.info("***")
                                #same product as previously; only append
                                questionsasked_ids_ls.append(quest_t.id)
                    #on to the next question:
                    q_ls = next(reader_2)
                    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]

        except StopIteration:
            exc_info = sys.exc_info()
            logging.warning("Exception information: %s", exc_info)
            break
    if len(last_prod_id) > 5 and len(questionsasked_ids_ls) > 0:
        #write out the last matched product and its questions; the loop above only flushes
        #a product when the next one is encountered, so the final group needs this step
        ids_outfile.write(str(last_prod_id) + "_" + str(questionsasked_ids_ls) + "\n")
    logging.info("Total number of products that have matching questions: %s",
                 num_prods_withmatches)
    logging.info("Products reviewed: %s", num_products_reviewed)
    logging.info("Questions reviewed: %s", num_questions_reviewed)

    ids_outfile.close()  #flush and close the output file before copying it
    copy(src=F.PRODSWITHQUESTS_IDS,
         dst=F.PRODSWITHQUESTS_IDS_ALL + dataset_type)

    end = time()
    logging.info("Time elapsed: %s", round(end - start, 4))
    prods_filehandler.close()
    quests_filehandler.close()
    ps_db_conn.close()
    qs_db_conn.close()
    #positive_qs_ids_file.close()
    return num_prods_withmatches
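A possible invocation of register_matches, assuming MyUtils_flags exposes the dataset flags used above; the feature-flag dictionaries are placeholders, since their exact structure depends on featurefilter_prod and featurefilter_quest, which are not shown here.

# Hypothetical feature flags; their real structure is defined by
# featurefilter_prod / featurefilter_quest elsewhere in the project.
product_featureflags = {"description": True, "keywords": True}
quest_featureflags = {"questionType": True, "keywords": True}

n_matched = register_matches(product_featureflags,
                             quest_featureflags,
                             dataset_type=MyUtils_flags.FLAG_TEST,
                             use_existing_file=True)
print("Products with at least one matching question:", n_matched)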
Example #4
def attach_text_to_candidates(ranked_candidates_dbpath, prods_initial_dbpath, quests_initial_dbpath, prod_reps_dbpath, quest_reps_dbpath, final_outdb_path):
    MyUtils.init_logging("Attach_text_to_candidates.log")

    candidates_nn_db = sqlite3.connect(ranked_candidates_dbpath)
    cands_db_c = candidates_nn_db.cursor()
    f = open(F.RANKING_TEMP_DB, 'w')
    f.close()  #truncate the temporary ranking DB between runs
    temp_db = sqlite3.connect(F.RANKING_TEMP_DB)
    temp_db_c = temp_db.cursor()
    testprods_initial_c = sqlite3.connect(prods_initial_dbpath).cursor()
    testquests_initial_c = sqlite3.connect(quests_initial_dbpath).cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    temp_db_c.execute('''CREATE TABLE candidates(  p_id varchar(63),
                                                   q_id varchar(63),
                                                   distance int,
                                                   p_titletext varchar(1023),
                                                   p_descriptiontext varchar(8191),
                                                   p_categorytext varchar (4095),
                                                   q_text varchar (8191)         
                                            )''')

    num_of_candidates = MyUtils_dbs.get_tot_num_rows_db(cands_db_c)
    logging.info(num_of_candidates)
    counter_questionsameid = 0
    last_prod_id = 'x'

    for rowindex in range(1, num_of_candidates + 1):
        row = cands_db_c.execute("SELECT * FROM candidates WHERE rowid = ?", (rowindex,)).fetchone()
        #logging.info("info: %s", row)
        prod_id = row[0]
        quest_id = row[1]
        distance = row[2]

        if last_prod_id != prod_id:
            product_titleinfo, product_descinfo, product_categinfo = \
                MyUtils_dbs.search_in_alltables_db(testprods_initial_c, "SELECT title, description, categories FROM",
                                                                  "WHERE asin = '" + str(prod_id) + "'")[0]
            product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                        "WHERE id = '" + str(prod_id) + "'")[0]
            prod_tpl = MyUtils.prodls_tonamedtuple(product_representation, offset=1)

            last_prod_id = prod_id  #remember the product, so its text is looked up only once
            counter_questionsameid = 0

        ###get the question's unixTime from its id
        if len(quest_id) < 21:  #short id: the unixTime field was NaN (format: @nan0)
            base_endpoint = 14
        else:
            base_endpoint = 23
        question_unixTime = str(quest_id[11:base_endpoint])
        logging.debug("Question unixTime: %s", question_unixTime)

        if base_endpoint == 23: #if we have a valid unixTime specification

            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c, "SELECT question FROM",
                                                                  "WHERE asin = '" + str(quest_id[0:10]) + "'"
                                                              + " AND unixTime LIKE '" + question_unixTime + "%'")
        else: #if we have NULL in the unixTime field
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c, "SELECT question FROM",
                                                                         "WHERE asin = '" + str(quest_id[0:10]) + "'"
                                                                         + " AND unixTime IS NULL")
        base_q_id = str(quest_id[0:23])
        possible_questions_reps = MyUtils_dbs.search_in_alltables_db(testquests_rep_c, "SELECT * FROM ",
                                                             "WHERE id LIKE '" + str(base_q_id) + "%'")
        logging.debug("possible_questions_reps: %s", possible_questions_reps)
        logging.debug("possible_questions_text:%s", possible_questions_text)

        if len(possible_questions_text) > 1:
            possible_questions_tuples = list(map ( lambda q_ls : MyUtils.quest_lstonamedtuple(q_ls, offset=1), possible_questions_reps))
            possible_questions_distances = list(map (lambda q_tpl : CD.compute_dist_pq(prod_tpl, q_tpl) , possible_questions_tuples))

            qs_dist_lts = list(zip(possible_questions_tuples, possible_questions_distances))
            qs_dist_lts_sorted = sorted( qs_dist_lts, key=lambda tpl : tpl[1])
            #logging.info("sorted question tuples: %s", qs_dist_lts_sorted)
            question_textinfo = possible_questions_text[counter_questionsameid][0]
            counter_questionsameid= counter_questionsameid+1
        else:
            question_textinfo = possible_questions_text[0][0]
        logging.debug("question_textinfo: %s", question_textinfo)

        temp_db_c.execute("INSERT INTO candidates VALUES (?,?,?,?,?,?,?)", (prod_id, quest_id, distance,
                                                                          product_titleinfo, product_descinfo, product_categinfo, question_textinfo))
        logging.debug("***")

    temp_db.commit()
    temp_db.close()  #close the connection before renaming the database file
    os.rename(F.RANKING_TEMP_DB, final_outdb_path)
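A call sketch for attach_text_to_candidates with hypothetical file names; in the project the real paths would presumably come from the F constants module.

# Hypothetical paths, for illustration only.
attach_text_to_candidates(ranked_candidates_dbpath="ranked_candidates_test.db",
                          prods_initial_dbpath="products_initial_test.db",
                          quests_initial_dbpath="questions_initial_test.db",
                          prod_reps_dbpath="prods_representations_test.db",
                          quest_reps_dbpath="quests_representations_test.db",
                          final_outdb_path="candidates_with_text_test.db")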