def sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c):
    """Rank a product's candidate questions by their distance to the product.

    For every question id in quests_ids, fetch the question's encoded
    representation from the questions-representation DB, compute the
    product-question distance, and return a list of
    (product_id, question_id, distance) tuples sorted by ascending distance.

    :param product_tuple: namedtuple with the product's encoded representation
    :param quests_ids: iterable of candidate question ids
    :param testprods_rep_c: cursor on the products-representation DB (unused here)
    :param testquests_rep_c: cursor on the questions-representation DB
    :return: list of (p_id, q_id, distance), ascending on distance
    """
    def distance_to(quest_id):
        # Look up the question's encoded row and convert it to a namedtuple.
        q_row = MyUtils_dbs.search_in_alltables_db(
            testquests_rep_c, "SELECT * FROM ", "WHERE id = '" + str(quest_id) + "'")[0]
        logging.debug("Question representation: %s", q_row)
        q_tuple = MyUtils.quest_lstonamedtuple(q_row, offset=1)
        return CD.compute_dist_pq(product_tuple, q_tuple)

    triples = [(product_tuple.id, q_id, distance_to(q_id)) for q_id in quests_ids]
    return sorted(triples, key=lambda triple: triple[2])
def get_instance_encoded_dictionary(prod_id, question_id, ps_db_c, qs_db_c, d2v_model):
    """Build one (x, y) instance for a product-question pair.

    x is a dict of decoded numeric features (product description / title /
    keyword vectors, metadata-category vectors, question vector / type /
    keyword vectors); y is 1 when the question id's leading product asin
    occurs in prod_id, else 0.

    :param prod_id: product id to look up in the products-encoding DB
    :param question_id: question id to look up in the questions-encoding DB
    :param ps_db_c: cursor on the products-encoding DB
    :param qs_db_c: cursor on the questions-encoding DB
    :param d2v_model: doc2vec model used to vectorize the metadata categories
    :return: namedtuple('instance', 'x y') with x = feature dict, y = 0/1 label
    """
    p_rows = MyUtils_dbs.search_in_alltables_db(
        ps_db_c, "SELECT * FROM ", "WHERE id = '" + prod_id + "'")
    q_rows = MyUtils_dbs.search_in_alltables_db(
        qs_db_c, "SELECT * FROM ", "WHERE id = '" + str(question_id) + "'")
    p_tpl = MyUtils.prodls_tonamedtuple(p_rows[0])
    q_tpl = MyUtils.quest_lstonamedtuple(q_rows[0])

    features = {
        "p_descvec": MyUtils_strings.fromstring_toarray(p_tpl.descvec),
        "p_titlevec": MyUtils_strings.fromstring_toarray(p_tpl.titlevec),
        "p_kwsVectors": MyUtils_strings.fromlls_toarrays(p_tpl.kwsVectors),
        "p_mdcategories": MyUtils_strings.categories_to_vecs_lls(
            MyUtils_strings.fromlls_toarrays(p_tpl.mdcategories), d2v_model),
        "q_questionVec": MyUtils_strings.fromstring_toarray(q_tpl.questionVec),
        "q_questionType": q_tpl.questionType,
        "q_kwsVectors": MyUtils_strings.fromlls_toarrays(q_tpl.kwsVectors),
    }
    # If the category vectors came back with an extra nesting level
    # (rank >= 3), drop the outer wrapper.
    if len(np.array(features["p_mdcategories"]).shape) >= 3:
        logging.debug("instance_x['p_mdcategories'].shape : %s",
                      np.array(features["p_mdcategories"]).shape)
        features["p_mdcategories"] = features["p_mdcategories"][0]

    # Positive example iff the question's 10-char asin prefix appears in prod_id.
    label = 1 if q_tpl.id[0:10] in prod_id else 0
    instance = namedtuple('instance', 'x y')
    return instance(x=features, y=label)
def register_matches(product_featureflags, quest_featureflags, dataset_type, use_existing_file):
    """Pair products with the questions asked about them (sorted-merge join).

    Walks the products file and the questions file in lockstep (both are
    '_'-delimited CSVs, apparently sorted by product asin — the merge relies
    on that ordering), keeps pairs that pass both feature filters, and writes
    one line per product ("<prod_id>_<list of question ids>") to
    F.PRODSWITHQUESTS_IDS, which is finally copied to
    F.PRODSWITHQUESTS_IDS_ALL + dataset_type.

    :param product_featureflags: flags forwarded to featurefilter_prod
    :param quest_featureflags: flags forwarded to featurefilter_quest
    :param dataset_type: dataset selector; FLAG_VALID / FLAG_TEST select the
                         validation/test files, anything else means "train"
    :param use_existing_file: if True and a non-empty matches file already
                              exists, only count distinct products in it
    :return: number of products that have at least one matching question
    """
    allmatches_filepath = F.PRODSWITHQUESTS_IDS_ALL + dataset_type
    # Fast path: if a previous run already produced the matches file,
    # just count the distinct product ids in it and return.
    if use_existing_file:
        if os.path.exists(allmatches_filepath):
            if os.path.getsize(allmatches_filepath) > 0:
                logging.info(
                    "The P-Q matches for the requested dataset were already found. They are located in the file:%s",
                    allmatches_filepath)
                last_prod_id = "x"  # sentinel: shorter than any real asin
                allmatches_file = open(file=allmatches_filepath, mode="r", newline='')
                reader = csv.reader(allmatches_file, delimiter='_', quotechar='"')
                reader.__next__()  #skip header
                count_ps_withmatches = 0
                while True:
                    try:
                        p_ls = reader.__next__()
                        prod_id = p_ls[0]
                        # Count each product once (file has one line per product).
                        if prod_id != last_prod_id:
                            count_ps_withmatches = count_ps_withmatches + 1
                            last_prod_id = prod_id
                    except StopIteration:
                        break
                allmatches_file.close()
                return count_ps_withmatches
    # Select the encoded-features DBs and the final CSV files for the dataset.
    if dataset_type == MyUtils_flags.FLAG_VALID:
        ps_db_filepath = F.PRODS_NUMENCODING_DB_VALID
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_VALID
        prods_filepath = F.PRODUCTS_FINAL_VALID
        quests_filepath = F.QUESTIONS_FINAL_VALID
    elif dataset_type == MyUtils_flags.FLAG_TEST:
        ps_db_filepath = F.PRODS_NUMENCODING_DB_TEST
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_TEST
        prods_filepath = F.PRODUCTS_FINAL_TEST
        quests_filepath = F.QUESTIONS_FINAL_TEST
    else:  #"train"
        ps_db_filepath = F.PRODS_NUMENCODING_DB_TRAIN
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_TRAIN
        prods_filepath = F.PRODUCTS_FINAL_TRAIN
        quests_filepath = F.QUESTIONS_FINAL_TRAIN
    MyUtils.init_logging("RegisterMatches.log", logging.INFO)
    start = time()
    f = open(F.PRODSWITHQUESTS_IDS, "w")
    f.close()  #clean outfile between runs
    ids_outfile = open(F.PRODSWITHQUESTS_IDS, "a")
    ids_outfile.write("id_questionsAsked\n")  # header line
    #connecting with the products, to filter them, based on the features we chose to include
    ps_db_conn = sqlite3.connect(ps_db_filepath)
    ps_db_cursor = ps_db_conn.cursor()
    # connecting with the questions, to filter them, based on the features we chose to include
    qs_db_conn = sqlite3.connect(qs_db_filepath)
    qs_db_cursor = qs_db_conn.cursor()
    prods_filehandler = open(prods_filepath, "r", newline='')
    quests_filehandler = open(quests_filepath, "r", newline='')
    reader_1 = csv.reader(prods_filehandler, delimiter='_', quotechar='"')
    reader_2 = csv.reader(quests_filehandler, delimiter='_', quotechar='"')
    num_prods_withmatches = 0
    num_products_reviewed = 0
    num_questions_reviewed = 0
    last_prod_id = "x"  # sentinel: len("x") <= 5, so it is never written out
    questionsasked_ids_ls = []  # question ids accumulated for the current product
    ### init:
    # NOTE(review): each reader is advanced twice before "skip headers" —
    # this skips the header AND the first row of each file; confirm the
    # files really have two non-data leading rows.
    reader_1.__next__()
    reader_2.__next__()
    reader_1.__next__()
    reader_2.__next__()  #skip headers
    p_ls = reader_1.__next__()
    q_ls = reader_2.__next__()
    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
    q_prod = (quest_t.id)[0:10]  # a question id starts with the 10-char product asin
    #loop:
    while True:
        try:
            match = False
            while not (match):
                # Advance the product stream while its id sorts before the
                # question's asin (length compared as a tie-breaker).
                while q_prod > prod_t.id or (len(q_prod) > len(prod_t.id)):
                    logging.debug("%s < %s", prod_t.id, q_prod)
                    p_ls = reader_1.__next__()  #advance product
                    num_products_reviewed = num_products_reviewed + 1
                    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
                # Symmetrically advance the question stream.
                while q_prod < prod_t.id or (len(q_prod) < len(prod_t.id)):
                    logging.debug("%s > %s", prod_t.id, q_prod)
                    q_ls = reader_2.__next__()  #advance question
                    num_questions_reviewed = num_questions_reviewed + 1
                    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]
                if q_prod == prod_t.id:
                    match = True
            #barrier: feature filtering on products and questions; DB lookup:
            if featurefilter_prod(prod_t.id, product_featureflags, ps_db_cursor) == True and \
               featurefilter_quest(quest_t.id, quest_featureflags, qs_db_cursor) == True:
                logging.info("Match: product: %s , \t question: %s", prod_t.id, quest_t.id)
                #positive_qs_ids_file.write(str(quest_t.id) + "\n")#store the question id (positive example)
                if len(prod_t.id) > 5:  # guard against malformed/sentinel ids
                    if prod_t.id != last_prod_id:
                        # New product: flush the previous product's question list.
                        if len(last_prod_id) > 5:
                            ids_outfile.write(
                                str(last_prod_id) + "_" + str(questionsasked_ids_ls) + "\n")  #write the previous p and qs
                        questionsasked_ids_ls = []  #reset, and then append
                        questionsasked_ids_ls.append(quest_t.id)
                        last_prod_id = prod_t.id
                        num_prods_withmatches = num_prods_withmatches + 1  #n: matches = number of products that have questions
                    else:
                        logging.info("***")
                        questionsasked_ids_ls.append(quest_t.id)  #same product as previously; only append
            else:
                pass
            #on to the next question:
            q_ls = reader_2.__next__()
            quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
            q_prod = (quest_t.id)[0:10]
        except StopIteration:
            # Either stream ran out: the join is complete.
            # NOTE(review): the LAST matched product's question list is never
            # flushed to ids_outfile (flushing only happens when a new product
            # is seen) — confirm whether that final write is missing.
            exc_info = sys.exc_info()
            logging.warning("Exception information: %s", exc_info)
            break
    logging.info("Total number products that have matching questions: %s", num_prods_withmatches)
    logging.info("Products reviewed: %s", num_products_reviewed)
    logging.info("Questions reviewed: %s", num_questions_reviewed)
    copy(src=F.PRODSWITHQUESTS_IDS, dst=F.PRODSWITHQUESTS_IDS_ALL + dataset_type)
    end = time()
    logging.info("Time elapsed: %s", round(end - start, 4))
    ids_outfile.close()
    prods_filehandler.close()
    quests_filehandler.close()
    #positive_qs_ids_file.close()
    return num_prods_withmatches
def attach_text_to_candidates(ranked_candidates_dbpath, prods_initial_dbpath, quests_initial_dbpath, prod_reps_dbpath, quest_reps_dbpath, final_outdb_path):
    """Join the ranked (p_id, q_id, distance) candidates with their raw text.

    Reads every row of the 'candidates' table in ranked_candidates_dbpath,
    looks up the product's title/description/categories and the question's
    text in the initial DBs, and writes the enriched rows into a fresh
    'candidates' table in a temp DB, which is finally renamed to
    final_outdb_path.

    :param ranked_candidates_dbpath: sqlite DB holding (p_id, q_id, distance) rows
    :param prods_initial_dbpath: sqlite DB with the products' raw text (keyed by asin)
    :param quests_initial_dbpath: sqlite DB with the questions' raw text (keyed by asin)
    :param prod_reps_dbpath: sqlite DB with the products' encoded representations
    :param quest_reps_dbpath: sqlite DB with the questions' encoded representations
    :param final_outdb_path: destination path of the enriched sqlite DB
    """
    MyUtils.init_logging("Attach_text_to_candidates.log")
    candidates_nn_db = sqlite3.connect(ranked_candidates_dbpath)
    cands_db_c = candidates_nn_db.cursor()
    # Truncate any leftover temp DB file from a previous run, then (re)create it.
    f = open(F.RANKING_TEMP_DB, 'w'); f.close()
    temp_db = sqlite3.connect(F.RANKING_TEMP_DB)
    temp_db_c = temp_db.cursor()
    # NOTE(review): these connections are never closed and temp_db is renamed
    # while still open — works on POSIX, but confirm this is intentional.
    testprods_initial_c = sqlite3.connect(prods_initial_dbpath).cursor()
    testquests_initial_c = sqlite3.connect(quests_initial_dbpath).cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()
    temp_db_c.execute('''CREATE TABLE candidates( p_id varchar(63), q_id varchar(63), distance int,
                         p_titletext varchar(1023), p_descriptiontext varchar(8191),
                         p_categorytext varchar (4095), q_text varchar (8191) )''')
    num_of_candidates = MyUtils_dbs.get_tot_num_rows_db(cands_db_c)
    logging.info(num_of_candidates)
    counter_questionsameid = 0  # index among several texts sharing a base question id
    last_prod_id = 'x'  # sentinel for "no product seen yet"
    # sqlite rowids are 1-based, hence the 1..N range.
    for rowindex in range(1, num_of_candidates + 1):
        row = cands_db_c.execute("SELECT * FROM candidates WHERE rowid = ?", (rowindex,)).fetchone()
        #logging.info("info: %s", row)
        prod_id = row[0]
        quest_id = row[1]
        distance = row[2]
        # Re-fetch product info only when the product changes.
        # NOTE(review): last_prod_id is never assigned prod_id, so this branch
        # runs for EVERY row and counter_questionsameid is reset each time
        # (so it stays 0 at the text selection below) — confirm whether a
        # 'last_prod_id = prod_id' update is missing.
        if last_prod_id != prod_id:
            product_titleinfo, product_descinfo, product_categinfo = \
                MyUtils_dbs.search_in_alltables_db(testprods_initial_c,
                                                   "SELECT title, description, categories FROM",
                                                   "WHERE asin = '" + str(prod_id) + "'")[0]
            product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c,
                                                                        "SELECT * FROM ",
                                                                        "WHERE id = '" + str(prod_id) + "'")[0]
            prod_tpl = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
            counter_questionsameid = 0
        ###get question's unixTime
        # Short ids (< 21 chars) carry no unixTime (format: @nan0); long ids
        # embed it at positions 11..23.
        if len(quest_id) < 21:  #format : @nan0
            base_endpoint = 14
            question_unixTime = str(quest_id[11:base_endpoint])
        else:
            base_endpoint = 23
            question_unixTime = str(quest_id[11:base_endpoint])
        logging.debug("Question unixTime: %s", question_unixTime)
        if base_endpoint == 23:  #if we have a valid unixTime specification
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c,
                "SELECT question FROM",
                "WHERE asin = '" + str(quest_id[0:10]) + "'" + " AND unixTime LIKE '" + question_unixTime + "%'")
        else:  #if we have NULL in the unixTime field
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c,
                "SELECT question FROM",
                "WHERE asin = '" + str(quest_id[0:10]) + "'" + " AND unixTime IS NULL")
        base_q_id = str(quest_id[0:23])
        possible_questions_reps = MyUtils_dbs.search_in_alltables_db(testquests_rep_c,
            "SELECT * FROM ",
            "WHERE id LIKE '" + str(base_q_id) + "%'")
        logging.debug("possible_questions_reps: %s", possible_questions_reps)
        logging.debug("possible_questions_text:%s", possible_questions_text)
        if len(possible_questions_text) > 1:
            # Several questions share the base id: rank their representations
            # by distance to the product.
            # NOTE(review): qs_dist_lts_sorted is computed but never used
            # (only the raw text list indexed by counter_questionsameid is) —
            # looks like dead code left from the commented-out logging below.
            possible_questions_tuples = list(map(
                lambda q_ls: MyUtils.quest_lstonamedtuple(q_ls, offset=1), possible_questions_reps))
            possible_questions_distances = list(map(
                lambda q_tpl: CD.compute_dist_pq(prod_tpl, q_tpl), possible_questions_tuples))
            qs_dist_lts = list(zip(possible_questions_tuples, possible_questions_distances))
            qs_dist_lts_sorted = sorted(qs_dist_lts, key=lambda tpl: tpl[1])
            #logging.info("sorted question tuples: %s", qs_dist_lts_sorted)
            question_textinfo = possible_questions_text[counter_questionsameid][0]
            counter_questionsameid = counter_questionsameid + 1
        else:
            question_textinfo = possible_questions_text[0][0]
        logging.debug("question_textinfo: %s", question_textinfo)
        temp_db_c.execute("INSERT INTO candidates VALUES (?,?,?,?,?,?,?)",
                          (prod_id, quest_id, distance,
                           product_titleinfo, product_descinfo, product_categinfo, question_textinfo))
        logging.debug("***")
    temp_db.commit()
    os.rename(F.RANKING_TEMP_DB, final_outdb_path)