def sort_candidates(candidates_db_path, ranked_candidates_outdb_path, prod_reps_dbpath, quest_reps_dbpath):
    """For each test product, rank its candidate questions by distance.

    Reads the distinct product ids from the candidates db at
    candidates_db_path, fetches each product's representation from
    prod_reps_dbpath, sorts that product's candidate questions via
    sort_product_candidates(), and writes (p_id, q_id, distance) rows
    into a freshly (re)created sqlite db at ranked_candidates_outdb_path.
    """
    MyUtils.init_logging("Rank_candidates_nn.log")
    ### Connecting to the databases: candidates, test products, test questions
    candidates_nn_db = sqlite3.connect(candidates_db_path)
    cands_db_c = candidates_nn_db.cursor()
    # Keep references to the connections (not just cursors) so they can be closed.
    testprods_rep_db = sqlite3.connect(prod_reps_dbpath)
    testprods_rep_c = testprods_rep_db.cursor()
    testquests_rep_db = sqlite3.connect(quest_reps_dbpath)
    testquests_rep_c = testquests_rep_db.cursor()
    # Truncate the output db file between runs, then (re)create the schema.
    open(ranked_candidates_outdb_path, "w").close()
    outdb = sqlite3.connect(ranked_candidates_outdb_path)
    outdb_c = outdb.cursor()
    outdb_c.execute('''CREATE TABLE candidates( p_id varchar(63), q_id varchar(63), distance int )''')
    try:
        test_products_ids = cands_db_c.execute("SELECT DISTINCT p_id FROM candidates").fetchall()
        logging.info(test_products_ids[0])
        #logging.debug(test_products_ids)
        for tpl_pid in test_products_ids:
            pid = tpl_pid[0]
            # NOTE(review): the id is spliced into the query string because the
            # helper only accepts query fragments; the ids come from our own db,
            # but a parameterized variant of the helper would be safer.
            product_representation = MyUtils_dbs.search_in_alltables_db(
                testprods_rep_c, "SELECT * FROM ", "WHERE id = '" + str(pid) + "'")[0]
            product_tuple = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
            quests_ids = [results_tpl[0] for results_tpl in
                          cands_db_c.execute("SELECT q_id FROM candidates WHERE p_id = ?", tpl_pid).fetchall()]
            logging.debug(quests_ids)
            product_qs_sorted = sort_product_candidates(product_tuple, quests_ids,
                                                        testprods_rep_c, testquests_rep_c)
            outdb.executemany("INSERT INTO candidates VALUES (?,?,?)", product_qs_sorted)
            outdb.commit()
    finally:
        # Fix: the connections were previously never closed (resource leak).
        candidates_nn_db.close()
        testprods_rep_db.close()
        testquests_rep_db.close()
        outdb.close()
def get_instance_encoded_dictionary(prod_id, question_id, ps_db_c, qs_db_c, d2v_model):
    """Build one (x, y) instance for the given product/question pair.

    x is a dict of decoded feature vectors for the product and the question;
    y is 1 when the question id's 10-char asin prefix occurs in prod_id, else 0.
    Returns a namedtuple with fields 'x' and 'y'.
    """
    prod_rows = MyUtils_dbs.search_in_alltables_db(
        ps_db_c, "SELECT * FROM ", "WHERE id = '" + prod_id + "'")
    quest_rows = MyUtils_dbs.search_in_alltables_db(
        qs_db_c, "SELECT * FROM ", "WHERE id = '" + str(question_id) + "'")
    p_tpl = MyUtils.prodls_tonamedtuple(prod_rows[0])
    q_tpl = MyUtils.quest_lstonamedtuple(quest_rows[0])

    # Decode the stringified vectors stored in the db into arrays.
    features = {
        "p_descvec": MyUtils_strings.fromstring_toarray(p_tpl.descvec),
        "p_titlevec": MyUtils_strings.fromstring_toarray(p_tpl.titlevec),
        "p_kwsVectors": MyUtils_strings.fromlls_toarrays(p_tpl.kwsVectors),
        "p_mdcategories": MyUtils_strings.categories_to_vecs_lls(
            MyUtils_strings.fromlls_toarrays(p_tpl.mdcategories), d2v_model),
        "q_questionVec": MyUtils_strings.fromstring_toarray(q_tpl.questionVec),
        "q_questionType": q_tpl.questionType,
        "q_kwsVectors": MyUtils_strings.fromlls_toarrays(q_tpl.kwsVectors),
    }
    # Unwrap one extra nesting level in the category vectors, when present.
    if len(np.array(features["p_mdcategories"]).shape) >= 3:
        logging.debug("instance_x['p_mdcategories'].shape : %s",
                      np.array(features["p_mdcategories"]).shape)
        features["p_mdcategories"] = features["p_mdcategories"][0]

    label = 1 if q_tpl.id[0:10] in prod_id else 0
    instance = namedtuple('instance', 'x y')
    return instance(x=features, y=label)
def _count_products_in_matches_file(allmatches_filepath):
    """Count the distinct product ids (first '_'-separated field) in an
    already-computed matches file, skipping its header line."""
    count_ps_withmatches = 0
    last_prod_id = "x"
    with open(file=allmatches_filepath, mode="r", newline='') as allmatches_file:
        reader = csv.reader(allmatches_file, delimiter='_', quotechar='"')
        next(reader)  # skip header
        for p_ls in reader:
            prod_id = p_ls[0]
            if prod_id != last_prod_id:
                count_ps_withmatches = count_ps_withmatches + 1
                last_prod_id = prod_id
    return count_ps_withmatches


def register_matches(product_featureflags, quest_featureflags, dataset_type, use_existing_file):
    """Scan the sorted products and questions files for dataset_type in
    lockstep and record, for every product passing the feature filters,
    the ids of the questions asked about it.

    Writes "<prod_id>_<[question ids]>" lines to F.PRODSWITHQUESTS_IDS and
    copies the result to F.PRODSWITHQUESTS_IDS_ALL + dataset_type.
    Returns the number of products that have at least one matching question.
    If use_existing_file is True and a non-empty matches file already exists,
    the count is recomputed from that file and nothing is rescanned.
    """
    allmatches_filepath = F.PRODSWITHQUESTS_IDS_ALL + dataset_type
    if use_existing_file:
        if os.path.exists(allmatches_filepath):
            if os.path.getsize(allmatches_filepath) > 0:
                logging.info(
                    "The P-Q matches for the requested dataset were already found. They are located in the file:%s",
                    allmatches_filepath)
                return _count_products_in_matches_file(allmatches_filepath)

    # Select the encoded-feature dbs and the csv files for the requested split.
    if dataset_type == MyUtils_flags.FLAG_VALID:
        ps_db_filepath = F.PRODS_NUMENCODING_DB_VALID
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_VALID
        prods_filepath = F.PRODUCTS_FINAL_VALID
        quests_filepath = F.QUESTIONS_FINAL_VALID
    elif dataset_type == MyUtils_flags.FLAG_TEST:
        ps_db_filepath = F.PRODS_NUMENCODING_DB_TEST
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_TEST
        prods_filepath = F.PRODUCTS_FINAL_TEST
        quests_filepath = F.QUESTIONS_FINAL_TEST
    else:  # "train"
        ps_db_filepath = F.PRODS_NUMENCODING_DB_TRAIN
        qs_db_filepath = F.QUESTS_NUMENCODING_DB_TRAIN
        prods_filepath = F.PRODUCTS_FINAL_TRAIN
        quests_filepath = F.QUESTIONS_FINAL_TRAIN

    MyUtils.init_logging("RegisterMatches.log", logging.INFO)
    start = time()
    open(F.PRODSWITHQUESTS_IDS, "w").close()  # clean outfile between runs
    ids_outfile = open(F.PRODSWITHQUESTS_IDS, "a")
    ids_outfile.write("id_questionsAsked\n")
    # connecting with the products db, to filter them on the chosen features
    ps_db_conn = sqlite3.connect(ps_db_filepath)
    ps_db_cursor = ps_db_conn.cursor()
    # connecting with the questions db, to filter them on the chosen features
    qs_db_conn = sqlite3.connect(qs_db_filepath)
    qs_db_cursor = qs_db_conn.cursor()
    prods_filehandler = open(prods_filepath, "r", newline='')
    quests_filehandler = open(quests_filepath, "r", newline='')
    reader_1 = csv.reader(prods_filehandler, delimiter='_', quotechar='"')
    reader_2 = csv.reader(quests_filehandler, delimiter='_', quotechar='"')
    num_prods_withmatches = 0
    num_products_reviewed = 0
    num_questions_reviewed = 0
    last_prod_id = "x"
    questionsasked_ids_ls = []
    ### init: skip the two header lines of each file
    next(reader_1); next(reader_2)
    next(reader_1); next(reader_2)
    p_ls = next(reader_1)
    q_ls = next(reader_2)
    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
    q_prod = (quest_t.id)[0:10]
    # loop: merge-join on the (sorted) product id
    while True:
        try:
            match = False
            while not match:
                while q_prod > prod_t.id or (len(q_prod) > len(prod_t.id)):
                    logging.debug("%s < %s", prod_t.id, q_prod)
                    p_ls = next(reader_1)  # advance product
                    num_products_reviewed = num_products_reviewed + 1
                    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
                while q_prod < prod_t.id or (len(q_prod) < len(prod_t.id)):
                    logging.debug("%s > %s", prod_t.id, q_prod)
                    q_ls = next(reader_2)  # advance question
                    num_questions_reviewed = num_questions_reviewed + 1
                    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]
                if q_prod == prod_t.id:
                    match = True
            # barrier: feature filtering on products and questions; DB lookup
            if featurefilter_prod(prod_t.id, product_featureflags, ps_db_cursor) == True and \
               featurefilter_quest(quest_t.id, quest_featureflags, qs_db_cursor) == True:
                logging.info("Match: product: %s , \t question: %s", prod_t.id, quest_t.id)
                if len(prod_t.id) > 5:
                    if prod_t.id != last_prod_id:
                        if len(last_prod_id) > 5:
                            # write the previous product and its questions
                            ids_outfile.write(str(last_prod_id) + "_" + str(questionsasked_ids_ls) + "\n")
                        questionsasked_ids_ls = []  # reset, and then append
                        questionsasked_ids_ls.append(quest_t.id)
                        last_prod_id = prod_t.id
                        # n: matches = number of products that have questions
                        num_prods_withmatches = num_prods_withmatches + 1
                    else:
                        logging.info("***")
                        questionsasked_ids_ls.append(quest_t.id)  # same product as previously; only append
            # on to the next question:
            q_ls = next(reader_2)
            quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
            q_prod = (quest_t.id)[0:10]
        except StopIteration:
            exc_info = sys.exc_info()
            logging.warning("Exception information: %s", exc_info)
            break
    # FIX: the loop only writes a product's questions when the NEXT product
    # begins, so the final product's group was previously never written.
    if len(last_prod_id) > 5 and len(questionsasked_ids_ls) > 0:
        ids_outfile.write(str(last_prod_id) + "_" + str(questionsasked_ids_ls) + "\n")
    logging.info("Total number products that have matching questions: %s", num_prods_withmatches)
    logging.info("Products reviewed: %s", num_products_reviewed)
    logging.info("Questions reviewed: %s", num_questions_reviewed)
    # FIX: close (and thereby flush) the output file BEFORE copying it;
    # previously the copy could miss the still-buffered tail of the file.
    ids_outfile.close()
    copy(src=F.PRODSWITHQUESTS_IDS, dst=F.PRODSWITHQUESTS_IDS_ALL + dataset_type)
    end = time()
    logging.info("Time elapsed: %s", round(end - start, 4))
    prods_filehandler.close()
    quests_filehandler.close()
    return num_prods_withmatches
def define_negative_examples(doc2vec_model, dataset_typeflag):
    """For each product listed in F.PRODSWITHQUESTS_IDS, pick a set of
    "negative" questions (questions not asked about that product) and write
    "<prod_id>_<[question ids]>" lines to F.PRODS_WITH_NOTASKEDQUESTS_IDS.

    For the training split, a candidate question asked about product P2 is
    discarded when P2 is too similar to P1 (similarity above a breakpoint
    computed/loaded via ES), so near-duplicates are not used as negatives.
    The number of negatives per product is capped at the number of questions
    actually asked about it.
    """
    MyUtils.init_logging("NN_Dataset_Instances-define_negative_examples.log", logging.INFO)
    # Truncate the output file between runs, then reopen in append mode.
    f = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "w")
    f.close()
    prodsnegativeqs_outfile = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "a")
    prodsnegativeqs_outfile.write("id_questionsNotAsked\n")
    ### Connect with the database to read from: candidate negative examples
    db_conn = sqlite3.connect(F.CANDIDATE_NEGQS_DB)
    c = db_conn.cursor()
    ### IF we are working to create the training dataset,
    ### then before allowing a question Q asked for P2 to be a negative example for P1,
    ### we check the similarity between P1 and P2 (it must not be too high)
    if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:
        ### Determining the maximum allowed similarity between products.
        ### Creates the similarity db if it does not exist.
        if os.path.exists(F.SIMILARITY_PRODUCTS_DB) == True:
            p_sim_breakpoint = ES.get_products_similarity_breakpoint(fraction=0.97)
        else:
            p_sim_breakpoint = ES.explore_products_similarity(N=500, fraction=0.97)
        ### Connect with the databases of product and questions representations,
        ### to be able to pick the products P1 and P2
        product_reps_dbconn = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
        product_reps_c = product_reps_dbconn.cursor()
    segment_size = 10**4
    # Stream the matches file in chunks to bound memory usage.
    for input_segment in pd.read_csv(F.PRODSWITHQUESTS_IDS, sep="_", chunksize=segment_size):
        for id_askedqs_t in input_segment.itertuples():
            prod_id = id_askedqs_t.id
            #logging.debug("Reading from F.PRODSWITHQUESTS_IDS, the product.id is: %s", prod_id)
            # questionsAsked is a stringified Python list of question ids.
            asked_qs = ast.literal_eval(id_askedqs_t.questionsAsked)
            t = (prod_id, )
            c.execute('SELECT * FROM prodnegatives WHERE prod_id=?', t)
            row = c.fetchone()
            if row is None:
                # i.e. the product in the file PRODSWITHQUESTS_IDS was
                # excluded from the previous random subsampling
                continue
            # row[1] is a raw string of candidate ids; rebuild a parseable list
            # literal by bracketing it (dropping the trailing character).
            candidatenegativeqs_rawstring = row[1]
            candidatenegativeqs_string = "[" + candidatenegativeqs_rawstring[:-1] + "]"
            candidatenegativeqs_ls = ast.literal_eval(candidatenegativeqs_string)
            # Drop any candidate that was actually asked about this product.
            candidatenegativeqs_ls1 = [q_id for q_id in candidatenegativeqs_ls if q_id not in asked_qs]
            if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:
                p1_row = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext=" WHERE id='" + str(prod_id) + "'")[0]
                # First 10 chars of a question id are the product asin.
                candidatenegativeqs_asins = list(map(lambda q_id: q_id[0:10], candidatenegativeqs_ls1))
                # NOTE(review): str(tuple(...)) yields "('X',)" for a single
                # element — the trailing comma is not valid SQL for IN; also
                # verify that the rows come back in the same order as the ids,
                # since the zip below assumes positional alignment.
                p2_rows = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext="WHERE id IN " + str(tuple(candidatenegativeqs_asins)))
                qids_and_p2rows = list(zip(candidatenegativeqs_ls1, p2_rows))
                for q_id, p2_row in qids_and_p2rows:
                    #logging.debug("p1_row : %s", p1_row)
                    if p2_row is not None and len(p2_row) > 0:
                        # there are questions without corresponding products,
                        # in which case no similarity check is to be done
                        p1_tuple = MyUtils.prodls_tonamedtuple(p1_row)  # [1:]?
                        p2_tuple = MyUtils.prodls_tonamedtuple(p2_row)
                        p1_p2_sim, _simparts = PS.compute_2products_similarity_singleprocess(
                            prod1_tuple=p1_tuple, prod2_tuple=p2_tuple, d2v_model=doc2vec_model)
                        # Too similar: P2's question could plausibly apply to
                        # P1, so it must not be used as a negative example.
                        if p1_p2_sim > p_sim_breakpoint:
                            candidatenegativeqs_ls1.remove(q_id)
                            logging.info(
                                "Removing question from the candidate negative examples, "
                                + "because the similarity between %s and %s is > %s",
                                prod_id, p2_tuple.id, p_sim_breakpoint)
                logging.info(
                    "Choosing negative examples: P-to-p similarity checks done for product: %s",
                    prod_id)
            # Sample (without replacement) at most as many negatives as there
            # are asked questions, keeping the original relative order.
            # NOTE(review): raises if candidatenegativeqs_ls1 is empty — confirm
            # upstream guarantees at least one surviving candidate.
            random_indices = sorted(
                np.random.choice(a=range(len(candidatenegativeqs_ls1)),
                                 size=min(len(candidatenegativeqs_ls1), len(asked_qs)),
                                 replace=False,
                                 p=None))
            #logging.info(candidatenegativeqs_ls1)
            negativeqs_ls = [candidatenegativeqs_ls1[i] for i in random_indices]
            #logging.info(negativeqs_ls)
            prodsnegativeqs_outfile.write(prod_id + "_" + str(negativeqs_ls) + "\n")
    prodsnegativeqs_outfile.close()
def register_matches():
    """Online-learning variant: scan the sorted TRAIN products and questions
    files in lockstep and record, for every product, the ids of the questions
    asked about it (no feature filtering in this variant).

    Writes "<prod_id>_<[question ids]>" lines to F.ONLINE_PQMATCHES.
    Returns the number of products that have at least one matching question.
    """
    prods_filepath = F.PRODUCTS_FINAL_TRAIN
    quests_filepath = F.QUESTIONS_FINAL_TRAIN
    MyUtils.init_logging("OnlineLearning_RegisterMatches.log", logging.INFO)
    start = time()
    open(F.ONLINE_PQMATCHES, "w").close()  # clean outfile between runs
    ids_outfile = open(F.ONLINE_PQMATCHES, "a")
    ids_outfile.write("id_questionsAsked\n")
    prods_filehandler = open(prods_filepath, "r", newline='')
    quests_filehandler = open(quests_filepath, "r", newline='')
    reader_1 = csv.reader(prods_filehandler, delimiter='_', quotechar='"')
    reader_2 = csv.reader(quests_filehandler, delimiter='_', quotechar='"')
    num_prods_withmatches = 0
    num_products_reviewed = 0
    num_questions_reviewed = 0
    last_prod_id = "x"
    questionsasked_ids_ls = []
    ### init: skip the two header lines of each file
    next(reader_1); next(reader_2)
    next(reader_1); next(reader_2)
    p_ls = next(reader_1)
    q_ls = next(reader_2)
    # Consistency fix: use the plain MyUtils.* form used by the first call
    # here and by the rest of this file, instead of mixing in utilities.MyUtils.*.
    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
    q_prod = (quest_t.id)[0:10]
    # loop: merge-join on the (sorted) product id
    while True:
        try:
            match = False
            while not match:
                while q_prod > prod_t.id or (len(q_prod) > len(prod_t.id)):
                    logging.debug("%s < %s", prod_t.id, q_prod)
                    p_ls = next(reader_1)  # advance product
                    num_products_reviewed = num_products_reviewed + 1
                    prod_t = MyUtils.prodls_tonamedtuple(p_ls, offset=0)
                while q_prod < prod_t.id or (len(q_prod) < len(prod_t.id)):
                    logging.debug("%s > %s", prod_t.id, q_prod)
                    q_ls = next(reader_2)  # advance question
                    num_questions_reviewed = num_questions_reviewed + 1
                    quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
                    q_prod = (quest_t.id)[0:10]
                if q_prod == prod_t.id:
                    match = True
            logging.info("Match: product: %s , \t question: %s", prod_t.id, quest_t.id)
            if len(prod_t.id) > 5:
                if prod_t.id != last_prod_id:
                    if len(last_prod_id) > 5:
                        # write the previous product and its questions
                        ids_outfile.write(str(last_prod_id) + "_" + str(questionsasked_ids_ls) + "\n")
                    questionsasked_ids_ls = []  # reset, and then append
                    questionsasked_ids_ls.append(quest_t.id)
                    last_prod_id = prod_t.id
                    # n: matches = number of products that have questions
                    num_prods_withmatches = num_prods_withmatches + 1
                else:
                    logging.info("***")
                    questionsasked_ids_ls.append(quest_t.id)  # same product as previously; only append
            # on to the next question:
            q_ls = next(reader_2)
            quest_t = MyUtils.quest_lstonamedtuple(q_ls, offset=0)
            q_prod = (quest_t.id)[0:10]
        except StopIteration:
            logging.warning("Exception information: %s", sys.exc_info())
            break
    # FIX: the loop only writes a product's questions when the NEXT product
    # begins, so the final product's group was previously never written.
    if len(last_prod_id) > 5 and len(questionsasked_ids_ls) > 0:
        ids_outfile.write(str(last_prod_id) + "_" + str(questionsasked_ids_ls) + "\n")
    logging.info("Total number products that have matching questions: %s", num_prods_withmatches)
    logging.info("Products reviewed: %s", num_products_reviewed)
    logging.info("Questions reviewed: %s", num_questions_reviewed)
    end = time()
    logging.info("Time elapsed: %s", round(end - start, 4))
    ids_outfile.close()
    prods_filehandler.close()
    quests_filehandler.close()
    return num_prods_withmatches
def attach_text_to_candidates(ranked_candidates_dbpath, prods_initial_dbpath, quests_initial_dbpath,
                              prod_reps_dbpath, quest_reps_dbpath, final_outdb_path):
    """Join the ranked (p_id, q_id, distance) candidates with their original
    product text (title/description/categories) and question text, writing the
    enriched rows into a temp sqlite db that is then renamed to final_outdb_path.

    Question ids encode the product asin in chars [0:10] and a unixTime
    fragment starting at char 11; that fragment is used to locate the original
    question text (LIKE on the time prefix, or IS NULL when absent).
    """
    MyUtils.init_logging("Attach_text_to_candidates.log")
    candidates_nn_db = sqlite3.connect(ranked_candidates_dbpath)
    cands_db_c = candidates_nn_db.cursor()
    # Truncate the temp db file between runs, then (re)create the schema.
    f = open(F.RANKING_TEMP_DB, 'w')
    f.close()
    temp_db = sqlite3.connect(F.RANKING_TEMP_DB)
    temp_db_c = temp_db.cursor()
    testprods_initial_c = sqlite3.connect(prods_initial_dbpath).cursor()
    testquests_initial_c = sqlite3.connect(quests_initial_dbpath).cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()
    temp_db_c.execute('''CREATE TABLE candidates( p_id varchar(63), q_id varchar(63), distance int, p_titletext varchar(1023), p_descriptiontext varchar(8191), p_categorytext varchar (4095), q_text varchar (8191) )''')
    num_of_candidates = MyUtils_dbs.get_tot_num_rows_db(cands_db_c)
    logging.info(num_of_candidates)
    # Index into the several same-id question texts; reset per product.
    counter_questionsameid = 0
    last_prod_id = 'x'
    # Rows are fetched one at a time by rowid (1-based) to preserve rank order.
    for rowindex in range(1, num_of_candidates + 1):
        row = cands_db_c.execute("SELECT * FROM candidates WHERE rowid = ?", (rowindex,)).fetchone()
        #logging.info("info: %s", row)
        prod_id = row[0]
        quest_id = row[1]
        distance = row[2]
        # Product info is looked up only when the product changes (rows for the
        # same product are contiguous in the ranked db).
        if last_prod_id != prod_id:
            product_titleinfo, product_descinfo, product_categinfo = \
                MyUtils_dbs.search_in_alltables_db(testprods_initial_c,
                                                   "SELECT title, description, categories FROM",
                                                   "WHERE asin = '" + str(prod_id) + "'")[0]
            product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c,
                                                                        "SELECT * FROM ",
                                                                        "WHERE id = '" + str(prod_id) + "'")[0]
            prod_tpl = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
            counter_questionsameid = 0
        ### get question's unixTime
        if len(quest_id) < 21:  # format : @nan0  (no unixTime in the id)
            base_endpoint = 14
            question_unixTime = str(quest_id[11:base_endpoint])
        else:
            base_endpoint = 23
            question_unixTime = str(quest_id[11:base_endpoint])
        logging.debug("Question unixTime: %s", question_unixTime)
        if base_endpoint == 23:  # if we have a valid unixTime specification
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(
                testquests_initial_c, "SELECT question FROM",
                "WHERE asin = '" + str(quest_id[0:10]) + "'" + " AND unixTime LIKE '" + question_unixTime + "%'")
        else:  # if we have NULL in the unixTime field
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(
                testquests_initial_c, "SELECT question FROM",
                "WHERE asin = '" + str(quest_id[0:10]) + "'" + " AND unixTime IS NULL")
        base_q_id = str(quest_id[0:23])
        possible_questions_reps = MyUtils_dbs.search_in_alltables_db(
            testquests_rep_c, "SELECT * FROM ",
            "WHERE id LIKE '" + str(base_q_id) + "%'")
        logging.debug("possible_questions_reps: %s", possible_questions_reps)
        logging.debug("possible_questions_text:%s", possible_questions_text)
        if len(possible_questions_text) > 1:
            # Several questions share the same id prefix: rank them by distance
            # to the product...
            possible_questions_tuples = list(map(
                lambda q_ls: MyUtils.quest_lstonamedtuple(q_ls, offset=1), possible_questions_reps))
            possible_questions_distances = list(map(
                lambda q_tpl: CD.compute_dist_pq(prod_tpl, q_tpl), possible_questions_tuples))
            qs_dist_lts = list(zip(possible_questions_tuples, possible_questions_distances))
            # NOTE(review): qs_dist_lts_sorted is computed but never used below —
            # the text is picked by occurrence order, not by this ranking.
            qs_dist_lts_sorted = sorted(qs_dist_lts, key=lambda tpl: tpl[1])
            #logging.info("sorted question tuples: %s", qs_dist_lts_sorted)
            # NOTE(review): counter_questionsameid can exceed
            # len(possible_questions_text) if more same-prefix candidates occur
            # for one product than texts found — would raise IndexError; confirm
            # upstream guarantees this cannot happen.
            question_textinfo = possible_questions_text[counter_questionsameid][0]
            counter_questionsameid = counter_questionsameid + 1
        else:
            question_textinfo = possible_questions_text[0][0]
        logging.debug("question_textinfo: %s", question_textinfo)
        temp_db_c.execute("INSERT INTO candidates VALUES (?,?,?,?,?,?,?)",
                          (prod_id, quest_id, distance, product_titleinfo, product_descinfo,
                           product_categinfo, question_textinfo))
        logging.debug("***")
    temp_db.commit()
    # Atomically publish the finished temp db under its final name.
    os.rename(F.RANKING_TEMP_DB, final_outdb_path)