Example no. 1
def write_negative_instances(num_ps_rows, num_qs_rows, prods_db_c,
                             num_random_qs_per_prod, quests_db_c, outc, outdb):

    for rowid in (range(1, num_ps_rows + 1)):
        p_id = MyUtils_dbs.search_in_alltables_db(
            prods_db_c, "SELECT id FROM", "WHERE rowid = " + str(rowid))[0][0]
        neg_qs_ids = []
        neg_qs_indices = np.random.choice(a=range(1, num_qs_rows),
                                          size=num_random_qs_per_prod,
                                          replace=False)
        for neg_qs_index in neg_qs_indices:
            neg_qs_ids.append(
                MyUtils_dbs.search_in_alltables_db(
                    quests_db_c, "SELECT id FROM",
                    "WHERE `index` = " + str(neg_qs_index))[0][0])

        if (product_has_allfeatures(prods_db_c, p_id)
                and allquestions_have_allfeatures(quests_db_c, str(neg_qs_ids))):
            insertion_sequence = [(p_id, q_id, 0) for q_id in neg_qs_ids]
            outc.executemany("INSERT INTO instances VALUES (?,?,?)",
                             insertion_sequence)
        else:
            logging.info(
                "Product %s excluded from the instances due to not having all the features",
                p_id)

        if rowid % max(1, num_ps_rows // 10) == 0:  # guard against categories with fewer than 10 products
            logging.info("Working on category: +10%...")
            outdb.commit()
Example no. 2
def sort_candidates(candidates_db_path, ranked_candidates_outdb_path, prod_reps_dbpath, quest_reps_dbpath):
    MyUtils.init_logging("Rank_candidates_nn.log")
    ### Connecting to the databases: candidates, test products, test questions
    candidates_nn_db = sqlite3.connect(candidates_db_path)
    cands_db_c = candidates_nn_db.cursor()

    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    f = open(ranked_candidates_outdb_path, "w"); f.close()
    outdb = sqlite3.connect(ranked_candidates_outdb_path)
    outdb_c = outdb.cursor()
    outdb_c.execute('''CREATE TABLE candidates(    p_id varchar(63),
                                                   q_id varchar(63),
                                                   distance int        
                                            )''')
    ###

    test_products_ids = cands_db_c.execute("SELECT DISTINCT p_id FROM candidates").fetchall()
    logging.info(test_products_ids[0])
    #logging.debug(test_products_ids)
    for tpl_pid in test_products_ids:
        pid = tpl_pid[0]
        product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                    "WHERE id = '" + str(pid) + "'")[0]
        product_tuple = MyUtils.prodls_tonamedtuple(product_representation, offset=1)
        quests_ids = [results_tpl[0] for results_tpl in
                      cands_db_c.execute("SELECT q_id FROM candidates WHERE p_id = ?", tpl_pid).fetchall()]
        logging.debug(quests_ids)
        product_qs_sorted = sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c)
        outdb.executemany("INSERT INTO candidates VALUES (?,?,?)", product_qs_sorted)
    outdb.commit()
Example no. 3
def shuffle_db_table(db_path):
    db_conn = sqlite3.connect(db_path)
    c = db_conn.cursor()

    temp_db = db_path + "_temp"
    f = open(temp_db, "w")
    f.close()  #clean outdb
    outdb_conn = sqlite3.connect(temp_db)
    outc = outdb_conn.cursor()
    outc.execute('''CREATE TABLE instances(p varchar(63),
                                           q varchar(63),
                                           y tinyint)  ''')
    outdb_conn.commit()

    tot_num_of_rows = MyUtils_dbs.get_tot_num_rows_db(c)
    rand_indices = np.random.choice(range(1, tot_num_of_rows + 1),
                                    tot_num_of_rows,
                                    replace=False)

    for ind in rand_indices:
        picked_row = c.execute("SELECT * FROM instances WHERE rowid = " +
                               str(ind)).fetchone()
        p = picked_row[0]
        q = picked_row[1]
        y = picked_row[2]
        outc.execute('''INSERT INTO instances VALUES (?,?,?);''',
                     (str(p), str(q), str(y)))
    outdb_conn.commit()
    logging.info("Instances have been shuffled.")

    # close both connections before replacing the original database file
    outdb_conn.close()
    db_conn.close()
    os.rename(src=temp_db, dst=db_path)
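
The same shuffle can also be delegated to SQLite, avoiding the row-by-row fetch. A minimal alternative sketch, assuming the same table layout as above (an illustration, not the repository's code):

import sqlite3

def shuffle_db_table_sql(db_path):
    # Rebuild the instances table in random order within the same database file.
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute("CREATE TABLE instances_shuffled AS "
              "SELECT * FROM instances ORDER BY RANDOM()")
    c.execute("DROP TABLE instances")
    c.execute("ALTER TABLE instances_shuffled RENAME TO instances")
    conn.commit()
    conn.close()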
Example no. 4
def obtain_category_instances(category_dirpath, categ_products_db,
                              categ_questions_db, max_neg_cardinality):
    logging.info("Extracting instances for the category: %s",
                 os.path.basename(category_dirpath))
    prods_db_c = categ_products_db.cursor()
    quests_db_c = categ_questions_db.cursor()

    outdbname = MyUtils_flags.FLAG_INSTANCEIDS + ".db"
    f = open(os.path.join(category_dirpath, outdbname), "w")
    f.close()
    outdb = sqlite3.connect(os.path.join(category_dirpath, outdbname))
    outc = outdb.cursor()
    outc.execute('''CREATE TABLE instances(p varchar(63),
                                           q varchar(63),
                                           y tinyint)  ''')
    outdb.commit()

    ### Get the number of Ps and Qs. Generally, |Ps| << |Qs| (e.g. 119 vs. 43608)
    num_ps_rows = MyUtils_dbs.get_tot_num_rows_db(prods_db_c)
    logging.info("Number of products in category: %s", num_ps_rows)
    num_qs_rows = MyUtils_dbs.get_tot_num_rows_db(quests_db_c)
    logging.info("Number of questions in category: %s", num_qs_rows)
    num_possible_instances = num_ps_rows * num_qs_rows
    logging.info("Potential total number of instances from the category: %s",
                 num_possible_instances)
    cardinality = min(num_possible_instances, max_neg_cardinality)
    logging.info(
        "Given the upper bound, the number of negative instances to include in the category dataset is: %s",
        cardinality)
    num_random_qs_per_prod = cardinality // num_ps_rows
    logging.info("Number of random negative examples per product: %s",
                 num_random_qs_per_prod)

    write_positive_instances(num_ps_rows, prods_db_c, quests_db_c, outc)
    outdb.commit()
    write_negative_instances(num_ps_rows, num_qs_rows, prods_db_c,
                             num_random_qs_per_prod, quests_db_c, outc, outdb)
    outdb.commit()
    shuffle_db_table(os.path.join(category_dirpath, outdbname))

    categ_products_db.close()
    categ_questions_db.close()
    outdb.close()
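
A hypothetical invocation of obtain_category_instances; the directory path, database file names and cardinality bound below are illustrative assumptions, not values taken from the repository:

import os
import sqlite3

category_dirpath = "data/categories/Electronics"  # assumed path
categ_products_db = sqlite3.connect(os.path.join(category_dirpath, "products.db"))    # assumed file name
categ_questions_db = sqlite3.connect(os.path.join(category_dirpath, "questions.db"))  # assumed file name
obtain_category_instances(category_dirpath, categ_products_db,
                          categ_questions_db, max_neg_cardinality=10**5)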
Example no. 5
def write_positive_instances(num_ps_rows, prods_db_c, quests_db_c, outc):
    ###Iterate over the products:
    for rowid in (range(1, num_ps_rows + 1)):
        p_id = MyUtils_dbs.search_in_alltables_db(
            prods_db_c, "SELECT id FROM", "WHERE rowid = " + str(rowid))[0][0]

        ### Get all the Qs asked for the selected P; they will always be part of the dataset, since there are so few Ps
        q_ids_results = MyUtils_dbs.search_in_alltables_db(
            quests_db_c, "SELECT id FROM",
            "WHERE id LIKE '" + str(p_id) + "%'")
        q_ids_ls = [tpl[0] for tpl in q_ids_results]
        # filter: the product and its questions must have all the features
        if (product_has_allfeatures(prods_db_c, p_id)
                and allquestions_have_allfeatures(quests_db_c, str(q_ids_ls))):
            insertion_sequence = [(p_id, q_id, 1) for q_id in q_ids_ls]
            outc.executemany("INSERT INTO instances VALUES (?,?,?)",
                             insertion_sequence)
        else:
            logging.info(
                "Product %s excluded from the instances due to not having all the features",
                p_id)
Example no. 6
def generator_of_batches(batch_size, dataset_type):
    if dataset_type == MyUtils_flags.FLAG_TRAIN:
        dataset_length = MyUtils_dbs.get_nn_dataset_length(
            MyUtils_flags.FLAG_TRAIN)
        db_conn = sqlite3.connect(F.NN_TRAIN_INSTANCES_DB)
    elif dataset_type == MyUtils_flags.FLAG_VALID:
        dataset_length = MyUtils_dbs.get_nn_dataset_length(
            MyUtils_flags.FLAG_VALID)
        db_conn = sqlite3.connect(F.NN_VALID_INSTANCES_DB)
    elif dataset_type == MyUtils_flags.FLAG_TEST:
        dataset_length = MyUtils_dbs.get_nn_dataset_length(
            MyUtils_flags.FLAG_TEST)
        db_conn = sqlite3.connect(F.NN_TEST_INSTANCES_DB)
    else:
        raise ValueError("Unknown dataset_type: " + str(dataset_type))

    c = db_conn.cursor()
    num_of_batches = dataset_length // batch_size + 1
    half_mark_offset = dataset_length // 2

    for i in range(0, num_of_batches):
        start_index_pos = i * (batch_size // 2)
        end_index_pos = min((i + 1) * (batch_size // 2), half_mark_offset)
        start_index_neg = half_mark_offset + i * (batch_size // 2)
        end_index_neg = half_mark_offset + min(
            (i + 1) * (batch_size // 2), dataset_length)

        c.execute("SELECT p_id, q_id, x,y FROM instances WHERE rowid IN " +
                  str(tuple(range(start_index_pos, end_index_pos))))
        rows = c.fetchall()
        c.execute("SELECT p_id, q_id, x,y FROM instances WHERE rowid IN " +
                  str(tuple(range(start_index_neg, end_index_neg))))
        rows_neg = c.fetchall()

        rows.extend(rows_neg)
        batch = list(
            map(
                lambda elem: (str(elem[0]), str(elem[1]), json.loads(elem[2]),
                              int(elem[3])), rows))
        yield batch
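
A minimal sketch of how the generator might be consumed; the tuple unpacking mirrors the (p_id, q_id, x, y) batches yielded above, while the batch size and the surrounding loop are illustrative assumptions:

batch_size = 32  # assumed value
for batch in generator_of_batches(batch_size, MyUtils_flags.FLAG_TRAIN):
    # each element is (p_id, q_id, x, y), with x already decoded from its JSON string
    xs = [elem[2] for elem in batch]
    ys = [elem[3] for elem in batch]
    # ... feed xs / ys to the model here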
Example no. 7
def sort_product_candidates(product_tuple, quests_ids, testprods_rep_c, testquests_rep_c):

    distance_list = [] #list of tuples, (p_id, q_id, distance), sorted on tpl[2]

    for quest_id in quests_ids:
        question_representation = MyUtils_dbs.search_in_alltables_db(testquests_rep_c, "SELECT * FROM ",
                                                             "WHERE id = '" + str(quest_id) + "'")[0]
        logging.debug("Question representation: %s", question_representation)
        question_tuple = MyUtils.quest_lstonamedtuple(question_representation, offset=1)
        pq_dist = CD.compute_dist_pq(product_tuple, question_tuple)
        distance_list.append((product_tuple.id, quest_id, pq_dist))

    distance_list_sorted = sorted(distance_list, key=lambda tpl : tpl[2])
    return distance_list_sorted
Example no. 8
def get_instance_encoded_dictionary(prod_id, question_id, ps_db_c, qs_db_c,
                                    d2v_model):

    product_row = MyUtils_dbs.search_in_alltables_db(
        ps_db_c, "SELECT * FROM ", "WHERE id = '" + prod_id + "'")
    question_row = MyUtils_dbs.search_in_alltables_db(
        qs_db_c, "SELECT * FROM ", "WHERE id = '" + str(question_id) + "'")
    prod_tuple = MyUtils.prodls_tonamedtuple(product_row[0])
    q_tuple = MyUtils.quest_lstonamedtuple(question_row[0])

    instance_x = {}
    instance_x["p_descvec"] = MyUtils_strings.fromstring_toarray(
        prod_tuple.descvec)
    instance_x["p_titlevec"] = MyUtils_strings.fromstring_toarray(
        prod_tuple.titlevec)
    instance_x["p_kwsVectors"] = MyUtils_strings.fromlls_toarrays(
        prod_tuple.kwsVectors)
    #logging.debug("instance_x['p_kwsVectors'].shape : %s", np.array(instance_x["p_kwsVectors"]).shape)
    instance_x["p_mdcategories"] = MyUtils_strings.categories_to_vecs_lls(
        MyUtils_strings.fromlls_toarrays(prod_tuple.mdcategories), d2v_model)
    if len(np.array(instance_x["p_mdcategories"]).shape) >= 3:
        logging.debug("instance_x['p_mdcategories'].shape : %s",
                      np.array(instance_x["p_mdcategories"]).shape)
        instance_x["p_mdcategories"] = instance_x["p_mdcategories"][0]

    instance_x["q_questionVec"] = MyUtils_strings.fromstring_toarray(
        q_tuple.questionVec)
    instance_x["q_questionType"] = q_tuple.questionType
    instance_x["q_kwsVectors"] = MyUtils_strings.fromlls_toarrays(
        q_tuple.kwsVectors)

    instance_y = 1 if q_tuple.id[0:10] in prod_id else 0  # positive if the question's ASIN prefix matches the product id
    instance = namedtuple('instance', 'x y')
    inst = instance(x=instance_x, y=instance_y)

    return inst
Example no. 9
def train_NN(learning_rate=0.01, max_epochs=1000, batch_size=32, dropout_rate=0, hiddenlayers_ls=None):

    MyUtils.init_logging("train_NN.log")
    hiddenlayers_ls_str = [str(num_elems) for num_elems in hiddenlayers_ls]
    tensorboard_dir_path = os.path.join(F.TENSORBOARD_ANN_DIR, 'trainingset_' + str(
        MyUtils_dbs.get_nn_dataset_length("train")),
                            "bs_" + str(batch_size),
                            "hls_" + "-".join(hiddenlayers_ls_str),
                            "lr_" + str(learning_rate),
                            "drop_" + str(dropout_rate) + "_eps_" + str(max_epochs))
    if not os.path.exists(tensorboard_dir_path):
        os.makedirs(tensorboard_dir_path)
    MyUtils_filesystem.clean_directory(tensorboard_dir_path)

    tf.reset_default_graph()
    session = tf.Session()

    logging.info("Creating the placeholders for input and labels...")
    (input_placeholder, labels_placeholder) = NN.get_model_placeholders(batch_size)
    placeholders = (input_placeholder, labels_placeholder)

    logging.info("Connecting the loss computation and forward structure...")
    train_loss = NN.nn_loss_computation(logits=NN.nn_inference(input=input_placeholder, layers_hidden_units_ls=hiddenlayers_ls,
                                                               dropout_rate=dropout_rate),
                               labels=labels_placeholder)

    lrate_tensor = tf.placeholder(shape=[], dtype=tf.float32, name="lrate_tensor")


    ####### Defining the optimizer
    if str(learning_rate).lower() == MyUtils_flags.FLAG_ADAM:
        starting_lrate = MyUtils_flags.FLAG_ADAM
        optimizer = tf.train.AdamOptimizer()
    elif str(learning_rate).lower() == MyUtils_flags.FLAG_RMSPROP:
        starting_lrate = MyUtils_flags.FLAG_RMSPROP
        optimizer = tf.train.RMSPropOptimizer(0.001)
    else:
        starting_lrate = learning_rate
        optimizer = tf.train.GradientDescentOptimizer(lrate_tensor)
        if str(learning_rate).lower() == MyUtils_flags.FLAG_CLR:
            _best_lr, min_lr, max_lr = CLR.find_cyclical_lrate_loop(placeholders, batch_size, hiddenlayers_ls,
                                                                    dropout_rate)

    # Summaries, and gathering information:
    train_loss_summary = tf.summary.scalar('Cross-entropy', train_loss)
    predictions = tf.argmax(tf.nn.softmax(logits=NN.nn_inference(input_placeholder, hiddenlayers_ls, dropout_rate)),
                            axis=1, name="predictions")

    tf_metric, tf_metric_update = tf.metrics.accuracy(labels=labels_placeholder, predictions=predictions,
                                                      name="accuracy")

    accuracy_summary = tf.summary.scalar('Accuracy', tf_metric_update)

    logging.info("Defining the optimizer's minimization task on the loss function...")
    minimizer_task = optimizer.minimize(train_loss)

    #Global variables are initialized after the graph structure
    tf.global_variables_initializer().run(session=session)

    #defining the tasks that will be run inside the training loop
    training_tasks = [minimizer_task, train_loss, predictions, tf_metric_update]
    validation_tasks = [tf_metric_update, predictions]
    validation_writing_tasks = [accuracy_summary]
    train_writing_tasks = [train_loss_summary, accuracy_summary]

    tasks_dictionary = {MyUtils_flags.FLAG_TRAIN_TASKS: training_tasks,
                        MyUtils_flags.FLAG_WRITING_TRAIN_TASKS: train_writing_tasks,
                        MyUtils_flags.FLAG_VALIDATION_TASKS: validation_tasks,
                        MyUtils_flags.FLAG_WRITING_VALIDATION_TASKS: validation_writing_tasks}

    #connection to the validation dataset
    valid_db_conn = sqlite3.connect(F.NN_VALID_INSTANCES_DB)
    valid_db_cursor = valid_db_conn.cursor()

    if str(learning_rate).lower() == MyUtils_flags.FLAG_CLR:
        CLR.training_loop_clr(tasks_dictionary, placeholders, batch_size,
                                  max_epochs, min_lr, max_lr, valid_db_cursor, tensorboard_dir_path)
    else:
        training_loop(tasks_dictionary, placeholders, starting_lrate,
                      batch_size, max_epochs, valid_db_cursor, tensorboard_dir_path, session)
Example no. 10
def define_negative_examples(doc2vec_model, dataset_typeflag):
    MyUtils.init_logging("NN_Dataset_Instances-define_negative_examples.log",
                         logging.INFO)

    f = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "w")
    f.close()
    prodsnegativeqs_outfile = open(F.PRODS_WITH_NOTASKEDQUESTS_IDS, "a")
    prodsnegativeqs_outfile.write("id_questionsNotAsked\n")

    ### Connect with the database to read from: candidate negative examples
    db_conn = sqlite3.connect(F.CANDIDATE_NEGQS_DB)
    c = db_conn.cursor()

    ### If we are creating the training dataset,
    ### then before allowing a question Q asked for P2 to be a negative example for P1,
    ### we check that the similarity between P1 and P2 is not too high
    if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:

        ### Determining the maximum allowed similarity between products. Creates the similarity db if it does not exist
        if os.path.exists(F.SIMILARITY_PRODUCTS_DB):
            p_sim_breakpoint = ES.get_products_similarity_breakpoint(
                fraction=0.97)
        else:
            p_sim_breakpoint = ES.explore_products_similarity(N=500,
                                                              fraction=0.97)

        ### Connect with the databases of product and questions representations, to be able to pick the products P1 and P2
        product_reps_dbconn = sqlite3.connect(F.PRODUCTS_FINAL_TRAIN_DB)
        product_reps_c = product_reps_dbconn.cursor()

    segment_size = 10**4
    for input_segment in pd.read_csv(F.PRODSWITHQUESTS_IDS,
                                     sep="_",
                                     chunksize=segment_size):
        for id_askedqs_t in input_segment.itertuples():
            prod_id = id_askedqs_t.id
            #logging.debug("Reading from F.PRODSWITHQUESTS_IDS, the product.id is: %s", prod_id)
            asked_qs = ast.literal_eval(id_askedqs_t.questionsAsked)
            t = (prod_id, )
            c.execute('SELECT * FROM prodnegatives WHERE prod_id=?', t)
            row = c.fetchone()
            if row is None:  #i.e. if the product in the file PRODSWITHQUESTS_IDS was excluded from the previous random subsampling
                continue
            candidatenegativeqs_rawstring = row[1]
            candidatenegativeqs_string = "[" + candidatenegativeqs_rawstring[:-1] + "]"

            candidatenegativeqs_ls = ast.literal_eval(
                candidatenegativeqs_string)
            candidatenegativeqs_ls1 = [
                q_id for q_id in candidatenegativeqs_ls if q_id not in asked_qs
            ]

            if dataset_typeflag == MyUtils_flags.FLAG_TRAIN:
                p1_row = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext=" WHERE id='" + str(prod_id) + "'")[0]
                candidatenegativeqs_asins = list(
                    map(lambda q_id: q_id[0:10], candidatenegativeqs_ls1))

                p2_rows = MyUtils_dbs.search_in_alltables_db(
                    dbcursor=product_reps_c,
                    query_pretext="SELECT * FROM",
                    query_aftertext="WHERE id IN " +
                    str(tuple(candidatenegativeqs_asins)))
                qids_and_p2rows = list(zip(candidatenegativeqs_ls1, p2_rows))

                for q_id, p2_row in qids_and_p2rows:
                    #logging.debug("p1_row : %s", p1_row)
                    if p2_row is not None and len(p2_row) > 0:
                        #there are questions without corresponding products, in which case no similarity check is to be done

                        p1_tuple = MyUtils.prodls_tonamedtuple(p1_row)  #[1:]?
                        p2_tuple = MyUtils.prodls_tonamedtuple(p2_row)
                        p1_p2_sim, _simparts = PS.compute_2products_similarity_singleprocess(
                            prod1_tuple=p1_tuple,
                            prod2_tuple=p2_tuple,
                            d2v_model=doc2vec_model)
                        if p1_p2_sim > p_sim_breakpoint:
                            candidatenegativeqs_ls1.remove(q_id)
                            logging.info(
                                "Removing question from the candidate negative examples, "
                                +
                                "because the similarity between %s and %s is > %s",
                                prod_id, p2_tuple.id, p_sim_breakpoint)
                logging.info(
                    "Choosing negative examples: P-to-p similarity checks done for product: %s",
                    prod_id)

            random_indices = sorted(
                np.random.choice(a=range(len(candidatenegativeqs_ls1)),
                                 size=min(len(candidatenegativeqs_ls1),
                                          len(asked_qs)),
                                 replace=False,
                                 p=None))
            #logging.info(candidatenegativeqs_ls1)
            negativeqs_ls = [
                candidatenegativeqs_ls1[i] for i in random_indices
            ]
            #logging.info(negativeqs_ls)
            prodsnegativeqs_outfile.write(prod_id + "_" + str(negativeqs_ls) +
                                          "\n")

    prodsnegativeqs_outfile.close()
Example no. 11
def get_num_training_iterations(batch_size):
    trainset_length = MyUtils_dbs.get_nn_dataset_length(
        MyUtils_flags.FLAG_TRAIN)
    max_iter = trainset_length // batch_size  # in 1 epoch, you cannot have more iterations than batches
    logging.info("Number of iterations per epoch: %s", max_iter)
    return max_iter
Example no. 12
def run_hedge(actions_ls=None,
              eta=None,
              max_T=None,
              balanced_instances=True,
              restart_candidates=True):
    MyUtils.init_logging("Hedge-run_hedge.log")

    ### initialization: either we use the single global balanced dataset, or the imbalanced category datasets
    if balanced_instances:
        dbs_paths = [(F.ONLINE_INSTANCEIDS_GLOBAL_DB,
                      F.PRODUCTS_FINAL_TRAIN_DB, F.QUESTIONS_FINAL_TRAIN_DB)]
    else:
        category_dirpaths = MyUtils_filesystem.get_category_dirpaths()
        dbs_paths = []  # list of 3-element tuples: (instances_db, products_db, questions_db)
        for c_dir_path in category_dirpaths:
            for fname in os.listdir(c_dir_path):
                if "db" in fname:
                    if MyUtils_flags.FLAG_INSTANCEIDS in fname:
                        categ_instances_dbpath = os.path.join(
                            c_dir_path, fname)
                    elif MyUtils_flags.FLAG_PRODUCTS in fname:
                        categ_prods_dbpath = os.path.join(c_dir_path, fname)
                    else:
                        categ_qs_dbpath = os.path.join(c_dir_path, fname)
            dbs_paths.append(
                (categ_instances_dbpath, categ_prods_dbpath, categ_qs_dbpath))

    ### connecting with the database containing the candidates
    if balanced_instances:
        output_candidates_dbpath = F.CANDIDATES_ONLINE_BALANCED_DB
    else:
        output_candidates_dbpath = F.CANDIDATES_ONLINE_UNBALANCED_DB

    if restart_candidates:
        f = open(output_candidates_dbpath, "w")
        f.close()
    output_candidates_db = sqlite3.connect(output_candidates_dbpath)
    output_candidates_c = output_candidates_db.cursor()
    if restart_candidates:
        output_candidates_c.execute("""CREATE TABLE candidates (
                                        p_id varchar(63),
                                        q_id varchar(63)   )""")

    #For each dataset: connect to databases of instances, Ps, and Qs
    for (instances_dbpath, prods_dbpath, quests_dbpath) in dbs_paths:
        instances_db = sqlite3.connect(instances_dbpath)
        instances_ids_c = instances_db.cursor()
        prods_db = sqlite3.connect(prods_dbpath)
        ps_c = prods_db.cursor()
        quests_db = sqlite3.connect(quests_dbpath)
        qs_c = quests_db.cursor()

        chosen_dataset_name = os.path.basename(
            os.path.dirname(instances_dbpath))
        logging.info("Online Learning: operating on dataset: %s",
                     chosen_dataset_name)

        #### define the number of rounds
        if max_T is None:
            max_T = MyUtils_dbs.get_tot_num_rows_db(instances_ids_c)
            logging.info(
                "Total number of rounds (i.e. instances in the training set): %s",
                max_T)

        #### define the actions
        if actions_ls is None:
            if not balanced_instances:
                actions_ls = ACD.get_actionsforcategories()
            else:
                actions_ls = AGD.get_actionsforbalanced()

        #### define the "learning rate"
        if eta is None:
            eta = np.sqrt((2 * np.log(len(actions_ls))) / max_T)

        #### output directory for Tensorboard logging
        results_dirpath = os.path.join(
            'OnlineLearning', 'Experiments_results', str(chosen_dataset_name),
            'numactions_' + str(len(actions_ls)), 'instances_' +
            str(max_T))  #datetime.datetime.today().strftime('%Y-%m-%d')
        if not os.path.exists(results_dirpath):
            os.makedirs(results_dirpath)
            MyUtils_filesystem.clean_directory(results_dirpath)
            #### the actual core of the algorithm
            hedge_loop(eta, max_T, actions_ls, instances_ids_c, ps_c, qs_c,
                       output_candidates_db, balanced_instances,
                       results_dirpath)
        else:
            logging.info("Online Learning results already computed for : %s",
                         results_dirpath)
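
hedge_loop itself is not part of this excerpt. For context, the eta computed above is the standard Hedge learning rate, sqrt(2*ln(N)/T) for N actions over T rounds, and a typical multiplicative-weights update it parameterizes looks like the sketch below (a generic illustration, not the repository's implementation):

import numpy as np

def hedge_update(weights, losses, eta):
    # Standard Hedge / multiplicative-weights step: each action's weight is
    # shrunk exponentially in proportion to the loss it just incurred.
    new_weights = weights * np.exp(-eta * np.asarray(losses))
    return new_weights / new_weights.sum()  # renormalize to a probability distribution

# Illustrative round (placeholder values):
# weights = np.ones(len(actions_ls)) / len(actions_ls)
# ... play an action drawn from `weights`, observe the per-action losses ...
# weights = hedge_update(weights, observed_losses, eta)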
Example no. 13
def attach_text_to_candidates(ranked_candidates_dbpath, prods_initial_dbpath, quests_initial_dbpath, prod_reps_dbpath, quest_reps_dbpath, final_outdb_path):
    MyUtils.init_logging("Attach_text_to_candidates.log")

    candidates_nn_db = sqlite3.connect(ranked_candidates_dbpath)
    cands_db_c = candidates_nn_db.cursor()
    f = open(F.RANKING_TEMP_DB, 'w'); f.close()
    temp_db = sqlite3.connect(F.RANKING_TEMP_DB)
    temp_db_c = temp_db.cursor()
    testprods_initial_c = sqlite3.connect(prods_initial_dbpath).cursor()
    testquests_initial_c = sqlite3.connect(quests_initial_dbpath).cursor()
    testprods_rep_c = sqlite3.connect(prod_reps_dbpath).cursor()
    testquests_rep_c = sqlite3.connect(quest_reps_dbpath).cursor()

    temp_db_c.execute('''CREATE TABLE candidates(  p_id varchar(63),
                                                   q_id varchar(63),
                                                   distance int,
                                                   p_titletext varchar(1023),
                                                   p_descriptiontext varchar(8191),
                                                   p_categorytext varchar (4095),
                                                   q_text varchar (8191)         
                                            )''')

    num_of_candidates = MyUtils_dbs.get_tot_num_rows_db(cands_db_c)
    logging.info(num_of_candidates)
    counter_questionsameid = 0
    last_prod_id = 'x'

    for rowindex in range(1, num_of_candidates + 1):
        row = cands_db_c.execute("SELECT * FROM candidates WHERE rowid = ?", (rowindex,)).fetchone()
        #logging.info("info: %s", row)
        prod_id = row[0]
        quest_id = row[1]
        distance = row[2]

        if last_prod_id != prod_id:
            product_titleinfo,product_descinfo, product_categinfo = \
                MyUtils_dbs.search_in_alltables_db(testprods_initial_c, "SELECT title, description, categories FROM",
                                                                  "WHERE asin = '" + str(prod_id) + "'")[0]
            product_representation = MyUtils_dbs.search_in_alltables_db(testprods_rep_c, "SELECT * FROM ",
                                                                        "WHERE id = '" + str(prod_id) + "'")[0]
            prod_tpl = MyUtils.prodls_tonamedtuple(product_representation, offset=1)

            counter_questionsameid = 0

        ###get question's unixTime
        if len(quest_id) < 21:  # format: @nan0 (no valid unixTime in the id)
            base_endpoint = 14
            question_unixTime = str(quest_id[11:base_endpoint])
        else:
            base_endpoint = 23
            question_unixTime = str(quest_id[11:base_endpoint])
        logging.debug("Question unixTime: %s", question_unixTime)

        if base_endpoint == 23: #if we have a valid unixTime specification

            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c, "SELECT question FROM",
                                                                  "WHERE asin = '" + str(quest_id[0:10]) + "'"
                                                              + " AND unixTime LIKE '" + question_unixTime + "%'")
        else: #if we have NULL in the unixTime field
            possible_questions_text = MyUtils_dbs.search_in_alltables_db(testquests_initial_c, "SELECT question FROM",
                                                                         "WHERE asin = '" + str(quest_id[0:10]) + "'"
                                                                         + " AND unixTime IS NULL")
        base_q_id = str(quest_id[0:23])
        possible_questions_reps = MyUtils_dbs.search_in_alltables_db(testquests_rep_c, "SELECT * FROM ",
                                                             "WHERE id LIKE '" + str(base_q_id) + "%'")
        logging.debug("possible_questions_reps: %s", possible_questions_reps)
        logging.debug("possible_questions_text:%s", possible_questions_text)

        if len(possible_questions_text) > 1:
            possible_questions_tuples = list(map ( lambda q_ls : MyUtils.quest_lstonamedtuple(q_ls, offset=1), possible_questions_reps))
            possible_questions_distances = list(map (lambda q_tpl : CD.compute_dist_pq(prod_tpl, q_tpl) , possible_questions_tuples))

            qs_dist_lts = list(zip(possible_questions_tuples, possible_questions_distances))
            qs_dist_lts_sorted = sorted( qs_dist_lts, key=lambda tpl : tpl[1])
            #logging.info("sorted question tuples: %s", qs_dist_lts_sorted)
            question_textinfo = possible_questions_text[counter_questionsameid][0]
            counter_questionsameid = counter_questionsameid + 1
        else:
            question_textinfo = possible_questions_text[0][0]
        logging.debug("question_textinfo: %s", question_textinfo)

        temp_db_c.execute("INSERT INTO candidates VALUES (?,?,?,?,?,?,?)", (prod_id, quest_id, distance,
                                                                          product_titleinfo, product_descinfo, product_categinfo, question_textinfo))
        logging.debug("***")

    temp_db.commit()
    temp_db.close()
    os.rename(F.RANKING_TEMP_DB, final_outdb_path)
Example no. 14
def find_cyclical_lrate_loop(placeholders,
                             batch_size,
                             hiddenlayers_ls,
                             drop_rate,
                             lrate_start=10**(-7),
                             lrate_end=0.2):
    trainset_length = MyUtils_dbs.get_nn_dataset_length(
        MyUtils_flags.FLAG_TRAIN)
    max_iter = trainset_length // batch_size  # in 1 epoch, you cannot have more iterations than batches

    hiddenlayers_ls_str = [str(num_elems) for num_elems in hiddenlayers_ls]
    tensorboard_dir_path = os.path.join(
        F.TENSORBOARD_ANN_DIR,
        'trainingset_' + str(MyUtils_dbs.get_nn_dataset_length("train")),
        "bs_" + str(batch_size), "hls_" + "-".join(hiddenlayers_ls_str),
        "lr_clr", "drop_" + str(drop_rate) + "_explore")
    if not os.path.exists(tensorboard_dir_path):
        os.makedirs(tensorboard_dir_path)

    # separate session for trying to find the optimal l.r. for the Cyclical Learning Rate
    session = tf.Session()
    logging.info("*** Session: Cyclical Learning Rate")
    (input_placeholder, labels_placeholder) = placeholders

    train_loss = NN.nn_loss_computation(logits=NN.nn_inference(
        input=input_placeholder,
        layers_hidden_units_ls=hiddenlayers_ls,
        dropout_rate=drop_rate),
                                        labels=labels_placeholder)
    #train_loss_summary = tf.summary.scalar('Cross-entropy', train_loss)

    predictions = tf.argmax(tf.nn.softmax(
        logits=NN.nn_inference(input_placeholder, hiddenlayers_ls, drop_rate)),
                            axis=1)
    #tf_metric, tf_metric_update = tf.metrics.accuracy(labels=labels_placeholder, predictions=predictions,
    #                                                  name="CLR_train_accuracy")
    running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                     scope="CLR_train_accuracy")
    #train_accuracy_summary = tf.summary.scalar('Accuracy', tf_metric_update)

    lrate_tensor = tf.placeholder(shape=[],
                                  dtype=tf.float32,
                                  name="lrate_tensor")
    optimizer = tf.train.GradientDescentOptimizer(lrate_tensor)
    minimizer_task = optimizer.minimize(train_loss)

    # Global variables are initialized after the graph structure
    tf.global_variables_initializer().run(session=session)

    running_vars_initializer = tf.variables_initializer(var_list=running_vars)
    session.run(running_vars_initializer)

    logging.info(
        "Number of iterations per epoch (and linear steps in the search for the learning rate): %s",
        max_iter)
    lrate_increase = (lrate_end - lrate_start) / max_iter
    logging.info(
        "Step increase of the learning rate in the exploration epochs: %s",
        round(lrate_increase, 7))
    trial_epochs = 5
    loss_matrix = np.zeros((trial_epochs, max_iter))
    accuracy_matrix = np.zeros((trial_epochs, max_iter))
    for i in range(1, trial_epochs + 1):
        start_epoch_time = time()
        logging.info(
            "Search for the base learning rate; Starting training epoch n. %s",
            i)
        batch_generator = NN.generator_of_batches(batch_size,
                                                  MyUtils_flags.FLAG_TRAIN)

        # Train, in the current epoch
        for j in range(0, max_iter):
            session.run(running_vars_initializer
                        )  #new batch: re-initializing the accuracy computation
            batch = next(batch_generator)
            current_iteration_feed_dict = NN.fill_feed_dict(
                batch, input_placeholder, labels_placeholder)
            learning_rate = lrate_start + lrate_increase * j
            current_iteration_feed_dict.update({lrate_tensor: learning_rate})

            if j % max(1, max_iter // 20) == 0:  # guard against very small training sets
                logging.info("Iteration: %s on %s .", j, max_iter)
            _, current_loss, b_predictions, b_labels = session.run(
                [minimizer_task, train_loss, predictions, labels_placeholder],
                feed_dict=current_iteration_feed_dict)
            loss_matrix[i - 1][j] = current_loss
            accuracy_matrix[i - 1][j] = get_batch_accuracy(
                b_predictions, b_labels)

        end_epoch_time = time()
        logging.info(
            "Searching for the base values for the cyclical learning rate. " +
            "Training on epoch %s executed. Time elapsed: %s", i,
            round(end_epoch_time - start_epoch_time, 3))

    best_lr, min_lr, max_lr = pick_lr_boundaries(loss_matrix, lrate_start,
                                                 lrate_increase)
    session.close()

    #write the lr to a logfile
    lrfile = open(os.path.join(tensorboard_dir_path, "found_lr.log"), "w")
    lrfile.write(
        "Cyclical Learning rate: applying the LR test on " +
        str(trial_epochs) + " epochs;\n" +
        "the average learning rate granting the steepest descent of the loss function is: " +
        str(best_lr))
    lrfile.close()

    return best_lr, min_lr, max_lr
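
Once min_lr and max_lr have been picked, the training-time schedule (handled in CLR.training_loop_clr, not shown here) would typically follow the triangular policy from Smith's Cyclical Learning Rates paper. A minimal sketch of that schedule, under the assumption that the triangular variant is used:

import numpy as np

def triangular_clr(iteration, min_lr, max_lr, stepsize):
    # Triangular cyclical learning rate (Smith, 2017): the rate ramps linearly
    # from min_lr up to max_lr and back down once every 2 * stepsize iterations.
    cycle = np.floor(1 + iteration / (2 * stepsize))
    x = np.abs(iteration / stepsize - 2 * cycle + 1)
    return min_lr + (max_lr - min_lr) * max(0.0, 1.0 - x)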