Example 1
def add_batch_to_database(word_batch, pq_quantization, con, cur, index_config,
                          batch_size, logger):
    values = []
    for i in range(len(pq_quantization)):
        output_vec = utils.serialize_vector(pq_quantization[i][1])
        values.append({
            "coarse_id": str(pq_quantization[i][0]),
            "word": word_batch[i][:100],
            "vector": output_vec
        })
        # flush a full batch, or whatever remains on the last row
        if len(values) == batch_size or i == len(pq_quantization) - 1:
            if USE_BYTEA_TYPE:
                cur.executemany(
                    "INSERT INTO " +
                    index_config.get_value('fine_table_name') +
                    " (coarse_id, word, vector) VALUES (%(coarse_id)s, %(word)s, vec_to_bytea(%(vector)s::int2[]))",
                    tuple(values))
            else:
                cur.executemany(
                    "INSERT INTO " +
                    index_config.get_value('fine_table_name') +
                    " (coarse_id, word, vector) VALUES (%(coarse_id)s, %(word)s, %(vector)s)",
                    tuple(values))
            con.commit()
            values = []
    return
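
The batching idiom above (accumulate parameter dicts, flush them with executemany, commit, reset) recurs throughout these examples. Below is a minimal self-contained sketch of the same pattern; the connection DSN and the demo_vectors table are assumptions, not part of the original code.

import psycopg2

# Standalone sketch of the batch-insert idiom used in every example on this
# page; the DSN and the table name 'demo_vectors' are placeholders.
con = psycopg2.connect("dbname=test")
cur = con.cursor()
rows = [("word%d" % i, "{1,2,3}") for i in range(10)]
batch_size, values = 4, []
for i, (word, vec) in enumerate(rows):
    values.append({"word": word, "vector": vec})
    # flush a full batch, or whatever remains on the last row
    if len(values) == batch_size or i == len(rows) - 1:
        cur.executemany(
            "INSERT INTO demo_vectors (word, vector) VALUES (%(word)s, %(vector)s)",
            values)
        con.commit()
        values = []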
Example 2
def add_to_database(words, cq, codebook, pq_quantization, coarse_counts,
                    fine_counts, con, cur, index_config, batch_size, logger):
    # add codebook
    db_export.add_codebook_to_database(codebook, fine_counts, con, cur,
                                       index_config)

    # add coarse quantization
    db_export.add_cq_to_database(cq, coarse_counts, con, cur, index_config)

    # add fine quantization
    values = []
    for i in range(len(pq_quantization)):
        output_vec = utils.serialize_vector(pq_quantization[i][1])
        values.append({
            "coarse_id": str(pq_quantization[i][0]),
            "word": words[i][:100],
            "vector": output_vec
        })
        # flush a full batch, or whatever remains on the last row
        if len(values) == batch_size or i == len(pq_quantization) - 1:
            if USE_BYTEA_TYPE:
                cur.executemany(
                    "INSERT INTO " +
                    index_config.get_value('fine_table_name') +
                    " (coarse_id, word, vector) VALUES (%(coarse_id)s, %(word)s, vec_to_bytea(%(vector)s::int2[]))",
                    tuple(values))
            else:
                cur.executemany(
                    "INSERT INTO " +
                    index_config.get_value('fine_table_name') +
                    " (coarse_id, word, vector) VALUES (%(coarse_id)s, %(word)s, %(vector)s)",
                    tuple(values))
            con.commit()
            logger.log(Logger.INFO, 'Inserted ' + str(i + 1) + ' vectors')
            values = []
    return
Example 3
def add_codebook_to_database(codebook, counts, con, cur, index_config):
    for pos in range(len(codebook)):
        values = []
        for i in range(len(codebook[pos])):
            output_vec = utils.serialize_vector(codebook[pos][i])
            values.append({"pos": pos, "code": i, "vector": output_vec, "count": counts[(pos, i)]})
        if USE_BYTEA_TYPE:
            cur.executemany("INSERT INTO "+ index_config.get_value("cb_table_name") + " (pos,code,vector, count) VALUES (%(pos)s, %(code)s, vec_to_bytea(%(vector)s::float4[]), %(count)s)", tuple(values))
        else:
            cur.executemany("INSERT INTO "+ index_config.get_value("cb_table_name") + " (pos,code,vector, count) VALUES (%(pos)s, %(code)s, %(vector)s, %(count)s)", tuple(values))
        con.commit()
    return
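
For orientation, the nested loops above assume a two-level codebook layout; a hypothetical illustration (shapes invented for the example):

# Hypothetical illustration of the layout add_codebook_to_database expects:
# codebook[pos][code] is the centroid vector of subquantizer `pos` for code
# `code`, and counts maps (pos, code) to that centroid's usage count.
codebook = [
    [[0.1, 0.2], [0.3, 0.4]],  # pos 0: two centroids of dimension 2
    [[0.5, 0.6], [0.7, 0.8]],  # pos 1
]
counts = {(pos, code): 0 for pos in range(2) for code in range(2)}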
Example 4
def add_cq_to_database(cq, coarse_counts, con, cur, index_config):
    # add coarse quantization
    values = []
    for i in range(len(cq)):
        output_vec = utils.serialize_vector(cq[i])
        count = coarse_counts.get(i, 0)
        values.append({"id": i, "vector": output_vec, "count": count})
    if USE_BYTEA_TYPE:
        cur.executemany(
            "INSERT INTO " + index_config.get_value('coarse_table_name') +
            " (id, vector, count) VALUES (%(id)s, vec_to_bytea(%(vector)s::float4[]), %(count)s)",
            tuple(values))
    else:
        cur.executemany(
            "INSERT INTO " + index_config.get_value('coarse_table_name') +
            " (id, vector, count) VALUES (%(id)s, %(vector)s, %(count)s)",
            tuple(values))
    con.commit()
    return
Example 5
def add_to_database(words, cq, codebook, pq_quantization, coarse_counts,
                    fine_counts, con, cur, index_config, batch_size, logger):
    # add codebook
    db_export.add_codebook_to_database(codebook, fine_counts, con, cur,
                                       index_config)

    # add coarse quantization
    if COARSE_TYPE == 'MULTI_INDEX':
        db_export.add_multi_cq_to_database(cq, coarse_counts, con, cur,
                                           index_config)
    else:
        db_export.add_cq_to_database(cq, coarse_counts, con, cur, index_config)

    # add fine quantization
    values = []
    coarse_id_column = 'coarse_id'
    for i in range(len(pq_quantization)):
        output_vec = utils.serialize_vector(pq_quantization[i][1])
        if COARSE_TYPE == 'MULTI_INDEX':
            coarse_id = str(
                combine_centroids(pq_quantization[i][0],
                                  index_config.get_value('k_coarse')))
        else:
            coarse_id = str(pq_quantization[i][0])
        value_entry = {
            "id": i + 1,
            "vector": output_vec,
            "coarse_id": coarse_id
        }
        values.append(value_entry)
        # flush a full batch, or whatever remains on the last row
        if len(values) == batch_size or i == len(pq_quantization) - 1:
            if USE_BYTEA_TYPE:
                query = "INSERT INTO " + index_config.get_value('fine_table_name') + \
                    " (" + coarse_id_column + ", id, vector) VALUES (" + \
                    "%(coarse_id)s, %(id)s, vec_to_bytea(%(vector)s::int2[]))"
                cur.executemany(query, tuple(values))
            else:
                query = "INSERT INTO " + index_config.get_value('fine_table_name') + \
                    " (" + coarse_id_column + ", id, vector) VALUES (" + \
                    "%(coarse_id)s, %(id)s, %(vector)s)"
                cur.executemany(query, tuple(values))
            con.commit()
            logger.log(Logger.INFO, 'Inserted ' + str(i + 1) + ' vectors')
            values = []
    return
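
The combine_centroids helper is not shown in this example. A plausible sketch, assuming it packs the per-subquantizer centroid ids into one mixed-radix code (the inverse of the divide_code helper in Example 6):

# Hypothetical sketch of combine_centroids, assumed to be the inverse of the
# divide_code helper in Example 6: centroid id i becomes the mixed-radix
# digit of weight units**i.
def combine_centroids(centroid_ids, units):
    return sum(cid * units**i for i, cid in enumerate(centroid_ids))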
Example 6
def add_multi_cq_to_database(cq, coarse_counts, con, cur, index_config):
    BATCH_SIZE = 100
    m = len(cq)
    num_centr = index_config.get_value('k_coarse')

    # add quantizer
    for pos in range(len(cq)):
        values = []
        for i in range(len(cq[pos])):
            output_vec = utils.serialize_vector(cq[pos][i])
            values.append({"pos": pos, "code": i, "vector": output_vec})
        if USE_BYTEA_TYPE:
            cur.executemany(
                "INSERT INTO " + index_config.get_value('coarse_table_name') +
                " (pos,code,vector) VALUES (%(pos)s, %(code)s, vec_to_bytea(%(vector)s::float4[]))",
                tuple(values))
        else:
            cur.executemany(
                "INSERT INTO " + index_config.get_value('coarse_table_name') +
                " (pos,code,vector) VALUES (%(pos)s, %(code)s, %(vector)s)",
                tuple(values))
        con.commit()

    # add counts
    def divide_code(code, units, length):
        # divide a combined code into per-position centroid ids (mixed radix)
        return tuple(int((code // units**i) % units) for i in range(length))
    batch = []
    for code in range(num_centr**m):
        key = divide_code(code, num_centr, m)
        count = coarse_counts.get(key, 0)
        batch.append({"id": code, "count": count})
        if len(batch) == BATCH_SIZE:
            cur.executemany(
                "INSERT INTO " + index_config.get_value('coarse_table_name') +
                "_counts (id, count) VALUES (%(id)s, %(count)s)",
                tuple(batch))
            con.commit()
            batch = []
    if batch:  # flush any remaining counts
        cur.executemany(
            "INSERT INTO " + index_config.get_value('coarse_table_name') +
            "_counts (id, count) VALUES (%(id)s, %(count)s)", tuple(batch))
        con.commit()
    return
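
A quick self-contained check of the divide_code helper, reproduced as a standalone function:

# divide_code splits a combined coarse code into per-position centroid ids,
# reading the code as a mixed-radix number with `length` digits in base
# `units`.
def divide_code(code, units, length):
    return tuple(int((code // units**i) % units) for i in range(length))

# With units=4 and length=2, code 9 yields (9 % 4, (9 // 4) % 4) == (1, 2);
# the combine_centroids sketch from Example 5 restores 9 from those digits.
assert divide_code(9, 4, 2) == (1, 2)
assert 1 * 4**0 + 2 * 4**1 == 9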
Example 7
def add_to_database(words, codebook, pq_quantization, counts, con, cur, index_config, batch_size, logger):
    logger.log(Logger.INFO, 'Length of words: ' + str(len(words)) +
               ' Length of pq_quantization: ' + str(len(pq_quantization)))
    # add codebook
    add_codebook_to_database(codebook, counts, con, cur, index_config)

    # add pq quantization
    values = []
    for i in range(len(pq_quantization)):
        output_vec = utils.serialize_vector(pq_quantization[i])
        values.append({"word": words[i][:100], "vector": output_vec})
        # flush a full batch, or whatever remains on the last row
        if len(values) == batch_size or i == len(pq_quantization) - 1:
            if USE_BYTEA_TYPE:
                cur.executemany(
                    "INSERT INTO " + index_config.get_value("pq_table_name") +
                    " (word, vector) VALUES (%(word)s, vec_to_bytea(%(vector)s::int2[]))",
                    tuple(values))
            else:
                cur.executemany(
                    "INSERT INTO " + index_config.get_value("pq_table_name") +
                    " (word, vector) VALUES (%(word)s, %(vector)s)",
                    tuple(values))
            con.commit()
            logger.log(Logger.INFO, 'Inserted ' + str(i+1) + ' vectors')
            values = []
    return
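
Every example calls utils.serialize_vector, which is not shown here. A minimal sketch, assuming it renders a vector as a PostgreSQL array literal that the ::int2[] and ::float4[] casts in the queries can parse:

# Hypothetical sketch of utils.serialize_vector: format a numeric sequence
# as a PostgreSQL array literal such as '{1,2,3}'; the server-side casts
# (::int2[], ::float4[]) and vec_to_bytea then convert it further.
def serialize_vector(vec):
    return '{' + ','.join(str(x) for x in vec) + '}'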