Example #1
def decode_query_binary(query_embedding, threshold=0.5, num_words=None):
    tokens_missing = False  # set to True if at least one decoded index is missing from the vocabulary
    decoded = bae_decoder.predict(query_embedding)
    query_terms = []
    indices = []
    if num_words is None:  # in this case we use the threshold parameter
        decoded = normalize_to_01_range(decoded)  # normalize to [0,1] range
        _, indices = np.where(decoded > threshold)
    elif num_words:  # otherwise keep the num_words highest activations
        indices = decoded[0].argsort()[-num_words:]  # top activations; order does not matter here
    if emode == "index":
        # construct bin vector
        bin_code_vec = [0] * len(decoded[0])
        for idx in indices:
            bin_code_vec[idx] = 1
        # construct int vector with indices
        indices = DECODE(bin_code_vec)
    # reverse indices into words
    for index in indices:
        if index == 0:  # reserved for empty buckets
            continue
        if index in inv_vocab:
            term = inv_vocab[index]
            query_terms.append(term)
        else:
            # print("Warn: index %s not found" % index)
            tokens_missing = True
    reconstructed_query = " ".join(query_terms)
    return decoded, reconstructed_query, tokens_missing
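All of these examples lean on a normalize_to_01_range helper that is not shown here. A minimal sketch, assuming it simply min-max scales the decoder activations so that a fixed threshold such as 0.5 can binarize them:

import numpy as np

def normalize_to_01_range(x):
    # Assumption: min-max scale into [0, 1]; the real helper may differ.
    x = np.asarray(x, dtype=np.float64)
    span = x.max() - x.min()
    if span == 0:
        return np.zeros_like(x)
    return (x - x.min()) / span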
Example #2
def ask(query1, query2, threshold=0.5):
    query1_vec = vectorizer.get_vector_for_tuple(query1)
    query2_vec = vectorizer.get_vector_for_tuple(query2)

    q1_fab = normalizeFVector.normalize_function(query1_vec)[0]
    q2_fab = normalizeFVector.normalize_function(query2_vec)[0]
    answer_coded = fqa.predict_f(fqa_model, q1_fab, q2_fab)  # fqa_model is the trained model object, not a prediction function

    # binarize the output: threshold the normalized activations
    answer_coded_norm = normalize_to_01_range(answer_coded)
    indices = np.where(answer_coded_norm > threshold)
    answer_coded.fill(0)
    answer_coded[indices] = 1

    # Make sure these are ints
    answer_coded = np.asarray(answer_coded, dtype='int32')[0]

    # decode the binary vector back into integer indices
    index_indices = DECODE(answer_coded)

    answer_tokens = []
    for idx in index_indices:
        if idx == 0:  # 0 is reserved for empty buckets
            continue
        answer_tokens.append(inv_vocab[idx])
    answer = ' '.join(answer_tokens)
    return answer
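CODE and DECODE are also external to these snippets. Judging by Example #7, where a vector of code_dim_int integers becomes a binary vector of code_dim_int * 32 positions, they plausibly convert between integers and their 32-bit binary expansions. A sketch under that assumption:

import numpy as np

def CODE(int_vector):
    # Assumption: expand each integer into its 32-bit binary representation,
    # producing a flat 0/1 vector of length len(int_vector) * 32.
    bits = []
    for value in int_vector:
        bits.extend(int(b) for b in format(value, '032b'))
    return np.asarray(bits, dtype=np.int32)

def DECODE(bin_vector):
    # Assumed inverse: regroup into 32-bit chunks and read each back as an int.
    bits = np.asarray(bin_vector, dtype=np.int32).reshape(-1, 32)
    return [int(''.join(str(b) for b in chunk), 2) for chunk in bits]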
Example #3
def get_tokens_from_bin_vector(x):
    tokens = set()
    indices = DECODE(x)
    for index in indices:
        if index == 0:  # reserved for empty buckets
            continue
        term = fabric_api.inv_vocab[index]
        tokens.add(term)
    return tokens
Example #4
def get_query_from_code(code):
    words = set()
    int_code = DECODE(code)
    for el in int_code:
        reconstructed_idx = el
        if reconstructed_idx != 0:
            if reconstructed_idx in inv_vocab_index:
                word = inv_vocab_index[reconstructed_idx]
                words.add(word)
    return words
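A hypothetical round trip through this helper, assuming the CODE/DECODE sketch above and an inv_vocab_index that maps spaced integer ids back to words:

inv_vocab_index = {10: 'a', 20: 'b'}  # hypothetical vocabulary
code = CODE([10, 20, 0, 0])           # two token ids plus two empty buckets
print(get_query_from_code(code))      # -> {'a', 'b'}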
Example #5
def decode_similar_query(query_embedding, num_output):
    # note: num_output is currently unused; the percentile list below fixes
    # the number of reconstructed queries
    decoded = bae_decoder.predict(query_embedding)

    # sweep percentile cutoffs from most to least selective
    list_of_indices = []
    for percentile in [99, 98, 97, 96, 95, 93, 90]:
        cutoff = np.percentile(decoded, percentile)
        _, indices = np.where(decoded > cutoff)
        list_of_indices.append(indices)

    recons_queries = []

    for indices in list_of_indices:
        query_terms = []
        if emode == "index":
            # construct bin vector
            bin_code_vec = [0] * len(decoded[0])
            for idx in indices:
                bin_code_vec[idx] = 1
            # construct int vector with indices
            indices = DECODE(bin_code_vec)
        # reverse indices into words
        for index in indices:
            if index == 0:  # reserved for empty buckets
                continue
            try:
                term = inv_vocab[index]
            except KeyError:
                # FIXME: invalid index, skip it for now (optimistic)
                continue
            query_terms.append(term)
        reconstructed_query = " ".join(query_terms)
        recons_queries.append(reconstructed_query)
    return recons_queries
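A hypothetical call, assuming a matching bae_encoder (only bae_decoder appears above) and some binary query vector; each returned string is a reconstruction at a progressively looser percentile cutoff:

embedding = bae_encoder.predict(np.asarray([query_bin_vector]))  # encoder and input are hypothetical
for q in decode_similar_query(embedding, num_output=7):
    print(q)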
Example #6
def decode_query(query_embedding, threshold=0.5, num_words=None):
    decoded = decoder.predict(query_embedding)
    query_terms = []
    indices = []
    if num_words is None:  # in this case we use the threshold parameter
        decoded = normalize_to_01_range(decoded)  # normalize to [0,1] range
        _, indices = np.where(decoded > threshold)
    elif num_words:  # otherwise keep the num_words highest activations
        indices = decoded[0].argsort()[-num_words:]  # top activations; order does not matter here
    if emode == "index":
        # construct bin vector
        bin_code_vec = [0] * len(decoded[0])
        for idx in indices:
            bin_code_vec[idx] = 1
        # construct int vector with indices
        indices = DECODE(bin_code_vec)
    # reverse indices into words
    for index in indices:
        if index == 0:  # reserved for empty buckets
            continue
        term = inv_vocab[index]
        query_terms.append(term)
    reconstructed_query = " ".join(query_terms)
    return decoded, reconstructed_query
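For illustration, a hypothetical call (the decoder and the query embedding are assumed to come from the same trained autoencoder):

decoded, reconstructed = decode_query(embedding, threshold=0.5)  # embedding is hypothetical
print(reconstructed)
decoded, reconstructed = decode_query(embedding, num_words=3)    # or keep the top 3 activations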
Example #7
def test_index_based_coding():

    vocab = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    vocab_index = dict()
    inv_vocab_index = dict()
    for i, l in enumerate(vocab, 1):
        spaced_id = i * 10
        vocab_index[l] = spaced_id
        inv_vocab_index[spaced_id] = l
    print("Vocab size: " + str(len(vocab)))

    code_dim_int = 8
    code_dim = code_dim_int * 32

    words = []
    for combination in itertools.combinations(vocab, 3):
        word = ' '.join(combination)
        words.append(word)
    print("queries from vocab: " + str(len(words)))

    training_data = []
    for w in words:
        code_vector = np.asarray([0] * code_dim_int, dtype=np.int32)
        tokens = w.split(' ')
        # obtain candidate bucket indices for each token and write the token
        # id into every free candidate slot
        for t in tokens:
            indices = get_hash_indices(t, code_dim_int)
            for idx in indices:
                if code_vector[idx] == 0:
                    code_vector[idx] = vocab_index[t]
        set_tokens = set(tokens)
        bin_code_vector = CODE(code_vector)
        training_data.append((set_tokens, bin_code_vector))

    hits = 0
    for tokens, vec in training_data:
        reconstructed_word = set()
        ids = {el for el in DECODE(vec) if el != 0}
        for spaced_id in ids:
            reconstructed_word.add(inv_vocab_index[spaced_id])
        if tokens != reconstructed_word:
            print(str(tokens))
            print(str(reconstructed_word))
        else:
            hits += 1
    ratio_hit = float(hits/len(training_data))
    print("Ratio hit before learning: " + str(ratio_hit))

    from architectures import autoencoder as ae

    model = ae.declare_model(code_dim, 8)  # e.g. a 16-dim code with ~1000 epochs, or an 8-dim code with ~3000
    model = ae.compile_model(model)

    # keep only the binary code vectors for training
    training_data = [t for _, t in training_data]
    X = np.asarray(training_data)

    model = ae.train_model(model, X, epochs=3000, batch_size=2)

    encoder = ae.encoder
    decoder = ae.decoder

    def get_query_from_code(code):
        words = set()
        int_code = DECODE(code)
        for el in int_code:
            reconstructed_idx = el
            if reconstructed_idx != 0:
                if reconstructed_idx in inv_vocab_index:
                    word = inv_vocab_index[reconstructed_idx]
                    words.add(word)
        return words

    totals = len(training_data)
    hits = 0
    for t in training_data:
        input_vec = np.asarray([t])
        number_of_ones = len(np.where(input_vec[0] == 1)[0])
        original = get_query_from_code(input_vec[0])
        encoded = encoder.predict(input_vec)
        decoded = decoder.predict(encoded)

        # keep as many top activations as there were ones in the input
        indices = decoded[0].argsort()[-number_of_ones:]
        decoded_bin = [0] * code_dim
        for idx in indices:
            decoded_bin[idx] = 1

        recon = get_query_from_code(decoded_bin)
        if original == recon:
            hits += 1
        else:
            print("I:" + str(original))
            print("O:" + str(recon))
    ratio = hits / totals
    print("HITS: " + str(ratio))
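get_hash_indices is another helper this test assumes but does not define; presumably it derives several candidate bucket positions per token, in the spirit of multi-hash schemes. One way it could look:

import hashlib

def get_hash_indices(token, num_buckets, num_hashes=2):
    # Hypothetical multi-hash helper: salt a stable hash to obtain several
    # candidate bucket indices per token. The real implementation may differ.
    indices = set()
    for salt in range(num_hashes):
        digest = hashlib.md5((str(salt) + token).encode('utf-8')).hexdigest()
        indices.add(int(digest, 16) % num_buckets)
    return list(indices)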
Example #8
def test_random_mapping():
    import random
    # try to learn a random mapping (the target is unrelated to the inputs)
    training_data = []
    for i in range(1000):
        x1 = [random.randint(0, 50)]
        x2 = [random.randint(0, 50)]
        y = [random.randint(0, 50)]
        training_data.append((x1, x2, y))
        print(str(x1) + " + " + str(x2) + " = " + str(y))

    bin_training_data = []
    for x1, x2, y in training_data:
        x1_bin = CODE(x1)
        x2_bin = CODE(x2)
        y_bin = CODE(y)
        bin_training_data.append((x1_bin, x2_bin, y_bin))
        print(str(x1_bin) + " + " + str(x2_bin) + " = " + str(y_bin))

    model = declare_model(32)
    model = compile_model(model)

    def generator_of_data(training_data, batch_size):
        while 1:
            batch_x1 = []
            batch_x2 = []
            batch_y = []
            for x1, x2, y in training_data:
                batch_x1.append(x1)
                batch_x2.append(x2)
                batch_y.append(y)
                if len(batch_x1) == batch_size:
                    yield [np.asarray(batch_x1),
                           np.asarray(batch_x2)], np.asarray(batch_y)
                    batch_x1.clear()
                    batch_x2.clear()
                    batch_y.clear()

    import time
    stime = time.time()
    model = train_model_incremental(model,
                                    generator_of_data(bin_training_data, 2),
                                    steps_per_epoch=500,
                                    epochs=100)
    etime = time.time()

    hits = 0
    for x1, x2, y in bin_training_data:
        x1 = np.asarray([x1])
        x2 = np.asarray([x2])
        output = model.predict([x1, x2])
        threshold = 0.4
        normalized_output = normalize_to_01_range(output)
        ones = np.where(normalized_output > threshold)[1]
        bin_code = [0] * 32
        for one in ones:
            bin_code[one] = 1
        gt = DECODE(y)
        result = DECODE(bin_code)
        if gt == result:
            hits += 1
        else:
            print(normalized_output)
        print(str(gt) + " -> " + str(result))
    print("Hit ratio-training: " + str(float(hits / len(bin_training_data))))
    print("Total training time: " + str(etime - stime))
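declare_model and compile_model come from the surrounding project. A minimal sketch of the two-input, 32-bit-in/32-bit-out architecture these tests appear to assume (the real model may differ):

from keras.layers import Concatenate, Dense, Input
from keras.models import Model

def declare_model(input_dim):
    # Hypothetical architecture: two binary codes in, one binary code out.
    x1 = Input(shape=(input_dim,))
    x2 = Input(shape=(input_dim,))
    h = Dense(64, activation='relu')(Concatenate()([x1, x2]))
    y = Dense(input_dim, activation='sigmoid')(h)
    return Model(inputs=[x1, x2], outputs=y)

def compile_model(model):
    # Binary cross-entropy fits the 0/1 targets produced by CODE.
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model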
Example #9
def test_learn_to_sum():

    import random
    "learning to sum"
    training_data = []
    training_data_idx = []
    for i in range(1000):
        rnd = random.randint(0, 999)
        training_data_idx.append(rnd)
        x1 = [rnd]
        x2 = [rnd + 3]
        y = [2 * rnd + 3]  # addition
        training_data.append((x1, x2, y))
        print(str(x1) + " + " + str(x2) + " = " + str(y))

    bin_training_data = []
    for x1, x2, y in training_data:
        x1_bin = CODE(x1)
        x2_bin = CODE(x2)
        y_bin = CODE(y)
        bin_training_data.append((x1_bin, x2_bin, y_bin))
        print(str(x1_bin) + " + " + str(x2_bin) + " = " + str(y_bin))

    model = declare_model(32)
    model = compile_model(model)

    def generator_of_data(training_data, batch_size):
        while 1:
            batch_x1 = []
            batch_x2 = []
            batch_y = []
            for x1, x2, y in training_data:
                batch_x1.append(x1)
                batch_x2.append(x2)
                batch_y.append(y)
                if len(batch_x1) == batch_size:
                    yield [np.asarray(batch_x1),
                           np.asarray(batch_x2)], np.asarray(batch_y)
                    batch_x1.clear()
                    batch_x2.clear()
                    batch_y.clear()

    trained_model = train_model_incremental(model,
                                            generator_of_data(
                                                bin_training_data, 4),
                                            steps_per_epoch=250,
                                            epochs=100)

    x1_test = []
    x2_test = []
    y_test = []
    for x1, x2, y in bin_training_data:
        x1_test.append(x1)
        x2_test.append(x2)
        y_test.append(y)

    score = evaluate_model(trained_model, np.asarray(x1_test),
                           np.asarray(x2_test), np.asarray(y_test))
    print("Final score: " + str(score))

    hits = 0
    for x1, x2, y in bin_training_data:
        x1 = np.asarray([x1])
        x2 = np.asarray([x2])
        total_ones_input = len(np.where(x1 == 1)[0]) + len(
            np.where(x2 == 1)[0])
        output = model.predict([x1, x2])
        # avg = np.average(output)
        # std = np.std(output)
        # threshold = avg + std
        threshold = 0.4
        normalized_output = normalize_to_01_range(output)
        # threshold = np.percentile(output, 90)
        ones = np.where(normalized_output > threshold)[1]
        # ones = output[0].argsort()[-total_ones_input:][::1]
        bin_code = [0] * 32
        for one in ones:
            bin_code[one] = 1
        gt = DECODE(y)
        result = DECODE(bin_code)
        if gt == result:
            hits += 1
        else:
            print(normalized_output)
        print(str(gt) + " -> " + str(result))
    print("Hit ratio-training: " + str(float(hits / len(bin_training_data))))

    hits = 0
    test_samples = 0
    for i in range(1000):
        if i not in training_data_idx:
            test_samples += 1
            x1 = [i]
            x2 = [i + 3]
            y = [2 * i + 3]
            x1_bin = CODE(x1)
            x2_bin = CODE(x2)
            y_bin = CODE(y)
            output = model.predict(
                [np.asarray([x1_bin]),
                 np.asarray([x2_bin])])
            threshold = 0.4
            normalized_output = normalize_to_01_range(output)
            ones = np.where(normalized_output > threshold)[1]
            bin_code = [0] * 32
            for one in ones:
                bin_code[one] = 1
            gt = DECODE(y_bin)
            result = DECODE(bin_code)
            if gt == result:
                hits += 1
            else:
                print(normalized_output)
            print(str(gt) + " -> " + str(result))
    print("Hit ratio-test: " + str(float(hits / test_samples)))
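train_model_incremental and evaluate_model are likewise assumed. Plausible wrappers over Keras' generator-based training loop, shown only to make the tests self-contained:

def train_model_incremental(model, generator, steps_per_epoch, epochs):
    # Assumed wrapper around Keras' generator training loop.
    model.fit_generator(generator, steps_per_epoch=steps_per_epoch, epochs=epochs)
    return model

def evaluate_model(model, x1, x2, y):
    # Assumed wrapper returning the compiled loss on the given arrays.
    return model.evaluate([x1, x2], y)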