def decode_query_binary(query_embedding, threshold=0.5, num_words=None):
    tokens_missing = False  # set to True if at least one decoded index is missing from the vocabulary
    decoded = bae_decoder.predict(query_embedding)
    query_terms = []
    indices = []
    if num_words is None:
        # threshold mode: keep every activation above the threshold
        decoded = normalize_to_01_range(decoded)  # normalize to [0, 1] range
        _, indices = np.where(decoded > threshold)
    elif num_words:
        # top-k mode: keep the num_words strongest activations
        indices = decoded[0].argsort()[-num_words:][::1]
    if emode == "index":
        # construct bin vector
        bin_code_vec = [0] * len(decoded[0])
        for idx in indices:
            bin_code_vec[idx] = 1
        # construct int vector with indices
        indices = DECODE(bin_code_vec)
    # reverse indices into words
    for index in indices:
        if index == 0:  # reserved for empty buckets
            continue
        if index in inv_vocab:
            term = inv_vocab[index]
            query_terms.append(term)
        else:
            # print("Warn: index %s not found", str(index))
            tokens_missing = True
    reconstructed_query = " ".join(query_terms)
    return decoded, reconstructed_query, tokens_missing
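
# normalize_to_01_range is defined elsewhere in the codebase; from its uses in this file
# it appears to rescale a prediction vector min-max into [0, 1] before thresholding.
# The helper below (normalize_to_01_range_sketch) is only an illustrative guess under
# that assumption, not the project's actual implementation.
def normalize_to_01_range_sketch(x):
    x = np.asarray(x, dtype='float32')
    span = x.max() - x.min()
    if span == 0:
        return np.zeros_like(x)
    return (x - x.min()) / span
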
def ask(query1, query2, threshold=0.5):
    query1_vec = vectorizer.get_vector_for_tuple(query1)
    query2_vec = vectorizer.get_vector_for_tuple(query2)
    q1_fab = normalizeFVector.normalize_function(query1_vec)[0]
    q2_fab = normalizeFVector.normalize_function(query2_vec)[0]
    # fqa_model is the trained model object (not the prediction function itself)
    answer_coded = fqa.predict_f(fqa_model, q1_fab, q2_fab)
    # binarize the output: normalize to [0, 1] and threshold
    answer_coded_norm = normalize_to_01_range(answer_coded)
    indices = np.where(answer_coded_norm > threshold)
    answer_coded.fill(0)
    answer_coded[indices] = 1
    # make sure these are ints
    answer_coded = np.asarray(answer_coded, dtype='int32')[0]
    # decode the now-binary vector into index integers
    index_indices = DECODE(answer_coded)
    answer_tokens = []
    for idx in index_indices:
        if idx == 0:  # 0 is reserved for empty buckets
            continue
        answer_tokens.append(inv_vocab[idx])
    answer = ' '.join(answer_tokens)
    return answer
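
# Hypothetical usage sketch for ask() (the example arguments are illustrative only):
#   answer = ask("some entity tuple", "another entity tuple", threshold=0.5)
#   print(answer)
# A lower threshold keeps more bits set in the predicted code, which typically yields
# more (but noisier) answer tokens after DECODE.
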
def get_tokens_from_bin_vector(x):
    tokens = set()
    indices = DECODE(x)
    for index in indices:
        if index == 0:  # reserved for empty buckets
            continue
        term = fabric_api.inv_vocab[index]
        tokens.add(term)
    return tokens
def get_query_from_code(code):
    words = set()
    int_code = DECODE(code)
    for el in int_code:
        reconstructed_idx = el
        if reconstructed_idx != 0:
            if reconstructed_idx in inv_vocab_index:
                word = inv_vocab_index[reconstructed_idx]
                words.add(word)
    return words
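
# CODE and DECODE are defined elsewhere in the codebase. Based on how they are called
# here (e.g. declare_model(32) for single-integer codes, and code_dim = code_dim_int * 32
# in test_index_based_coding), they appear to map each integer to a fixed-width 32-bit
# binary vector and back. The two helpers below are an illustrative approximation under
# that assumption (bit order is a guess), not the project's actual implementation.
def code_sketch(int_vector, width=32):
    # Concatenate the `width`-bit binary representation of each integer.
    bits = []
    for value in int_vector:
        bits.extend(int(b) for b in format(value, '0{}b'.format(width)))
    return bits


def decode_sketch(bin_vector, width=32):
    # Recover one integer per `width`-bit chunk of the binary vector.
    ints = []
    for i in range(0, len(bin_vector), width):
        chunk = bin_vector[i:i + width]
        ints.append(int(''.join(str(b) for b in chunk), 2))
    return ints
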
def decode_similar_query(query_embedding, num_output):
    decoded = bae_decoder.predict(query_embedding)
    # decode at several percentile cutoffs, from most to least selective
    percentiles = [99, 98, 97, 96, 95, 93, 90]
    list_of_indices = []
    for p in percentiles:
        cutoff = np.percentile(decoded, p)
        _, indices = np.where(decoded > cutoff)
        list_of_indices.append(indices)
    recons_queries = []
    for indices in list_of_indices:
        query_terms = []
        if emode == "index":
            # construct bin vector
            bin_code_vec = [0] * len(decoded[0])
            for idx in indices:
                bin_code_vec[idx] = 1
            # construct int vector with indices
            indices = DECODE(bin_code_vec)
        # reverse indices into words
        for index in indices:
            if index == 0:  # reserved for empty buckets
                continue
            try:
                term = inv_vocab[index]
            except KeyError:
                # FIXME: invalid term, just skip for now -> optimistic
                continue
            query_terms.append(term)
        reconstructed_query = " ".join(query_terms)
        recons_queries.append(reconstructed_query)
    return recons_queries
def decode_query(query_embedding, threshold=0.5, num_words=None):
    decoded = decoder.predict(query_embedding)
    query_terms = []
    indices = []
    if num_words is None:
        # threshold mode: keep every activation above the threshold
        decoded = normalize_to_01_range(decoded)  # normalize to [0, 1] range
        _, indices = np.where(decoded > threshold)
    elif num_words:
        # top-k mode: keep the num_words strongest activations
        indices = decoded[0].argsort()[-num_words:][::1]
    if emode == "index":
        # construct bin vector
        bin_code_vec = [0] * len(decoded[0])
        for idx in indices:
            bin_code_vec[idx] = 1
        # construct int vector with indices
        indices = DECODE(bin_code_vec)
    # reverse indices into words
    for index in indices:
        if index == 0:  # reserved for empty buckets
            continue
        term = inv_vocab[index]
        query_terms.append(term)
    reconstructed_query = " ".join(query_terms)
    return decoded, reconstructed_query
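
# Hypothetical usage sketch for decode_query (the embedding name is illustrative only):
#   decoded, q_thr = decode_query(embedding, threshold=0.6)  # keep activations above 0.6
#   decoded, q_top = decode_query(embedding, num_words=5)    # keep the 5 strongest activations
# Passing num_words switches from threshold-based selection to top-k selection.
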
def test_index_based_coding():
    """Check that index-based coding round-trips token sets, then train an autoencoder on the binary codes."""
    vocab = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    vocab_index = dict()
    inv_vocab_index = dict()
    i = 1
    for l in vocab:
        spaced_id = i * 10
        vocab_index[l] = spaced_id
        inv_vocab_index[spaced_id] = l
        i += 1
    print("Vocab size: " + str(len(vocab)))
    code_dim_int = 8
    code_dim = code_dim_int * 32
    words = []
    for combination in itertools.combinations(vocab, 3):
        word = ' '.join(combination)
        words.append(word)
    print("queries from vocab: " + str(len(words)))
    float_embedding_factor = 1
    training_data = []
    for w in words:
        code_vector = np.asarray([0] * code_dim_int, dtype=np.int32)
        tokens = w.split(' ')
        # obtain list of candidate indices for each token
        for t in tokens:
            # idx = hash_this(t) % code_dim_int  # hashing only once this time
            indices = get_hash_indices(t, code_dim_int)
            # print(str(indices))
            for idx in indices:
                if code_vector[idx] == 0:
                    code_vector[idx] = vocab_index[t]  # / float_embedding_factor
                    continue
        set_tokens = set()
        for t in tokens:
            set_tokens.add(t)
        bin_code_vector = CODE(code_vector)
        training_data.append((set_tokens, bin_code_vector))

    # sanity check: how many token sets survive the code/decode round trip unchanged
    hits = 0
    for tokens, vec in training_data:
        vec = DECODE(vec)
        reconstructed_word = set()
        ids = set()
        for el in vec:
            if el != 0:
                ids.add(el)
        for id in ids:
            # id = round(id * float_embedding_factor)
            word = inv_vocab_index[id]
            reconstructed_word.add(word)
        if tokens != reconstructed_word:
            print(str(tokens))
            print(str(reconstructed_word))
        else:
            hits += 1
    ratio_hit = float(hits / len(training_data))
    print("Ratio hit before learning: " + str(ratio_hit))
    # exit()

    from architectures import autoencoder as ae
    model = ae.declare_model(code_dim, 8)  # 16d and 1000e // 8d 3000e
    model = ae.compile_model(model)

    # reshape training data
    training_data = [t for _, t in training_data]
    # X = np.asarray([training_data[0]])
    X = [training_data[0]]
    # print(X)
    for t in training_data[1:]:
        # npt = np.asarray([t])
        X.append(t)
        # X = np.concatenate((npt, X))
        # X[0].append(np.asarray(t))
    model = ae.train_model(model, np.asarray(X), epochs=3000, batch_size=2)
    encoder = ae.encoder
    decoder = ae.decoder

    def get_query_from_code(code):
        words = set()
        int_code = DECODE(code)
        for el in int_code:
            reconstructed_idx = el
            if reconstructed_idx != 0:
                if reconstructed_idx in inv_vocab_index:
                    word = inv_vocab_index[reconstructed_idx]
                    words.add(word)
        return words

    totals = len(training_data)
    hits = 0
    hits_top2 = 0
    for t in training_data:
        input = np.asarray([t])
        number_of_ones = len(np.where(input[0] == 1)[0])
        original = get_query_from_code(input[0])
        # original_hot = input.argmax()
        encoded = encoder.predict(input)
        # print(str(encoded))
        decoded = decoder.predict(encoded)
        indices = decoded[0].argsort()[-number_of_ones:][::1]
        decoded_bin = [0] * code_dim_int * 32
        for idx in indices:
            decoded_bin[idx] = 1
        # avg = np.average(decoded[0])
        # decoded_bin = []
        # for el in decoded[0]:
        #     if el > avg:
        #         decoded_bin.append(1)
        #     else:
        #         decoded_bin.append(0)
        # print("O-decoded: " + str(decoded[0]))
        recon = get_query_from_code(decoded_bin)
        if original == recon:
            hits += 1
        else:
            print("I:" + str(original))
            print("O:" + str(recon))
        # output = decoded[0].argsort()
        # output = output[-2:][::-1]
        # print(str(original_hot) + " -- " + str(output[0]))
        # print(str(original_hot) + " -- " + str(output))
        # if original_hot == output[0]:
        #     hits += 1
        # if original_hot in output:
        #     hits_top2 += 1
    ratio = hits / totals
    ratio_top = hits_top2 / totals
    print("HITS: " + str(ratio))
    print("HITS-top2: " + str(ratio_top))
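
# get_hash_indices is defined elsewhere; the encoding loop above relies on it returning
# several candidate slots per token so that a collision in one slot can fall back to
# another. The function below (hash_indices_sketch) is only an illustrative approximation
# of that idea, not the project's actual implementation.
def hash_indices_sketch(token, dim, k=3):
    import hashlib
    indices = []
    for seed in range(k):
        digest = hashlib.md5((str(seed) + token).encode('utf-8')).hexdigest()
        indices.append(int(digest, 16) % dim)
    return indices
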
def test_random_mapping():
    """Try to memorize a random (x1, x2) -> y mapping over 32-bit binary codes (baseline for test_learn_to_sum)."""
    import random
    # y is drawn independently of x1 and x2, so there is no structure to learn
    training_data = []
    for i in range(1000):
        x1 = [random.randint(0, 50)]
        x2 = [random.randint(0, 50)]
        y = [random.randint(0, 50)]
        training_data.append((x1, x2, y))
        print(str(x1) + " + " + str(x2) + " = " + str(y))
    bin_training_data = []
    for x1, x2, y in training_data:
        x1_bin = CODE(x1)
        x2_bin = CODE(x2)
        y_bin = CODE(y)
        bin_training_data.append((x1_bin, x2_bin, y_bin))
        print(str(x1_bin) + " + " + str(x2_bin) + " = " + str(y_bin))
    model = declare_model(32)
    model = compile_model(model)

    def generator_of_data(training_data, batch_size):
        while 1:
            batch_x1 = []
            batch_x2 = []
            batch_y = []
            for i in range(len(training_data)):
                x1, x2, y = training_data[i]
                batch_x1.append(x1)
                batch_x2.append(x2)
                batch_y.append(y)
                if len(batch_x1) == batch_size:
                    yield [np.asarray(batch_x1), np.asarray(batch_x2)], np.asarray(batch_y)
                    batch_x1.clear()
                    batch_x2.clear()
                    batch_y.clear()

    import time
    stime = time.time()
    model = train_model_incremental(model, generator_of_data(bin_training_data, 2),
                                    steps_per_epoch=500, epochs=100)
    etime = time.time()
    hits = 0
    for x1, x2, y in bin_training_data:
        x1 = np.asarray([x1])
        x2 = np.asarray([x2])
        output = model.predict([x1, x2])
        threshold = 0.4
        normalized_output = normalize_to_01_range(output)
        ones = np.where(normalized_output > threshold)[1]
        bin_code = [0] * 32
        for one in ones:
            bin_code[one] = 1
        gt = DECODE(y)
        result = DECODE(bin_code)
        if gt == result:
            hits += 1
        else:
            print(normalized_output)
            print(str(gt) + " -> " + str(result))
    print("Hit ratio-training: " + str(float(hits / len(bin_training_data))))
    print("Total training time: " + str(etime - stime))
def test_learn_to_sum():
    """Learn to sum: y = x1 + x2, with x2 = x1 + 3, over 32-bit binary codes."""
    import random
    training_data = []
    training_data_idx = []
    for i in range(1000):
        rnd = random.randint(0, 999)
        training_data_idx.append(rnd)
        x1 = [rnd]
        x2 = [rnd + 3]
        y = [2 * rnd + 3]  # addition: x1 + x2
        training_data.append((x1, x2, y))
        print(str(x1) + " + " + str(x2) + " = " + str(y))
    bin_training_data = []
    for x1, x2, y in training_data:
        x1_bin = CODE(x1)
        x2_bin = CODE(x2)
        y_bin = CODE(y)
        bin_training_data.append((x1_bin, x2_bin, y_bin))
        print(str(x1_bin) + " + " + str(x2_bin) + " = " + str(y_bin))
    model = declare_model(32)
    model = compile_model(model)

    def generator_of_data(training_data, batch_size):
        while 1:
            batch_x1 = []
            batch_x2 = []
            batch_y = []
            for i in range(len(training_data)):
                x1, x2, y = training_data[i]
                batch_x1.append(x1)
                batch_x2.append(x2)
                batch_y.append(y)
                if len(batch_x1) == batch_size:
                    yield [np.asarray(batch_x1), np.asarray(batch_x2)], np.asarray(batch_y)
                    batch_x1.clear()
                    batch_x2.clear()
                    batch_y.clear()

    trained_model = train_model_incremental(model, generator_of_data(bin_training_data, 4),
                                            steps_per_epoch=250, epochs=100)
    x1_test = []
    x2_test = []
    y_test = []
    for x1, x2, y in bin_training_data:
        x1_test.append(x1)
        x2_test.append(x2)
        y_test.append(y)
    score = evaluate_model(trained_model, np.asarray(x1_test), np.asarray(x2_test), np.asarray(y_test))
    print("Final score: " + str(score))

    # evaluate on the training samples
    hits = 0
    for x1, x2, y in bin_training_data:
        x1 = np.asarray([x1])
        x2 = np.asarray([x2])
        total_ones_input = len(np.where(x1 == 1)[0]) + len(np.where(x2 == 1)[0])  # only used by the commented top-k variant below
        output = model.predict([x1, x2])
        # avg = np.average(output)
        # std = np.std(output)
        # threshold = avg + std
        threshold = 0.4
        normalized_output = normalize_to_01_range(output)
        # threshold = np.percentile(output, 90)
        ones = np.where(normalized_output > threshold)[1]
        # ones = output[0].argsort()[-total_ones_input:][::1]
        bin_code = [0] * 32
        for one in ones:
            bin_code[one] = 1
        gt = DECODE(y)
        result = DECODE(bin_code)
        if gt == result:
            hits += 1
        else:
            print(normalized_output)
            print(str(gt) + " -> " + str(result))
    print("Hit ratio-training: " + str(float(hits / len(bin_training_data))))

    # evaluate on values never seen during training
    hits = 0
    test_samples = 0
    for i in range(1000):
        if i not in training_data_idx:
            test_samples += 1
            x1 = [i]
            x2 = [i + 3]
            y = [2 * i + 3]
            x1_bin = CODE(x1)
            x2_bin = CODE(x2)
            y_bin = CODE(y)
            output = model.predict([np.asarray([x1_bin]), np.asarray([x2_bin])])
            threshold = 0.4
            normalized_output = normalize_to_01_range(output)
            ones = np.where(normalized_output > threshold)[1]
            bin_code = [0] * 32
            for one in ones:
                bin_code[one] = 1
            gt = DECODE(y_bin)
            result = DECODE(bin_code)
            if gt == result:
                hits += 1
            else:
                print(normalized_output)
                print(str(gt) + " -> " + str(result))
    print("Hit ratio-test: " + str(float(hits / test_samples)))