def get_model(self):
    # Load the vocabulary and the train/validation/test pair counts needed
    # to size the network and its batch generators.
    self.word2idx_map = gutils.load_data_pkl(cnt.WORD2IDX_FILE)
    self.vocab_size = len(self.word2idx_map)

    num_train = len(gutils.load_data_pkl(cnt.TRAIN_DATA_PAIRS_FILE))
    num_test = len(gutils.load_data_pkl(cnt.TEST_DATA_PAIRS_FILE))
    num_validation = len(gutils.load_data_pkl(cnt.VALIDATION_DATA_PAIRS_FILE))

    siamese_net = SiameseNet(vocab_size=self.vocab_size,
                             training_samples=num_train,
                             validation_samples=num_validation,
                             testing_samples=num_test,
                             use_generator=True)
    self.model = siamese_net
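# Call pattern used by the validation scripts later in this section: after
# get_model(), the wrapped model is initialized and its saved weights loaded.
#
#   sapi = SiameseAPI()
#   sapi.get_model()
#   sapi.model.init_model()
#   sapi.model.load()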
def get_data_as_vanilla(num_samples, prefix='train'):
    # Load up to num_samples shuffled pairs into memory in one shot and
    # return their word-vector representations along with the labels.
    # The file is opened before the try block so that close() in finally
    # cannot run on an undefined handle if open_file() itself fails.
    sent_tokens_file = tables.open_file(
        os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='r')
    try:
        sent_tokens = sent_tokens_file.root.data

        random.seed(42)
        data_pairs = gutils.load_data_pkl(prefix + "_data_pairs.pkl")
        random.shuffle(data_pairs)

        items1, items2, labels = zip(*data_pairs)
        items1, items2, labels = np.array(items1), np.array(items2), np.array(labels)

        n = min(num_samples, len(data_pairs))
        start, end = 0, n

        tokens1 = [sent_tokens[i] for i in items1[start:end]]
        tokens2 = [sent_tokens[i] for i in items2[start:end]]

        sent_data_1 = gutils.get_wv_siamese(wv_model, tokens1)
        sent_data_2 = gutils.get_wv_siamese(wv_model, tokens2)

        return [sent_data_1, sent_data_2], labels[start:end]
    finally:
        sent_tokens_file.close()
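# Example call (sketch, assuming the test pairs pickle referenced above
# exists): load the first 10000 test pairs into memory at once.
#
#   [sent_data_1, sent_data_2], labels = get_data_as_vanilla(10000, prefix='test')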
def get_data_as_generator(num_samples, prefix='train'):
    # Infinite batch generator over the shuffled pairs; cycles through the
    # data in SIAMESE_BATCH_SIZE chunks. num_samples is accepted for
    # interface parity with get_data_as_vanilla but is unused here.
    # As in get_data_as_vanilla, the file is opened before the try block so
    # the finally clause never closes an undefined handle.
    sent_tokens_file = tables.open_file(
        os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='r')
    try:
        sent_tokens = sent_tokens_file.root.data

        random.seed(42)
        data_pairs = gutils.load_data_pkl(prefix + "_data_pairs.pkl")
        random.shuffle(data_pairs)

        items1, items2, labels = zip(*data_pairs)
        items1, items2, labels = np.array(items1), np.array(items2), np.array(labels)

        n = len(data_pairs)
        num_batches = int(math.ceil(float(n) / cnt.SIAMESE_BATCH_SIZE))
        batch_num = 0

        while True:
            m = batch_num % num_batches
            start, end = m * cnt.SIAMESE_BATCH_SIZE, min((m + 1) * cnt.SIAMESE_BATCH_SIZE, n)

            tokens1 = [sent_tokens[i] for i in items1[start:end]]
            tokens2 = [sent_tokens[i] for i in items2[start:end]]

            sent_data_1 = gutils.get_wv_siamese(wv_model, tokens1)
            sent_data_2 = gutils.get_wv_siamese(wv_model, tokens2)

            batch_num += 1
            yield [sent_data_1, sent_data_2], labels[start:end]
    finally:
        # Runs when the generator is exhausted, closed, or garbage-collected.
        sent_tokens_file.close()
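# Usage sketch (assumption: SiameseNet wraps a Keras-style model exposing
# fit_generator; the attribute names here are illustrative, not confirmed by
# this code):
#
#   train_gen = get_data_as_generator(None, prefix='train')
#   n_train = len(gutils.load_data_pkl('train_data_pairs.pkl'))
#   steps = int(math.ceil(float(n_train) / cnt.SIAMESE_BATCH_SIZE))
#   model.fit_generator(train_gen, steps_per_epoch=steps, epochs=10)
#
# Because the generator loops forever (while True), steps_per_epoch must
# equal num_batches so that each epoch covers every batch exactly once.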
import siamese_api
from siamese_api import SiameseAPI
import data_generator as dg
import pandas as pd, random, pickle, numpy as np, tables, sys, time, collections, math, os
from sklearn.neighbors import KDTree
import grouping_utils as gutils
import constants as cnt
from sklearn.metrics.pairwise import euclidean_distances

try:
    validation_set = pd.read_csv("data/grouping_validation_set.csv")
    items = gutils.load_data_pkl('items_13.pkl')

    # Map each item id to its index in the items list.
    item_id_index_map = {items[i][0]: i for i in range(len(items))}

    # Group item indices by product type (items[i][1]).
    pt_indices = collections.defaultdict(list)
    for i in range(len(items)):
        pt = items[i][1]
        pt_indices[pt].append(i)

    base_item_ids = list(validation_set['Item Id'])
    grouped_item_ids = list(validation_set['Items to be grouped'])

    # Coverage checks: how many validation ids are present in the loaded items.
    print(len(set(base_item_ids)))
    print(len(set(grouped_item_ids)))
    print(len(set(base_item_ids).intersection(item_id_index_map.keys())))
    print(len(set(grouped_item_ids).intersection(item_id_index_map.keys())))
# print("Constructing KD-Tree...") # vectors = sapi.fetch_embeddings_pytables() # gutils.construct_kd_tree(vectors, save_file=cnt.SIAMESE_KD_TREE_FILE) # print("Benchmarking suggested group API...") # print(gutils.benchmark_kdtree(num_samples=1000)) # print("Getting all items...") # items_4200 = gutils.load_data_pkl('items.pkl') # items_13 = gutils.load_data_pkl('items_13.pkl') # all_items = items_4200 + items_13 # gutils.save_data_pkl(all_items, 'all_items.pkl') print("Computing embeddings...") items_13 = gutils.load_data_pkl('items_13.pkl') print(len(items_13)) # try: # sapi = SiameseAPI() # sapi.get_model() # sapi.model.init_model() # sapi.model.load() # embeds_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, 'embeddings_13.h5'), mode='w') # atom = tables.Float32Atom() # embeds_arr = embeds_file.create_earray(embeds_file.root, 'data', atom, (0, cnt.SIAMESE_EMBEDDING_SIZE)) # n, batch_size = len(items_13), cnt.PYTABLES_INSERT_BATCH_SIZE # num_batches = int(math.ceil(float(n)/batch_size))
import grouping_utils as gutils
import tables, collections, os
import numpy as np, math, random
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Pool
from gensim.models import Word2Vec
import constants as cnt

# Load shared module-level resources if they have already been generated.
if os.path.exists(os.path.join(cnt.DATA_FOLDER, cnt.ITEMS_FILE)):
    items = gutils.load_data_pkl(cnt.ITEMS_FILE)
    groups = gutils.abstract_groups(items)

if os.path.exists(os.path.join(cnt.DATA_FOLDER, cnt.GROUP_INDICES_FILE)):
    group_indices = gutils.load_data_pkl(cnt.GROUP_INDICES_FILE)

if os.path.exists(os.path.join(cnt.DATA_FOLDER, cnt.WV_KD_TREE_FILE)):
    kdtree = gutils.load_data_pkl(cnt.WV_KD_TREE_FILE)

if os.path.exists(os.path.join(cnt.DATA_FOLDER, cnt.WV_MODEL_FILE)):
    wv_model = Word2Vec.load(os.path.join(cnt.DATA_FOLDER, cnt.WV_MODEL_FILE))

def create_sent_tokens_array():
    # Write each item's sentence tokens into an extendable PyTables string
    # array of width MAX_WORDS, collecting the vocabulary along the way.
    try:
        tokens_file = tables.open_file(
            os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='w')
        atom = tables.StringAtom(itemsize=16)
        tokens_arr = tokens_file.create_earray(tokens_file.root, 'data', atom, (0, cnt.MAX_WORDS))

        vocab = set()
        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
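        # Hedged sketch (assumption, not in the original): the section cuts
        # the function off here. A batched append in the style of the repo's
        # other PyTables code would typically follow, padding each token list
        # to MAX_WORDS; get_tokens() is a hypothetical helper.
        #
        #   num_batches = int(math.ceil(float(n) / batch_size))
        #   for m in range(num_batches):
        #       start, end = m * batch_size, min((m + 1) * batch_size, n)
        #       for tokens in get_tokens(items[start:end]):  # hypothetical
        #           vocab.update(tokens)
        #           padded = tokens[:cnt.MAX_WORDS] + [''] * max(0, cnt.MAX_WORDS - len(tokens))
        #           tokens_arr.append(np.array([padded]))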
import siamese_api
from siamese_api import SiameseAPI
import data_generator as dg
import pandas as pd, random, pickle, numpy as np, tables, sys, time, collections, math, os
from sklearn.neighbors import KDTree
import grouping_utils as gutils
import constants as cnt

try:
    validation_set = pd.read_csv("data/grouping_validation_set.csv")
    gs_items = pd.read_csv("data/items_gs.csv")
    items = gutils.load_data_pkl('items.pkl')

    # Map each item id to its index in the items list.
    item_id_index_map = {items[i][0]: i for i in range(len(items))}

    base_item_ids = list(validation_set['Item Id'])
    grouped_item_ids = list(validation_set['Items to be grouped'])

    # Gold-standard mapping: base item id -> list of item ids that should be
    # grouped with it, restricted to items present in the loaded index map.
    gs_mapping = collections.defaultdict(list)
    for i in range(len(base_item_ids)):
        if grouped_item_ids[i] in item_id_index_map:
            gs_mapping[base_item_ids[i]].append(grouped_item_ids[i])

    sapi = SiameseAPI()
    sapi.get_model()
    sapi.model.init_model()
    sapi.model.load()

    kdtree = gutils.load_data_pkl('kd_tree.pkl')
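    # Hedged sketch (assumption, not from the original file): the natural
    # next step is to query the KD-tree with each base item's embedding and
    # compare the neighbours against gs_mapping. fetch_embeddings_pytables()
    # appears elsewhere in this repo; the k value and recall metric below are
    # illustrative only.
    #
    #   vectors = sapi.fetch_embeddings_pytables()
    #   for base_id, expected_ids in gs_mapping.items():
    #       idx = item_id_index_map[base_id]
    #       dists, neighbours = kdtree.query([vectors[idx]], k=10)
    #       predicted_ids = [items[j][0] for j in neighbours[0]]
    #       recall = len(set(predicted_ids) & set(expected_ids)) / float(len(expected_ids))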