Example 1
    def get_model(self):
        # Load the word-to-index mapping built during preprocessing and
        # derive the vocabulary size from it.
        self.word2idx_map = gutils.load_data_pkl(cnt.WORD2IDX_FILE)
        self.vocab_size = len(self.word2idx_map)

        # Count the train/validation/test pairs; the counts are passed to
        # the network so it knows how many samples each split contains.
        num_train = len(gutils.load_data_pkl(cnt.TRAIN_DATA_PAIRS_FILE))
        num_test = len(gutils.load_data_pkl(cnt.TEST_DATA_PAIRS_FILE))
        num_validation = len(
            gutils.load_data_pkl(cnt.VALIDATION_DATA_PAIRS_FILE))

        # Build the Siamese network; use_generator=True indicates batches
        # are streamed from a generator rather than loaded fully into memory.
        siamese_net = SiameseNet(vocab_size=self.vocab_size,
                                 training_samples=num_train,
                                 validation_samples=num_validation,
                                 testing_samples=num_test,
                                 use_generator=True)
        self.model = siamese_net
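
For orientation, a minimal usage sketch of how this method is driven elsewhere in these snippets (it mirrors the calls shown in Examples 5 and 7):

sapi = SiameseAPI()
sapi.get_model()          # builds the SiameseNet described above
sapi.model.init_model()   # initialise the underlying network
sapi.model.load()         # restore the trained model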
Example 2
def get_data_as_vanilla(num_samples, prefix='train'):
    # Open the PyTables file outside the try block so that a failed open
    # does not leave sent_tokens_file undefined in the finally clause.
    sent_tokens_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='r')
    try:
        sent_tokens = sent_tokens_file.root.data

        # Shuffle the (index1, index2, label) pairs deterministically.
        random.seed(42)
        data_pairs = gutils.load_data_pkl(prefix + "_data_pairs.pkl")
        random.shuffle(data_pairs)

        items1, items2, labels = zip(*data_pairs)
        items1, items2, labels = np.array(items1), np.array(items2), np.array(labels)

        # Cap the number of returned samples at the number of available pairs.
        n = min(num_samples, len(data_pairs))
        start, end = 0, n

        # Look up the token sequences for both sides of each pair and convert
        # them to word-vector matrices using the Word2Vec model.
        tokens1 = [sent_tokens[i] for i in items1[start:end]]
        tokens2 = [sent_tokens[i] for i in items2[start:end]]

        sent_data_1 = gutils.get_wv_siamese(wv_model, tokens1)
        sent_data_2 = gutils.get_wv_siamese(wv_model, tokens2)

        return [sent_data_1, sent_data_2], labels[start:end]

    finally:
        sent_tokens_file.close()
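
A minimal usage sketch, assuming the module-level objects from Example 6 (wv_model, gutils, cnt) are loaded; the sample count is illustrative:

# Load up to 1000 validation pairs into memory as two word-vector tensors
# plus their labels (the count 1000 is purely illustrative).
[val_x1, val_x2], val_labels = get_data_as_vanilla(1000, prefix='validation')
print(len(val_labels))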
Example 3
def get_data_as_generator(num_samples, prefix='train'):
    # NOTE: num_samples is not used here; the generator cycles over all pairs.
    # Open the PyTables file outside the try block so that a failed open
    # does not leave sent_tokens_file undefined in the finally clause.
    sent_tokens_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='r')
    try:
        sent_tokens = sent_tokens_file.root.data

        # Shuffle the (index1, index2, label) pairs deterministically.
        random.seed(42)
        data_pairs = gutils.load_data_pkl(prefix + "_data_pairs.pkl")
        random.shuffle(data_pairs)

        items1, items2, labels = zip(*data_pairs)
        items1, items2, labels = np.array(items1), np.array(items2), np.array(labels)

        n = len(data_pairs)
        num_batches = int(math.ceil(float(n) / cnt.SIAMESE_BATCH_SIZE))

        batch_num = 0

        # Yield batches forever, wrapping around once all pairs are consumed.
        while True:
            m = batch_num % num_batches

            start, end = m * cnt.SIAMESE_BATCH_SIZE, min((m + 1) * cnt.SIAMESE_BATCH_SIZE, n)

            tokens1 = [sent_tokens[i] for i in items1[start:end]]
            tokens2 = [sent_tokens[i] for i in items2[start:end]]

            sent_data_1 = gutils.get_wv_siamese(wv_model, tokens1)
            sent_data_2 = gutils.get_wv_siamese(wv_model, tokens2)

            batch_num += 1

            yield [sent_data_1, sent_data_2], labels[start:end]

    finally:
        # Runs when the generator is closed or garbage-collected.
        sent_tokens_file.close()
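
A similar sketch for the generator variant: it yields batches of cnt.SIAMESE_BATCH_SIZE pairs indefinitely, so a caller simply pulls batches with next(); the num_samples argument is passed only to satisfy the signature, since this variant ignores it:

# Create the cycling training generator and pull a single batch.
train_gen = get_data_as_generator(num_samples=0, prefix='train')
[batch_x1, batch_x2], batch_labels = next(train_gen)
print(len(batch_labels))  # at most cnt.SIAMESE_BATCH_SIZE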
Example 4
import siamese_api
from siamese_api import SiameseAPI
import data_generator as dg
import pandas as pd, random, pickle, numpy as np, tables, sys, time, collections, math, os
from sklearn.neighbors import KDTree
import grouping_utils as gutils
import constants as cnt
from sklearn.metrics.pairwise import euclidean_distances

try:
    validation_set = pd.read_csv("data/grouping_validation_set.csv")

    items = gutils.load_data_pkl('items_13.pkl')

    # Map each item id (items[i][0]) to its positional index.
    item_id_index_map = dict()
    for i in range(len(items)):
        item_id_index_map[items[i][0]] = i

    # Group item indices by their 'pt' field (items[i][1]).
    pt_indices = collections.defaultdict(list)
    for i in range(len(items)):
        pt = items[i][1]
        pt_indices[pt].append(i)

    base_item_ids = list(validation_set['Item Id'])
    grouped_item_ids = list(validation_set['Items to be grouped'])
    
    # Number of distinct base and grouped item ids in the validation set.
    print(len(set(base_item_ids)))
    print(len(set(grouped_item_ids)))

    # How many of those ids are also present in the loaded items.
    print(len(set(base_item_ids).intersection(item_id_index_map.keys())))
    print(len(set(grouped_item_ids).intersection(item_id_index_map.keys())))
Example 5
# print("Constructing KD-Tree...")
# vectors = sapi.fetch_embeddings_pytables()
# gutils.construct_kd_tree(vectors, save_file=cnt.SIAMESE_KD_TREE_FILE)

# print("Benchmarking suggested group API...")
# print(gutils.benchmark_kdtree(num_samples=1000))

# print("Getting all items...")
# items_4200 = gutils.load_data_pkl('items.pkl')
# items_13 = gutils.load_data_pkl('items_13.pkl')

# all_items = items_4200 + items_13
# gutils.save_data_pkl(all_items, 'all_items.pkl')

print("Computing embeddings...")
items_13 = gutils.load_data_pkl('items_13.pkl')
print(len(items_13))

# try:
#     sapi = SiameseAPI()

#     sapi.get_model()
#     sapi.model.init_model()
#     sapi.model.load()

#     embeds_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, 'embeddings_13.h5'), mode='w')
#     atom = tables.Float32Atom()
#     embeds_arr = embeds_file.create_earray(embeds_file.root, 'data', atom, (0, cnt.SIAMESE_EMBEDDING_SIZE))

#     n, batch_size = len(items_13), cnt.PYTABLES_INSERT_BATCH_SIZE
#     num_batches = int(math.ceil(float(n)/batch_size))
Example 6
import grouping_utils as gutils
import tables, collections, os
import numpy as np, math, random
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances
from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Pool
from gensim.models import Word2Vec
import constants as cnt

# Load preprocessed artifacts (items, group indices, word-vector KD-tree and
# Word2Vec model) only if they already exist on disk.
if os.path.exists(os.path.join(cnt.DATA_FOLDER, cnt.ITEMS_FILE)):
    items = gutils.load_data_pkl(cnt.ITEMS_FILE)
    groups = gutils.abstract_groups(items)

if os.path.exists(os.path.join(cnt.DATA_FOLDER, cnt.GROUP_INDICES_FILE)):
    group_indices = gutils.load_data_pkl(cnt.GROUP_INDICES_FILE)
    
if os.path.exists(os.path.join(cnt.DATA_FOLDER, cnt.WV_KD_TREE_FILE)):
    kdtree = gutils.load_data_pkl(cnt.WV_KD_TREE_FILE)
    
if os.path.exists(os.path.join(cnt.DATA_FOLDER, cnt.WV_MODEL_FILE)):
    wv_model = Word2Vec.load(os.path.join(cnt.DATA_FOLDER, cnt.WV_MODEL_FILE))

def create_sent_tokens_array():
    try:
        tokens_file = tables.open_file(os.path.join(cnt.DATA_FOLDER, cnt.SENT_TOKENS_FILE), mode='w')

        # Extendable PyTables array of fixed-size token strings, one row of
        # cnt.MAX_WORDS tokens per item sentence.
        atom = tables.StringAtom(itemsize=16)
        tokens_arr = tokens_file.create_earray(tokens_file.root, 'data', atom, (0, cnt.MAX_WORDS))
        vocab = set()

        # Insert the tokenised items in batches.
        n, batch_size = len(items), cnt.PYTABLES_INSERT_BATCH_SIZE
Example 7
import siamese_api
from siamese_api import SiameseAPI
import data_generator as dg
import pandas as pd, random, pickle, numpy as np, tables, sys, time, collections, math, os
from sklearn.neighbors import KDTree
import grouping_utils as gutils
import constants as cnt

try:
    validation_set = pd.read_csv("data/grouping_validation_set.csv")
    gs_items = pd.read_csv("data/items_gs.csv")

    items = gutils.load_data_pkl('items.pkl')
    item_id_index_map = dict()
    for i in range(len(items)):
        item_id_index_map[items[i][0]] = i

    base_item_ids = list(validation_set['Item Id'])
    grouped_item_ids = list(validation_set['Items to be grouped'])

    # For each base item id, collect the grouped item ids that are also
    # present in item_id_index_map.
    gs_mapping = collections.defaultdict(list)
    for i in range(len(base_item_ids)):
        if grouped_item_ids[i] in item_id_index_map:
            gs_mapping[base_item_ids[i]].append(grouped_item_ids[i])

    # Rebuild the Siamese model and load the trained weights.
    sapi = SiameseAPI()
    sapi.get_model()
    sapi.model.init_model()
    sapi.model.load()

    kdtree = gutils.load_data_pkl('kd_tree.pkl')
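
The snippet is truncated at this point. For orientation only, a hedged sketch of the kind of nearest-neighbour lookup the loaded KD-tree supports through scikit-learn's KDTree.query; fetch_embeddings_pytables is borrowed from the commented-out code in Example 5 and is an assumption here, not the original continuation:

# Illustrative sketch (not the original continuation): find the 10 items
# closest to one item's embedding in the KD-tree.
embeddings = sapi.fetch_embeddings_pytables()   # assumed helper, see Example 5
query_vec = np.asarray(embeddings[0]).reshape(1, -1)
distances, indices = kdtree.query(query_vec, k=10)
print(indices[0])  # row indices of the nearest items in embedding space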