import pickle

import numpy as np
import tensorflow as tf
from scipy import sparse


def text_experiment(config):
    np.random.seed(seed=27)

    dataset = 'wikipedia'
    data_path = '../data/' + dataset + '/'

    trfile = data_path + 'splits/train%d.pkl' % config['fold']
    trainset = pickle.load(open(trfile, 'rb'))
    print('Overall %d reviews' % len(trainset))

    # Step 2: Fit an embedding model.
    print(config_to_name(config))
    if config['cont_train']:
        # Warm-start from a model trained without side information or exposure.
        dummy_config = config.copy()
        dummy_config['cont_train'] = False
        dummy_config['use_sideinfo'] = False
        dummy_config['exposure'] = False
        dummy_config['max_iter'] = 300000
        mfile = config_to_name(dummy_config) + '.pkl'
        print(mfile)
        train_noside = pickle.load(open(data_path + 'splits/' + mfile, 'rb'))
        init_model = train_noside['model']
    else:
        init_model = None

    tag_dict = pickle.load(open(data_path + 'simple_tag_dict.pkl', 'rb'))
    tags = [''] * 12
    for tag in tag_dict.keys():
        tags[tag_dict[tag]] = tag

    voc_dict = pickle.load(open(data_path + 'voc_dict.pkl', 'rb'))
    dictionary = voc_dict['dic']
    reverse_dictionary = voc_dict['rev_dic']

    # Print one document to sanity-check the word ids and prior tags.
    sample_doc = trainset[16][0:20, :]
    print('Sample data', sample_doc[:, 0],
          [reverse_dictionary[i] for i in sample_doc[:, 0]])
    print('Prior tags', sample_doc[:, 1], [tags[i] for i in sample_doc[:, 1]])

    emb_model, logg = fit_emb(trainset, config, init_model)
    print('Training done!')

    # Step 3: Evaluate on the held-out fold.
    tsfile = data_path + 'splits/test%d.pkl' % config['fold']
    testset = pickle.load(open(tsfile, 'rb'))
    print('Overall %d reviews' % len(testset))
    loss_array, pos_loss_array = evaluate_emb(testset, emb_model, config)

    # Step 4: Save the fitted model and evaluation results.
    mfile = config_to_name(config) + '.pkl'
    pickle.dump(
        dict(model=emb_model, logg=logg,
             loss_array=loss_array, pos_loss_array=pos_loss_array),
        open(data_path + 'splits/' + mfile, 'wb'))
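
# A hypothetical driver for text_experiment (a minimal sketch, not a recorded
# run). The config keys mirror the ones the function reads ('fold',
# 'cont_train', 'use_sideinfo', 'exposure', 'max_iter'); any further key that
# config_to_name or fit_emb expects would need to be added as well, and the
# values below are illustrative only.
def demo_text_experiment():
    config = dict(fold=0, cont_train=False, use_sideinfo=False,
                  exposure=False, max_iter=300000, K=128)
    text_experiment(config)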
def restaurant_experiment(config):
    np.random.seed(seed=27)

    dataset = 'restaurant'
    data_path = '/rigel/dsi/users/ll3105/sa-data/' + dataset + '/'
    trfile = data_path + 'splits/train0.pkl'
    trainset = pickle.load(open(trfile, 'rb'))
    print('Overall %d reviews' % len(trainset))

    voc_dict = pickle.load(open(data_path + 'voc_dict.pkl', 'rb'))
    dictionary = voc_dict['dic']
    reverse_dictionary = voc_dict['rev_dic']
    print('Sample data', trainset[0]['text'][:10],
          [reverse_dictionary[i] for i in trainset[0]['text'][:10]])

    # Step 2: Generate one training batch for the skip-gram model and print a
    # few examples as a sanity check.
    sidevec, batch, labels = generate_batch(trainset[0], config=config)
    for i in range(8):
        msg = ''
        for j in range(2 * config['half_window']):
            msg = msg + str(batch[i, j]) + ',' + reverse_dictionary[batch[i, j]] + ';'
        print(msg, ' *', sidevec[0:3], '->', labels[i],
              reverse_dictionary[labels[i]])

    # Step 3: Fit an embedding model.
    print(config_to_name(config))
    if config['cont_train']:
        # Warm-start from a model trained without side information or exposure.
        dummy_config = config.copy()
        dummy_config['cont_train'] = False
        dummy_config['use_sideinfo'] = False
        dummy_config['exposure'] = False
        dummy_config['max_iter'] = 300000
        mfile = config_to_name(dummy_config) + '.pkl'
        print(mfile)
        train_noside = pickle.load(open(data_path + 'splits/' + mfile, 'rb'))
        init_model = train_noside['model']
    else:
        init_model = None

    emb_model, logg = fit_emb(trainset, config, voc_dict, init_model)

    # Step 4: Save the fitted model.
    mfile = config_to_name(config) + '.pkl'
    pickle.dump(dict(model=emb_model, logg=logg),
                open(data_path + 'splits/' + mfile, 'wb'))
    print('Training done!')
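
# Shape sketch for generate_batch's return values, inferred from the printing
# loop above (an assumption, not a documented contract): 'batch' holds the
# (num_examples, 2 * half_window) context word ids around each target word,
# 'labels' holds the corresponding target word ids, and 'sidevec' carries the
# per-review side-information vector.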
def movie_experiment(config):
    np.random.seed(seed=27)

    dataset = 'movie'
    data_path = '/rigel/dsi/users/ll3105/sa-data/' + dataset + '/'
    trfile = data_path + 'splits/train%d.pkl' % config['fold']
    trainset = pickle.load(open(trfile, 'rb'))
    print('Overall %d reviews' % len(trainset))

    # Step 2: Fit an embedding model.
    print(config_to_name(config))
    if config['cont_train']:
        # Warm-start from a model trained without side information or exposure.
        dummy_config = config.copy()
        dummy_config['cont_train'] = False
        dummy_config['use_sideinfo'] = False
        dummy_config['exposure'] = False
        dummy_config['max_iter'] = 300000
        mfile = config_to_name(dummy_config) + '.pkl'
        print(mfile)
        train_noside = pickle.load(open(data_path + 'splits/' + mfile, 'rb'))
        init_model = train_noside['model']
    else:
        init_model = None

    emb_model, logg = fit_emb(trainset, config, init_model)

    # Step 3: Evaluate on the held-out fold.
    tsfile = data_path + 'splits/test%d.pkl' % config['fold']
    testset = pickle.load(open(tsfile, 'rb'))
    print('Overall %d reviews' % len(testset))
    loss_array, pos_loss_array = evaluate_emb(testset, emb_model, config)

    # Step 4: Save the fitted model and evaluation results.
    mfile = config_to_name(config) + '.pkl'
    pickle.dump(dict(model=emb_model, logg=logg, loss_array=loss_array,
                     pos_loss_array=pos_loss_array),
                open(data_path + 'splits/' + mfile, 'wb'))
    print('Training done!')
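
# Reading back the artifacts written by the experiments above (a sketch; the
# dict keys are exactly those passed to pickle.dump in text_experiment and
# movie_experiment, but this helper itself is not part of the original code):
def load_experiment_result(config, data_path):
    mfile = config_to_name(config) + '.pkl'
    result = pickle.load(open(data_path + 'splits/' + mfile, 'rb'))
    return (result['model'], result['logg'],
            result['loss_array'], result['pos_loss_array'])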
# Load the vocabulary dictionary.
voc_dict = pickle.load(open(data_path + 'voc_dict.pkl', 'rb'))
dictionary = voc_dict['dic']
reverse_dictionary = voc_dict['rev_dic']
vocabulary_size = len(dictionary)
print('Vocabulary size is %d' % vocabulary_size)

# Load the test data.
tsfile = data_path + 'splits/test0.pkl'
testset = pickle.load(open(tsfile, 'rb'))
print('Overall %d reviews' % len(testset))

# Load the fitted model.
config = dict(use_sideinfo=True, K=128, max_iter=200000,
              context_type='skip_gram', skip_window=1, num_skips=2,
              voc_size=len(dictionary), reg=1.0)
mfile = config_to_name(config) + '.pkl'
emb_model = pickle.load(open(data_path + 'splits/' + mfile, 'rb'))

# batch_size becomes a variable.
num_sampled = 64  # Number of negative examples to sample.

reviews = testset
num_review = len(reviews)
print('Sample data', reviews[0]['text'][:10],
      [reverse_dictionary[i] for i in reviews[0]['text'][:10]])

# Step 3: Build the TensorFlow graph.
graph = tf.Graph()
with graph.as_default():
    # Input data.
def load_model(config, path):
    mfile = config_to_name(config) + '.pkl'
    loader = pickle.load(open(path + mfile, 'rb'))
    model = loader['model']
    return model['alpha'], model['rho'], model['weight'], model['invmu']
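
# Example use of load_model (illustrative only; the returned names come from
# the dict it unpacks, and 'config2'/'data_path' stand for the configs and
# path built elsewhere in this file):
#
#   alpha, rho, weight, invmu = load_model(config2, data_path + 'splits/')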
reg_weight=10, exposure=False, cont_train=False, sample_ratio=0.1)
config1 = config0.copy()
config1.update({'exposure': True})
config2 = config1.copy()
config2.update({'use_sideinfo': True})

data_path = '/rigel/dsi/users/ll3105/sa-data/' + dataset + '/'
reviews = pickle.load(open(data_path + 'splits/test%d.pkl' % fold, 'rb'))

losses = []
for conf in [config0, config1, config2]:
    mfile = config_to_name(conf)
    loader = pickle.load(open(data_path + 'splits/' + mfile + '.pkl', 'rb'))
    loss_array = loader['loss_array']
    # Keep only the losses at observed (nonzero-score) positions of each review.
    for ir in range(len(loss_array)):
        _, ind, rate = sparse.find(reviews['scores'][ir, :])
        loss_array[ir] = loss_array[ir][ind]
    rlosses = np.concatenate(loss_array)
    losses.append(rlosses)

# Emit the first cell of a LaTeX table row.
print('\\multirow{2}{*}{K = %d}' % K, end='')
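
# A plausible continuation of the LaTeX row started above (an assumption; the
# exact table layout is not shown in this fragment): append each
# configuration's mean per-word loss as a cell, then terminate the row.
for rlosses in losses:
    print(' & %.3f' % np.mean(rlosses), end='')
print(' \\\\')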