import pickle

import numpy as np
import tensorflow as tf
from scipy import sparse

# Project-level helpers (config_to_name, fit_emb, evaluate_emb, generate_batch)
# are assumed to be importable from the surrounding package.


def text_experiment(config):

    np.random.seed(seed=27)

    dataset = 'wikipedia'
    data_path = '../data/' + dataset + '/'

    trfile = data_path + 'splits/train%d.pkl' % config['fold']
    trainset = pickle.load(open(trfile, 'rb'))
    print('Loaded %d training reviews' % len(trainset))

    # Step 2: Fit an embedding model.
    print(config_to_name(config))
    if config['cont_train']:
        dummy_config = config.copy()
        dummy_config['cont_train'] = False
        dummy_config['use_sideinfo'] = False
        dummy_config['exposure'] = False
        dummy_config['max_iter'] = 300000
        mfile = config_to_name(dummy_config) + '.pkl'
        print(mfile)
        train_noside = pickle.load(open(data_path + 'splits/' + mfile, "rb"))
        init_model = train_noside['model']
    else:
        init_model = None

    tag_dict = pickle.load(open(data_path + 'simple_tag_dict.pkl', 'rb'))
    tags = [''] * len(tag_dict)  # one slot per tag index
    for tag, idx in tag_dict.items():
        tags[idx] = tag

    voc_dict = pickle.load(open(data_path + 'voc_dict.pkl', 'rb'))
    dictionary = voc_dict['dic']
    reverse_dictionary = voc_dict['rev_dic']
    sample_doc = trainset[16][0:20, :]
    print('Sample data', sample_doc[:, 0],
          [reverse_dictionary[i] for i in sample_doc[:, 0]])
    print('Prior tags', sample_doc[:, 1], [tags[i] for i in sample_doc[:, 1]])

    emb_model, logg = fit_emb(trainset, config, init_model)
    print('Training done!')

    tsfile = data_path + 'splits/test%d.pkl' % config['fold']
    testset = pickle.load(open(tsfile, 'rb'))
    print('Loaded %d test reviews' % len(testset))

    loss_array, pos_loss_array = evaluate_emb(testset, emb_model, config)
    # Step 4: Save the results; the embeddings can be visualized from the saved model.
    mfile = config_to_name(config) + '.pkl'
    pickle.dump(
        dict(model=emb_model,
             logg=logg,
             loss_array=loss_array,
             pos_loss_array=pos_loss_array),
        open(data_path + 'splits/' + mfile, "wb"))
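

# A minimal usage sketch for text_experiment. The config keys are inferred from
# the function body above (fold, cont_train, use_sideinfo, exposure, max_iter);
# the values here are illustrative assumptions, not the settings from the source.
example_config = dict(fold=0,
                      cont_train=False,
                      use_sideinfo=False,
                      exposure=False,
                      max_iter=100000,
                      K=128)
# text_experiment(example_config)
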
def restaurant_experiment(config):

    np.random.seed(seed=27)

    dataset = 'restaurant'
    data_path = '/rigel/dsi/users/ll3105/sa-data/' + dataset + '/'

    trfile = data_path + 'splits/train0.pkl'
    trainset = pickle.load(open(trfile, 'rb'))
    print('Loaded %d training reviews' % len(trainset))

    voc_dict = pickle.load(open(data_path + 'voc_dict.pkl', 'rb'))
    dictionary = voc_dict['dic']
    reverse_dictionary = voc_dict['rev_dic']
    print('Sample data', trainset[0]['text'][:10],
          [reverse_dictionary[i] for i in trainset[0]['text'][:10]])

    # Step 2: Generate one training batch for the skip-gram model and sanity-check it.
    sidevec, batch, labels = generate_batch(trainset[0], config=config)
    #print(trainset[0])
    for i in range(8):
        msg = ''
        for j in range(2 * config['half_window']):
            msg += '%d,%s;' % (batch[i, j], reverse_dictionary[batch[i, j]])
        print(msg, ' *', sidevec[0:3], '->', labels[i],
              reverse_dictionary[labels[i]])

    # Step 3: Fit an embedding model.
    print(config_to_name(config))
    if config['cont_train']:
        dummy_config = config.copy()
        dummy_config['cont_train'] = False
        dummy_config['use_sideinfo'] = False
        dummy_config['exposure'] = False
        dummy_config['max_iter'] = 300000
        mfile = config_to_name(dummy_config) + '.pkl'
        print(mfile)
        train_noside = pickle.load(open(data_path + 'splits/' + mfile, "rb"))
        init_model = train_noside['model']
    else:
        init_model = None

    emb_model, logg = fit_emb(trainset, config, voc_dict, init_model)

    # Step 4: Save the results; the embeddings can be visualized from the saved model.
    mfile = config_to_name(config) + '.pkl'
    pickle.dump(dict(model=emb_model, logg=logg),
                open(data_path + 'splits/' + mfile, "wb"))

    print('Training done!')
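

# Hedged sketch of the batch layout that generate_batch is assumed to produce
# per review, based on the sanity check above: one row per center word, with the
# 2 * half_window surrounding words as inputs and the center word as the label.
# This is an independent illustration, not the project's generate_batch.
def skipgram_batch_sketch(text, half_window):
    batch, labels = [], []
    for pos in range(half_window, len(text) - half_window):
        context = (list(text[pos - half_window:pos]) +
                   list(text[pos + 1:pos + half_window + 1]))
        batch.append(context)
        labels.append(text[pos])
    return np.array(batch), np.array(labels)
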
def movie_experiment(config):

    np.random.seed(seed=27)
    
    dataset = 'movie'
    data_path = '/rigel/dsi/users/ll3105/sa-data/' + dataset + '/'

    trfile = data_path + 'splits/train%d.pkl' % config['fold']
    trainset = pickle.load(open(trfile, 'rb'))
    print('Loaded %d training reviews' % len(trainset))

    # Step 2: Fit an embedding model.
    print(config_to_name(config))
    if config['cont_train']:
        dummy_config = config.copy()
        dummy_config['cont_train'] = False
        dummy_config['use_sideinfo'] = False
        dummy_config['exposure'] = False
        dummy_config['max_iter'] = 300000
        mfile = config_to_name(dummy_config) + '.pkl'
        print(mfile)
        train_noside = pickle.load(open(data_path + 'splits/' + mfile, "rb"))
        init_model = train_noside['model']
    else:
        init_model = None

    emb_model, logg = fit_emb(trainset, config, init_model)

    tsfile = data_path + 'splits/test%d.pkl' % config['fold']
    testset = pickle.load(open(tsfile, 'rb'))
    print('Loaded %d test reviews' % len(testset))

    loss_array, pos_loss_array = evaluate_emb(testset, emb_model, config)
    # Step 4: Save the results; the embeddings can be visualized from the saved model.
    mfile = config_to_name(config) + '.pkl'
    pickle.dump(
        dict(model=emb_model,
             logg=logg,
             loss_array=loss_array,
             pos_loss_array=pos_loss_array),
        open(data_path + 'splits/' + mfile, "wb"))

    print('Training done!')
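

# Hedged sketch: summarize the held-out loss from a result file written by the
# experiments above. Assumes the pickle layout saved there (keys 'model', 'logg',
# 'loss_array', 'pos_loss_array'); mean_heldout_loss is a name introduced here.
def mean_heldout_loss(config, data_path):
    mfile = config_to_name(config) + '.pkl'
    result = pickle.load(open(data_path + 'splits/' + mfile, 'rb'))
    per_review = [np.mean(rloss) for rloss in result['loss_array']]
    return np.mean(per_review)
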
# Evaluation snippet (module level). NOTE: data_path is assumed to be set as in
# the experiments above, e.g. '/rigel/dsi/users/ll3105/sa-data/<dataset>/'.

# load dictionary
voc_dict = pickle.load(open(data_path + 'voc_dict.pkl', 'rb'))
dictionary = voc_dict['dic']
reverse_dictionary = voc_dict['rev_dic']
vocabulary_size = len(dictionary)
print('Vocabulary size is %d' % vocabulary_size)

# load data
tsfile = data_path + 'splits/test0.pkl'
testset = pickle.load(open(tsfile, 'rb'))
print('Loaded %d test reviews' % len(testset))

# load model
config = dict(use_sideinfo=True,
              K=128,
              max_iter=200000,
              context_type='skip_gram',
              skip_window=1,
              num_skips=2,
              voc_size=len(dictionary),
              reg=1.0)
mfile = config_to_name(config) + '.pkl'
emb_model = pickle.load(open(data_path + 'splits/' + mfile, "rb"))

# batch_size is not fixed here; it varies with the length of each review.
num_sampled = 64  # number of negative examples to sample


reviews = testset
num_review = len(reviews)
print('Sample data', reviews[0]['text'][:10],
      [reverse_dictionary[i] for i in reviews[0]['text'][:10]])

# Step 3: Build the TensorFlow graph.
graph = tf.Graph()
with graph.as_default():

  # Input data.
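  # NOTE: the body of this graph is truncated in the source. What follows is a
  # hedged sketch of a standard skip-gram graph with NCE loss, consistent with
  # vocabulary_size, config['K'], and num_sampled above; the variable names are
  # assumptions, not the original code.
  train_inputs = tf.placeholder(tf.int32, shape=[None])
  train_labels = tf.placeholder(tf.int32, shape=[None, 1])

  # Word embeddings and the NCE output weights.
  embeddings = tf.Variable(
      tf.random_uniform([vocabulary_size, config['K']], -1.0, 1.0))
  embed = tf.nn.embedding_lookup(embeddings, train_inputs)
  nce_weights = tf.Variable(
      tf.truncated_normal([vocabulary_size, config['K']],
                          stddev=1.0 / np.sqrt(config['K'])))
  nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Average NCE loss over the batch, drawing num_sampled negative words.
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))
  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

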
def load_model(config, path):
    mfile = config_to_name(config) + '.pkl'
    loader = pickle.load(open(path + mfile, "rb"))
    model = loader['model']

    return model['alpha'], model['rho'], model['weight'], model['invmu']
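

# Hedged usage sketch for load_model: nearest neighbors in embedding space.
# Assumes model['rho'] holds one embedding row per vocabulary word, as the
# return values above suggest; nearest_words is a name introduced here.
def nearest_words(query, dictionary, reverse_dictionary, rho, topk=5):
    vec = rho[dictionary[query]]
    sims = rho.dot(vec) / (np.linalg.norm(rho, axis=1) *
                           np.linalg.norm(vec) + 1e-12)
    best = np.argsort(-sims)[1:topk + 1]  # skip the query word itself
    return [reverse_dictionary[i] for i in best]
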
    # NOTE: the opening of this snippet is truncated in the source. config0 is
    # reconstructed under the assumption that it mirrors the configs used above;
    # use_sideinfo=False is implied by the config2 update below.
    config0 = dict(use_sideinfo=False,
                   reg_weight=10,
                   exposure=False,
                   cont_train=False,
                   sample_ratio=0.1)
    config1 = config0.copy()
    config1.update({'exposure': True})

    config2 = config1.copy()
    config2.update({'use_sideinfo': True})

    data_path = '/rigel/dsi/users/ll3105/sa-data/' + dataset + '/'
    reviews = pickle.load(open(data_path + 'splits/test%d.pkl' % fold, 'rb'))

    losses = []
    for conf in [config0, config1, config2]:
        mfile = config_to_name(conf)
        #loss_array = pickle.load(open(data_path + 'splits/test/loss_' + mfile + '.pkl', "rb"))
        loader = pickle.load(open(data_path + 'splits/' + mfile + '.pkl',
                                  "rb"))
        loss_array = loader['loss_array']
        #rlosses = np.array([np.mean(rloss) for rloss in loss_array])

        for ir in range(len(loss_array)):
            _, ind, rate = sparse.find(reviews['scores'][ir, :])
            loss_array[ir] = loss_array[ir][ind]

        rlosses = np.concatenate(loss_array)
        losses.append(rlosses)

    print('\\multirow{2}{*}{K = %d}' % K, end='')
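
    # Hedged sketch of how the LaTeX row might be completed; the source snippet
    # is truncated here, so the column layout is an assumption.
    for rlosses in losses:
        print(' & %.3f' % np.mean(rlosses), end='')
    print(' \\\\')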