Example #1
0
        '%(filename)-20s LINE %(lineno)-4d %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)


init_logging()

# ----------------- 1. Process the data  ---------------------------------------
# Location of the Penn Treebank data files.
data_dir = '/slfs1/users/zjz17/github/data/ptb_data/'

# Paths to the vocabulary and the three corpus splits; all four files follow
# the 'ptb.<part>.txt' naming scheme.
vocab_path, train_path, valid_path, test_path = (
    os.path.join(data_dir, 'ptb.%s.txt' % part)
    for part in ('vocab', 'train', 'valid', 'test'))

# Map each word to its integer id; '<pad>' (if present in the vocabulary)
# is the label id to ignore during training, else ignore_label is None.
word2idx = read_dict(vocab_path)
ignore_label = word2idx.get('<pad>')

# Convert the train/valid text into id sequences plus labels.
data_train, label_train = get_text_id(train_path, word2idx)
data_valid, label_valid = get_text_id(valid_path, word2idx)

# -----------------2. Params Defination ----------------------------------------
num_buckets = 1
batch_size = 32

#  network parameters
num_lstm_layer = 1

input_size = len(word2idx)
dropout = 0.0
Example #2
0
elif mode == 'static':
    use_word2vec = True
    fixed_embed = True
else:
    use_word2vec = True
    fixed_embed = False

# Path to the pre-trained word2vec vectors selected by `mode` above.
word2vec_path = w2v_file

# The vocabulary is written to (and later read back from) this file.
vocab_file = 'vocab.txt'

# Read and pad the training sentences, then build the vocabulary over the
# padded corpus (so the padding token itself receives an id).
sentences, label = read_file(train_file)
sentences_padded = pad_sentences(sentences)
build_vocab(sentences_padded, vocab_file, size=None)  # size=None: keep every word
logging.info('total sentences lines: %d' % len(sentences_padded))
word2idx = read_dict(vocab_file)
logging.info('dict length: %d' % len(word2idx))
# Validation split gets the same padding treatment as training.
valid_sentences, valid_label = read_file(valid_file)
valid_sentences_padded = pad_sentences(valid_sentences)

# Convert the padded sentences to id matrices.
# NOTE(review): `valid_label` is passed in and immediately rebound by the
# return value -- presumably get_text_id echoes labels back; confirm.
train_data, train_label = get_text_id(sentences_padded, label, word2idx)
valid_data, valid_label = get_text_id(valid_sentences_padded, valid_label,
                                      word2idx)

print 'train data shape: ', train_data.shape
print 'example: ', train_label[0], '\t=>\t', train_data[0]
print 'valid data shape: ', valid_data.shape
print 'example: ', valid_label[0], '\t=>\t', valid_data[0]

# ---------------------- 2. Params Defination ----------------------------------------
batch_size = 50
Example #3
0
def see_hidden_vectos(params_dir='optimal_params',
                      params_prefix='couplet',
                      epoch=20):
    """Visualize the encoder's summary vectors with t-SNE.

    Loads a trained seq2seq checkpoint, generates `num` test sequences for
    each template in `moban`, encodes each sequence (via BeamSearch
    initialization) and collects the decoder's initial LSTM cell state --
    the encoded summary of the input.  The vectors are projected with
    t-SNE and saved as a colored scatter plot ('ff.jpg'), one color per
    template.

    Args:
        params_dir: directory containing the saved checkpoint.
        params_prefix: checkpoint file name prefix.
        epoch: which checkpoint epoch to load.
    """
    # Only the parameters are needed; symbol and aux states are discarded.
    _, arg_params, __ = mx.model.load_checkpoint(
        '%s/%s' % (params_dir, params_prefix), epoch)
    # Templates the test sequences are generated from.  (The original code
    # assigned a first template list and immediately overwrote it; that
    # dead assignment has been removed.)
    moban = [[5, 10, 15, 20, 25], [15, 15, 15, 15, 15], [10, 10, 10, 20, 25],
             [10, 20, 20, 20, 5], [13, 14, 15, 16, 17]]
    num = 50  # number of sequences generated per template
    results = [[] for _ in moban]  # one list of hidden vectors per template

    # ---- parameter definition (must match the trained model) ----
    data_dir = '/slfs1/users/zjz17/github/data/sort'
    vocab_file = 'q3.vocab'
    enc_word2idx = read_dict(os.path.join(data_dir, vocab_file))
    dec_word2idx = read_dict(os.path.join(data_dir, vocab_file))
    num_lstm_layer = 1
    num_embed = 100
    num_hidden = 200
    num_label = len(dec_word2idx)
    batch_size = 1
    enc_input_size = len(enc_word2idx)
    dec_input_size = len(dec_word2idx)
    enc_dropout = 0.0
    dec_dropout = 0.0
    output_dropout = 0.2

    dg = DataGeneration(1000, 1, 1, 1)
    for i in range(len(moban)):
        lis = dg.generate_test_pairs(moban[i], num)
        for l in lis:
            enc_len = len(l)
            # '+ 3' shifts raw values past the special-token ids at the
            # start of the vocabulary -- presumably <PAD>/<EOS>/<UNK>;
            # TODO confirm against the vocab file.
            enc_data = mx.nd.array(np.array(l).reshape(1, enc_len) + 3)
            enc_mask = mx.nd.array(np.ones((enc_len, )).reshape(1, enc_len))
            beam = BeamSearch(num_lstm_layer=num_lstm_layer,
                              enc_data=enc_data,
                              enc_mask=enc_mask,
                              enc_len=enc_len,
                              enc_input_size=enc_input_size,
                              dec_input_size=dec_input_size,
                              num_hidden=num_hidden,
                              num_embed=num_embed,
                              num_label=num_label,
                              batch_size=batch_size,
                              arg_params=arg_params,
                              eos=dec_word2idx.get('<EOS>'),
                              unk=dec_word2idx.get('<UNK>'),
                              pad=dec_word2idx.get('<PAD>'),
                              ctx=mx.cpu(),
                              enc_dropout=enc_dropout,
                              dec_dropout=dec_dropout,
                              output_dropout=output_dropout)
            # The decoder's initial cell state is the encoded input vector.
            v = beam.init_states_dict['dec_l0_init_c'].asnumpy()
            results[i].append(v)

    # Stack all vectors template-major: the `num` vectors of template 0,
    # then template 1, ...  (Replaces the original pairwise-concatenate
    # loops, which were quadratic in the number of vectors.)
    f = np.concatenate([v for group in results for v in group])

    model = TSNE(n_components=3,
                 random_state=0,
                 learning_rate=500,
                 n_iter=2000)
    x = model.fit_transform(f)
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for i in range(len(moban)):
        # Rows belonging to template i (was a hard-coded 50, now tied to num).
        tmp = range(i * num, (i + 1) * num)
        plt.scatter(x[tmp, 1],
                    x[tmp, 0],
                    s=20,
                    marker='o',
                    color=colors[i],
                    label='%s' % moban[i])
    # Three-dimensional variant, kept for reference:
    #   from mpl_toolkits.mplot3d import Axes3D
    #   fig = plt.figure()
    #   ax = Axes3D(fig)
    #   for i in range(len(moban)):
    #       tmp = range(i * num, (i + 1) * num)
    #       ax.scatter(x[tmp, 0], x[tmp, 1], x[tmp, 2], s=20, marker='o',
    #                  color=colors[i], label='%s' % moban[i])
    plt.legend(loc='upper left')
    plt.title('Encoded Hidden Vector T-SNE Visualization')
    plt.savefig('ff.jpg')
Example #4
0
                    default=0,
                    type=int,
                    help='which line you want to test')
# CLI option: test a single example, or decode a whole file to an output file.
parser.add_argument(
    '--mode',
    default='test',
    type=str,
    help='test one example or write examples results into a file')

args = parser.parse_args()
print args
filename = args.filename  # input file to decode
idx = args.idx            # which line of the file to test
mode = args.mode          # 'test' (single example) or file-output mode

# Encoder and decoder share one vocabulary file.
enc_word2idx = read_dict(os.path.join(data_dir, vocab_file))
dec_word2idx = read_dict(os.path.join(data_dir, vocab_file))

print 'encoder dict length:', len(enc_word2idx)
print 'decoder dict length:', len(dec_word2idx)

# Convert the raw text into parallel lists of encoder/decoder id sequences.
enc_data, dec_data = get_enc_dec_text_id(filename, enc_word2idx, dec_word2idx)

print 'enc_data length: ', len(enc_data),
print 'example:', enc_data[0]
print 'dec_data_length: ', len(dec_data)
print 'example:', dec_data[0]

# ------------------------------- Parameter Defination -------------------------------

#  network parameters
Example #5
0
# Lightweight container pairing a bound executor with its symbol graph.
Model = namedtuple("Model", ['executor', 'symbol'])

# Log everything at DEBUG level to the file 'Log', truncated on each run.
logging.basicConfig(
    filename='Log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s %(message)s',
    datefmt='%m-%d %H:%M:%S %p',
)

# Mirror every log record to the console as well.
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
logger = logging.getLogger()
logger.addHandler(console)

DEBUG = True
# ----------------- 1. Process the data  ---------------------------------------

# Encoder and decoder share the same vocabulary; '<PAD>' is the label id
# to ignore in the loss (None if the token is missing from the vocab).
enc_word2idx = read_dict('../data/sort_test/vocab.txt')
dec_word2idx = read_dict('../data/sort_test/vocab.txt')
ignore_label = enc_word2idx.get('<PAD>')

if DEBUG:
    print 'read_dict length:', len(enc_word2idx)

# NOTE(review): training and validation both read the same file ('tt.txt')
# -- looks intentional for a smoke test, but confirm before real training.
enc_data, dec_data = get_enc_dec_text_id('../data/sort_test/tt.txt', enc_word2idx, dec_word2idx)
enc_valid, dec_valid = get_enc_dec_text_id('../data/sort_test/tt.txt', enc_word2idx, dec_word2idx)
if DEBUG:
    print 'enc_data length: ' , len(enc_data), enc_data[0:1]
    print 'dec_data_length: ' , len(dec_data), dec_data[0:1]
    print 'enc_valid_length: ', len(enc_valid), enc_valid[0:1]
    print 'dec_valid_length: ', len(dec_valid), dec_valid[0:1]

# -----------------2. Params Defination ----------------------------------------