# Example #1
# 0
def prepare():
    """Load the training corpus and test set and return them with the reader.

    Reads the module-level ``args`` (parsed CLI options) and ``MAX_SEN_LEN``.

    Returns:
        tuple: (reader, train_data, test_data) where ``reader`` is the
        ``SentenceReaderDir`` built over ``args.intraindata`` and both data
        sets are converted to numeric batches via ``convert_numeric_data``.
    """
    if args.gpu >= 0:
        cuda.check_cuda_available()
        cuda.get_device(args.gpu).use()

    reader = SentenceReaderDir(args.intraindata, args.batchsize)
    print("")
    print("-" * 50)
    # NOTE(review): the comment claims three special tokens but only 1 is
    # subtracted, while the sibling setup code below subtracts 3 — confirm
    # which offset is correct for this reader variant.
    print('n_vocab: %d' %
          (len(reader.word2index) - 1))  # excluding the three special tokens
    print('corpus size: %d' % (reader.total_words))

    max_sen_len = MAX_SEN_LEN
    train_data = convert_numeric_data(reader.data, args.batchsize,
                                      reader.word2index, max_sen_len)
    test_data, freq, _ = load_xml(args.intestdata)
    test_data = convert_numeric_data(test_data, args.batchsize,
                                     reader.word2index, max_sen_len)
    # BUG FIX: the original had an unreachable `return args` after this
    # return statement; dead code removed.
    return reader, train_data, test_data


args = parse_arguments()

# Layer widths derived from the requested embedding size.
context_word_units = args.unit
lstm_hidden_units = IN_TO_OUT_UNITS_RATIO * args.unit
target_word_units = IN_TO_OUT_UNITS_RATIO * args.unit

# Select the GPU if one was requested; xp aliases the active array module
# (cupy on GPU, numpy on CPU).
if args.gpu >= 0:
    cuda.check_cuda_available()
    cuda.get_device(args.gpu).use()
xp = cuda.cupy if args.gpu >= 0 else np

reader = SentenceReaderDir(args.indir, args.trimfreq, args.batchsize)
print('n_vocab: %d' %
      (len(reader.word2index) - 3))  # excluding the three special tokens
print('corpus size: %d' % (reader.total_words))

# Per-word occurrence counts ordered by word id; these weight the
# negative-sampling distribution.
cs = [
    reader.trimmed_word2count[w] for w in range(len(reader.trimmed_word2count))
]
loss_func = L.NegativeSampling(target_word_units, cs, NEGATIVE_SAMPLING_NUM,
                               args.ns_power)

if args.context == 'lstm':
    model = BiLstmContext(args.deep, args.gpu, reader.word2index,
                          context_word_units, lstm_hidden_units,
                          target_word_units, loss_func, True, args.dropout)
else:
    # BUG FIX: the original had `return args` here, which is a SyntaxError at
    # module level; fail loudly instead, matching the intended behavior shown
    # by the duplicate of this block later in the file.
    raise Exception('Unknown context type: {}'.format(args.context))


args = parse_arguments()

# Layer widths derived from the requested embedding size.
context_word_units = args.unit
lstm_hidden_units = IN_TO_OUT_UNITS_RATIO * args.unit
target_word_units = IN_TO_OUT_UNITS_RATIO * args.unit

# Bind the requested device and pick the matching array module:
# cupy when running on GPU, numpy otherwise.
if args.gpu >= 0:
    cuda.check_cuda_available()
    cuda.get_device(args.gpu).use()
    xp = cuda.cupy
else:
    xp = np

reader = SentenceReaderDir(args.indir, args.trimfreq, args.batchsize)
print('n_vocab: %d' % (len(reader.word2index) - 3))  # excluding the three special tokens
print('corpus size: %d' % (reader.total_words))

# Occurrence counts indexed by word id; these weight the negative sampler.
vocab_size = len(reader.trimmed_word2count)
cs = [reader.trimmed_word2count[idx] for idx in range(vocab_size)]
loss_func = L.NegativeSampling(target_word_units, cs, NEGATIVE_SAMPLING_NUM, args.ns_power)

# Guard clause: only the bidirectional-LSTM context model is supported.
if args.context != 'lstm':
    raise Exception('Unknown context type: {}'.format(args.context))
model = BiLstmContext(args.deep, args.gpu, reader.word2index, context_word_units,
                      lstm_hidden_units, target_word_units, loss_func, True,
                      args.dropout)

optimizer = O.Adam()
optimizer.setup(model)

STATUS_INTERVAL = 1000000