def get_data(self, set_chosen):
    if set_chosen == 'train':
        return dataIterator(self.datasets[0], self.datasets[1], self.worddicts,
                            batch_size=self.batch_size,
                            batch_Imagesize=self.batch_Imagesize,
                            maxlen=self.maxlen,
                            maxImagesize=self.maxImagesize)
    else:
        return dataIterator(self.valid_datasets[0], self.valid_datasets[1], self.worddicts,
                            batch_size=self.batch_size,
                            batch_Imagesize=self.batch_Imagesize,
                            maxlen=self.maxlen,
                            maxImagesize=self.maxImagesize)
def train(dataset='../data/traindataRnn',  # path to train data
          dictionary='../data/traindataRnnlm.pickle',  # path to rnnlm dictionary
          batch_size=50,
          max_epochs=15,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          dim_word=100,  # word vector dimensionality
          dim=1000,
          save_path='/path/to/save/model'):
    # load the character dictionary and build its inverse
    charDict = {}
    with open(dictionary, 'rb') as f:
        charDict = pkl.load(f)
    charDict_r = {}
    for kk, vv in charDict.items():
        charDict_r[vv] = kk
    vocab_size = len(charDict)

    traindata = dataIterator(dataset, dictionary, batch_size)

    # sentencelen = 7
    model = Sequential()
    # model.add(Embedding(vocab_size, dim_word, input_length=sentencelen))
    model.add(Embedding(vocab_size, dim_word, mask_zero=True))
    model.add(LSTM(output_dim=dim, return_sequences=True, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(LSTM(output_dim=dim, return_sequences=True, activation='sigmoid'))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    update = 0
    for epochid in range(max_epochs):
        for x in traindata:
            x, y = prepare_traindata(x, vocab_size)
            train_loss = model.train_on_batch(x, y)
            update += 1
            if update % dispFreq == 0:
                print("Epoch:\t%d\tUpdate:\t%d\tloss:\t" % (epochid, update),
                      train_loss)
            if update >= finish_after:
                break
        print("save model!")
        save_name = save_path + "rnnlm_epoch%d.h5" % epochid
        model.save(save_name)
        if update >= finish_after:
            break
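# The training loop above relies on a prepare_traindata helper that is not
# shown here. A minimal sketch, assuming it pads each integer sequence to a
# common length (index 0 as padding, matching mask_zero=True) and builds
# one-hot next-token targets for the TimeDistributed softmax; the exact
# padding/shift convention in the original code may differ.
import numpy as np

def prepare_traindata(seqs, vocab_size):
    maxlen = max(len(s) for s in seqs)
    x = np.zeros((len(seqs), maxlen), dtype='int32')
    y = np.zeros((len(seqs), maxlen, vocab_size), dtype='float32')
    for i, s in enumerate(seqs):
        x[i, :len(s)] = s
        for t in range(len(s)):
            # target at step t is the token at step t+1; the last step points to padding
            nxt = s[t + 1] if t + 1 < len(s) else 0
            y[i, t, nxt] = 1.0
    return x, y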
def load_data(self):
    datasets = ['./data/offline-train.pkl', './data/train_caption.txt']
    dictionaries = ['./data/dictionary.txt']

    worddicts = load_dict(dictionaries[0])
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    self.train, self.train_uid_list = dataIterator(
        datasets[0], datasets[1], worddicts,
        batch_size=self.batch_size,
        batch_Imagesize=self.batch_Imagesize,
        maxlen=self.maxlen,
        maxImagesize=self.maxImagesize)
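# load_dict is used throughout these scripts but not defined here. A minimal
# sketch, assuming the dictionary file holds one "symbol index" pair per line;
# the actual file format in the original project may differ.
def load_dict(dict_file):
    lexicon = {}
    with open(dict_file) as fp:
        for line in fp:
            parts = line.strip().split()
            if len(parts) == 2:
                lexicon[parts[0]] = int(parts[1])
    return lexicon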
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          patience=10,  # early stopping patience
          max_epochs=100,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=0.01,  # learning rate
          n_words=10000,  # vocabulary size
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq
          datasets='../data/traindataRnn',
          valid_datasets='../data/vailiddataRnn',
          dictionaries='../data/traindataRnn.pickle',
          numofs=1,
          use_dropout=False,
          reload_=False,
          overwrite=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them
    charDict = {}
    with open(dictionaries, 'rb') as f:
        charDict = pkl.load(f)
    charDict_r = {}
    for kk, vv in charDict.items():
        charDict_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        print('Reloading model options')
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print('Loading data')
    train = dataIterator(datasets, dictionaries, batch_size)
    valid = dataIterator(valid_datasets, dictionaries, valid_batch_size)

    print('Building model')
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        print('Reloading model parameters')
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print('Building sampler')
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)

    # before any regularizer
    print('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)
    print('Done')

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.items():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print('Building f_cost...')
    f_cost = theano.function(inps, cost, profile=profile)
    print('Done')

    print('Computing gradient...')
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print('Done')

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print('Building optimizers...')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print('Done')

    print('Optimization')

    best_p = None
    bad_counter = 0
    uidx = 0
    estop = False
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        rmodel = numpy.load(saveto)
        history_errs = list(rmodel['history_errs'])
        if 'uidx' in rmodel:
            uidx = rmodel['uidx']

    if validFreq == -1:
        validFreq = len(train[0]) // batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) // batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) // batch_size

    for eidx in range(max_epochs):
        n_samples = 0

        for x in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, numofs=numofs)
            if x is None:
                print('Minibatch with zero sample')
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('NaN detected')
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print('Epoch %d Update %d Cost %f' % (eidx, uidx, cost))

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in range(numpy.minimum(5, x.shape[1])):
                    sample, score = gen_sample(tparams, f_init, f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               maxlen=7,
                                               argmax=False)
                    print('Source %d: ' % jj, end=' ')
                    for vv in x[:, jj]:
                        if vv in charDict_r:
                            print(charDict_r[vv], end=' ')
                        else:
                            print('UNK', end=' ')
                    print()
                    print('Truth %d: ' % jj, end=' ')
                    for vv in y[:, jj]:
                        if vv in charDict_r:
                            print(charDict_r[vv], end=' ')
                        else:
                            print('UNK', end=' ')
                    print()
                    print('Sample %d: ' % jj, end=' ')
                    ss = sample
                    for vv in ss:
                        if vv in charDict_r:
                            print(charDict_r[vv], end=' ')
                        else:
                            print('UNK', end=' ')
                    print()

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid, numofs=numofs)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print('Early Stop!')
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print('Valid %f' % valid_err)

            # save the best model so far, in addition, save the latest model
            # into a separate file with the iteration number for external eval
            if numpy.mod(uidx, saveFreq) == 0:
                print('Saving the best model...')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print('Done')

                # save with uidx
                if not overwrite:
                    print('Saving the model at iteration {}...'.format(uidx))
                    saveto_uidx = '{}.iter{}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx, history_errs=history_errs,
                                uidx=uidx, **unzip(tparams))
                    print('Done')

            # finish after this many updates
            if uidx >= finish_after:
                print('Finishing after %d iterations!' % uidx)
                estop = True
                break

        print('Seen %d samples' % n_samples)

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print('Valid %f' % valid_err)

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs,
                uidx=uidx, **params)

    return valid_err
# flag to remember when to change the learning rate
flag = 0
# exprate
exprate = 0

# worddicts
worddicts = load_dict(dictionaries[0])
worddicts_r = [None] * len(worddicts)
for kk, vv in worddicts.items():
    worddicts_r[vv] = kk

# load train data and test data
train, train_label = dataIterator(datasets[0], datasets[1], worddicts,
                                  batch_size=1,
                                  batch_Imagesize=batch_Imagesize,
                                  maxlen=maxlen,
                                  maxImagesize=maxImagesize)
len_train = len(train)

test, test_label = dataIterator(valid_datasets[0], valid_datasets[1], worddicts,
                                batch_size=1,
                                batch_Imagesize=batch_Imagesize,
                                maxlen=maxlen,
                                maxImagesize=maxImagesize)
len_test = len(test)
def train(
        dim_word=100,  # word vector dimensionality
        dim_enc=1000,
        dim_dec=1000,
        down_sample=0,
        dim_attention=500,
        dim_coverage=5,
        kernel_coverage=121,
        encoder='gru',
        decoder='gru_cond',
        patience=4,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=1e-8,  # learning rate
        dim_target=62,  # target vocabulary size
        dim_feature=123,  # input feature dimensionality
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=['feature.pkl', 'label.txt'],
        valid_datasets=['feature_valid.pkl', 'label_valid.txt'],
        dictionaries=['lexicon.txt'],
        valid_output=['decode.txt'],
        valid_result=['result.txt'],
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them
    worddicts = load_dict(dictionaries[0])
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train, train_uid_list = dataIterator(datasets[0], datasets[1], worddicts,
                                         batch_size=batch_size, maxlen=maxlen)
    valid, valid_uid_list = dataIterator(valid_datasets[0], valid_datasets[1],
                                         worddicts,
                                         batch_size=batch_size, maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c ** 2),
                              g / tensor.sqrt(g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    # print model parameters
    print "Model params:\n{0}".format(
        pprint.pformat(sorted([p for p in params])))
    # end

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train)
    if saveFreq == -1:
        saveFreq = len(train)
    if sampleFreq == -1:
        sampleFreq = len(train)

    uidx = 0
    estop = False
    halfLrFlag = 0
    bad_counter = 0
    ud_s = 0
    ud_epoch = 0
    cost_s = 0.
    for eidx in xrange(max_epochs):
        n_samples = 0

        random.shuffle(train)  # shuffle data
        ud_epoch_start = time.time()

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            ud_start = time.time()

            x, x_mask, y, y_mask = prepare_data(model_options, x, y,
                                                maxlen=maxlen)
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)
            cost_s += cost

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start
            ud_s += ud

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud_s /= 60.
                cost_s /= dispFreq
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost_s, \
                    'UD ', ud_s, 'lrate ', lrate, 'bad_counter', bad_counter
                ud_s = 0
                cost_s = 0.

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                fpp_sample = open(valid_output[0], 'w')
                valid_count_idx = 0
                # FIXME: random selection?
                for x, y in valid:
                    for xx in x:
                        xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                             dtype='float32')
                        xx_pad[:xx.shape[0], :] = xx
                        stochastic = False
                        sample, score = gen_sample(tparams, f_init, f_next,
                                                   xx_pad[:, None, :],
                                                   model_options, trng=trng,
                                                   k=10, maxlen=1000,
                                                   stochastic=stochastic,
                                                   argmax=False)
                        if stochastic:
                            ss = sample
                        else:
                            score = score / numpy.array(
                                [len(s) for s in sample])
                            ss = sample[score.argmin()]

                        fpp_sample.write(valid_uid_list[valid_count_idx])
                        valid_count_idx = valid_count_idx + 1
                        for vv in ss:
                            if vv == 0:  # <eol>
                                break
                            fpp_sample.write(' ' + worddicts_r[vv])
                        fpp_sample.write('\n')
                fpp_sample.close()
                print 'valid set decode done'
                ud_epoch = (time.time() - ud_epoch_start) / 60.
                print 'cost time ... ', ud_epoch

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err_cost = valid_errs.mean()

                # compute wer
                os.system('python compute-wer.py ' + valid_output[0] + ' ' +
                          valid_datasets[1] + ' ' + valid_result[0])
                fpp = open(valid_result[0])
                stuff = fpp.readlines()
                fpp.close()
                m = re.search('WER (.*)\n', stuff[0])
                valid_per = 100. * float(m.group(1))
                m = re.search('ExpRate (.*)\n', stuff[1])
                valid_sacc = 100. * float(m.group(1))
                # valid_err = 0.7 * valid_per - 0.3 * valid_sacc
                valid_err = valid_per
                history_errs.append(valid_err)

                # the first validation, or a new best model
                if uidx / validFreq == 0 or valid_err <= numpy.array(
                        history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0

                # if len(history_errs) > patience and valid_err >= \
                #         numpy.array(history_errs)[:-patience].min():
                #     bad_counter += 1
                #     if bad_counter > patience:
                #         print 'Early Stop!'
                #         estop = True
                #         break

                if uidx / validFreq != 0 and valid_err > numpy.array(
                        history_errs).min():
                    bad_counter += 1
                    if bad_counter > patience:
                        if halfLrFlag == 1:
                            print 'Early Stop!'
                            estop = True
                            break
                        else:
                            print 'Lr decay and retrain!'
                            bad_counter = 0
                            lrate /= 10
                            params = best_p
                            halfLrFlag += 1

                if numpy.isnan(valid_err):
                    # ipdb.set_trace()
                    print 'valid_err nan'

                print 'Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f' % (
                    valid_per, valid_sacc, valid_err_cost)

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs,
                **params)

    return valid_err
def main(model, bn_model, dictionary_target, fea, latex, saveto, output, k=5):
    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator(fea, latex, worddicts,
                                         batch_size=1,
                                         batch_Imagesize=500000,
                                         maxlen=500,
                                         maxImagesize=500000)

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)
    bn_params = init_bn_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    bn_params = load_params(bn_model, bn_params)
    tparams = init_tparams(params)
    bn_tparams = init_tparams(bn_params)
    f_init, f_next = build_sampler(tparams, bn_tparams, options, trng,
                                   use_noise)

    use_noise.set_value(0.)

    fpp_sample = open(saveto, 'w')
    valid_count_idx = 0
    # FIXME: random selection?
    print 'Decoding ... '
    for x, y in valid:
        for xx in x:
            print '%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx])
            xx_pad = numpy.zeros((xx.shape[0], xx.shape[1], xx.shape[2]),
                                 dtype='float32')  # input_channels * height * width
            xx_pad[:, :, :] = xx / 255.
            stochastic = False
            sample, score = gen_sample(f_init, f_next,
                                       xx_pad[None, :, :, :],
                                       options, trng=trng, k=10,
                                       maxlen=1000,
                                       stochastic=stochastic,
                                       argmax=False)

            if stochastic:
                ss = sample
            else:
                score = score / numpy.array([len(s) for s in sample])
                ss = sample[score.argmin()]

            fpp_sample.write(valid_uid_list[valid_count_idx])
            valid_count_idx = valid_count_idx + 1
            for vv in ss:
                if vv == 0:  # <eol>
                    break
                fpp_sample.write(' ' + worddicts_r[vv])
            fpp_sample.write('\n')

    fpp_sample.close()
    print 'test set decode done'

    os.system('python compute-wer.py ' + saveto + ' ' + latex + ' ' + output)
    fpp = open(output)  # %WER 31.63
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
def main(model, dictionary_target, source_fea, source_latex, saveto, wer_file,
         k=5):
    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator(source_fea, source_latex, worddicts,
                                         batch_size=1, maxlen=2000)

    trng = RandomStreams(1234)

    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)
    f_init, f_next = build_sampler(tparams, options, trng)

    fpp_sample = open(saveto, 'w')
    valid_count_idx = 0
    print 'Decoding...'
    ud_epoch = 0
    ud_epoch_start = time.time()
    for x, y in valid:
        for xx in x:
            print '%d : %s' % (valid_count_idx + 1,
                               valid_uid_list[valid_count_idx])
            xx_pad = numpy.zeros((xx.shape[0] + 1, xx.shape[1]),
                                 dtype='float32')
            xx_pad[:xx.shape[0], :] = xx
            stochastic = False
            sample, score = gen_sample(f_init, f_next, xx_pad[:, None, :],
                                       options, trng=trng, k=k, maxlen=1000,
                                       stochastic=stochastic, argmax=False)

            if stochastic:
                ss = sample
            else:
                score = score / numpy.array([len(s) for s in sample])
                ss = sample[score.argmin()]

            fpp_sample.write(valid_uid_list[valid_count_idx])
            valid_count_idx = valid_count_idx + 1
            for vv in ss:
                if vv == 0:  # <eol>
                    break
                fpp_sample.write(' ' + worddicts_r[vv])
            fpp_sample.write('\n')

    fpp_sample.close()
    ud_epoch = (time.time() - ud_epoch_start) / 60.
    print 'test set decode done, cost time ...', ud_epoch

    os.system('python compute-wer.py ' + saveto + ' ' + source_latex + ' ' +
              wer_file)
    fpp = open(wer_file)
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    valid_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    valid_sacc = 100. * float(m.group(1))

    print 'Valid WER: %.2f%%, ExpRate: %.2f%%' % (valid_per, valid_sacc)
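# A possible command-line entry point for the decoder above. This is an
# assumed sketch, not the original driver; the file names in the usage line
# are illustrative only.
if __name__ == '__main__':
    import sys
    # usage: python translate.py model.npz lexicon.txt feature_valid.pkl \
    #        label_valid.txt decode.txt wer.txt
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4],
         sys.argv[5], sys.argv[6], k=10)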
def getData(
        file='ad_action_state',
        batch_size=32,
        shuffle_each_epoch=True,
):
    df = pd.read_csv(file)
    df['adgroup_id'] = pd.Series(df['adgroup_id'], dtype=np.str)

    # build dict
    for _, row in df.iterrows():
        adgroup_id = row['adgroup_id']
        if adgroup_id not in adgroup_dict:
            adgroup_dict[adgroup_id] = len(adgroup_dict)

        effect_info = row['effect_data'].split(';')
        for s in effect_info:
            _, s = s.split(':')
            if _ not in tmp_days_set:
                tmp_days_set.add(_)
            s = s.split(',')
            for t in s:
                t, _ = t.split('=')
                if t not in effect_dict:
                    effect_dict[t] = len(effect_dict)

        pos_info = row['pos_ratio'].split(';')
        for s in pos_info:
            _, s = s.split(':')
            s = s.split(',')
            for t in s:
                t, _ = t.split('=')
                if t not in pos_dict:
                    pos_dict[t] = len(pos_dict)

        direct_info = row['direct_type_price'].split(';')
        for s in direct_info:
            _, s = s.split(':')
            s = s.split(',')
            for t in s:
                t, _ = t.split('=')
                if t not in direct_dict:
                    direct_dict[t] = len(direct_dict)

        ad_feature = row['ad_feature'].split(';')
        for s in ad_feature:
            _, s = s.split(':')
            s = s.split(',')
            for t in s:
                name, num = t.split('=')
                if name == 'member_id':
                    if num not in member_dict:
                        member_dict[num] = len(member_dict)
                elif name == 'campaign_id':
                    if num not in campaign_dict:
                        campaign_dict[num] = len(campaign_dict)
                elif name == 'adgroup_id':
                    if num not in adgroup_dict:
                        adgroup_dict[num] = len(adgroup_dict)
                elif name == 'item_id':
                    if num not in item_dict:
                        item_dict[num] = len(item_dict)
                elif name == 'cate_id':
                    if num not in cate_dict:
                        cate_dict[num] = len(cate_dict)
                elif name == 'commodity_id':
                    if num not in commodity_dict:
                        commodity_dict[num] = len(commodity_dict)
                elif name == 'node_id':
                    if num not in node_dict:
                        node_dict[num] = len(node_dict)

    tmp_days_list = list(tmp_days_set)
    list.sort(tmp_days_list)
    for i, x in enumerate(tmp_days_list):
        days_dict[x] = i
    print(days_dict)

    train_set = []
    test_set = []

    # parse data
    for _, row in df.iterrows():
        data = []
        # label
        data.append(row['label'])
        data.append(0)

        # ad feature
        adgroup_id = get_id(adgroup_dict, row['adgroup_id'])
        data.append(adgroup_id)
        ad_feature = (row['ad_feature'].split(';')[0]).split(':')[1]
        ad_feature = ad_feature.split(',')
        for x in ad_feature:
            name, entry = x.split('=')
            fid = 0
            if name == 'member_id':
                fid = get_id(member_dict, entry)
                data.append(fid)
            elif name == 'campaign_id':
                fid = get_id(campaign_dict, entry)
                data.append(fid)
            elif name == 'item_id':
                fid = get_id(item_dict, entry)
                data.append(fid)
            elif name == 'item_price':
                item_price = float(entry) / 100.0
                data.append(item_price)
            elif name == 'cate_id':
                fid = get_id(cate_dict, entry)
                data.append(fid)
            elif name == 'commodity_id':
                fid = get_id(commodity_dict, entry)
                data.append(fid)
            elif name == 'node_id':
                fid = get_id(node_dict, entry)
                data.append(fid)

        days_num = len(days_dict)

        # effect data
        effect_list = [[0.0] * len(effect_dict) for _ in range(days_num)]
        effect_data = row['effect_data'].split(';')
        mmax = np.array([0.0 for i in range(len(effect_dict))])
        for x in effect_data:
            day, entry = x.split(':')
            if day not in days_dict:
                continue
            day = days_dict[day]
            entry = entry.split(',')
            for o, y in enumerate(entry):
                name, num = y.split('=')
                num = float(num)
                name = get_id(effect_dict, name)
                mmax[o] = max(mmax[o], num)
                effect_list[day][name] = num
        tot_cost = 0
        for x in effect_list:
            tot_cost += x[2]
        data[1] = tot_cost
        # normalized
        for o, x in enumerate(effect_list):
            effect_arr = np.array(effect_list[o])
            effect_arr = 2 * (effect_arr / np.array(
                [max(mmax[i], 0.0000000001) for i in range(len(mmax))]) - 0.5)
            effect_list[o] = effect_arr.tolist()
        data.append(effect_list)

        # pos_ratio
        pos_list = [[0.0] * len(pos_dict) for _ in range(days_num)]
        pos_data = row['pos_ratio'].split(';')
        for x in pos_data:
            day, entry = x.split(':')
            if day not in days_dict:
                continue
            day = days_dict[day]
            entry = entry.split(',')
            for y in entry:
                name, num = y.split('=')
                num = float(num)
                name = get_id(pos_dict, name)
                pos_list[day][name] = num
        data.append(pos_list)

        # direct info
        direct_list = [[0.0] * len(direct_dict) for _ in range(days_num)]
        direct_mask = [[0.0] * len(direct_dict) for _ in range(days_num)]
        direct_data = row['direct_type_price'].split(';')
        for x in direct_data:
            day, entry = x.split(':')
            if day not in days_dict:
                continue
            day = days_dict[day]
            entry = entry.split(',')
            for y in entry:
                name, num = y.split('=')
                num = float(num)
                name = get_id(direct_dict, name)
                direct_list[day][name] = num
                direct_mask[day][name] = 1.0
        data.append(direct_list)
        data.append(direct_mask)

        # actions info
        direct_type_list = [[] for _ in range(days_num)]
        direct_val_list = [[] for _ in range(days_num)]
        pos_type_list = [[] for _ in range(days_num)]
        pos_val_list = [[] for _ in range(days_num)]
        actions_data = row['actions'].split(';')
        for x in actions_data:
            day, entry = x.split(':')
            if day not in days_dict:
                continue
            day = days_dict[day]
            entry = entry.split(',')
            for y in entry:
                if len(y) == 0:
                    continue
                a, b = y.split('-', 1)
                if a == '修改定向':  # action: modify targeting
                    a, b = b.split('->')
                    aa = a.split('-')[0]  # direct type
                    bb = a.split('-')[-1]
                    cc = b.split('-')[0]
                    bb = float(bb) / 100.0  # old price
                    cc = float(cc) / 100.0  # new price
                    if len(direct_type_list[day]) < 100:
                        direct_type_list[day].append(get_id(direct_dict, aa))
                        # direct_val_list[day].append(1)
                        direct_val_list[day].append(cc - bb)
                if a == '新增定向':  # action: add targeting
                    b = b.split('-')
                    aa = b[0]
                    bb = b[-2]
                    bb = float(bb) / 100.0
                    if len(direct_type_list[day]) < 100:
                        direct_type_list[day].append(
                            len(direct_dict) + get_id(direct_dict, aa))
                        # direct_val_list[day].append(1)
                        direct_val_list[day].append(bb)
                if a == '移除定向':  # action: remove targeting
                    b = b.split('-')
                    aa = b[0]
                    bb = b[-2]
                    bb = float(bb) / 100.0
                    if len(direct_type_list[day]) < 100:
                        direct_type_list[day].append(
                            len(direct_dict) + len(direct_dict) +
                            get_id(direct_dict, aa))
                        # direct_val_list[day].append(1)
                        direct_val_list[day].append(bb)
                if a == '新增资源位':  # action: add ad position
                    b = b.split('-')
                    aa = b[0]
                    bb = b[2]
                    bb = float(bb) / 100.0
                    if len(pos_type_list[day]) < 100:
                        if aa == '23':
                            pos_type_list[day].append(0)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                        if aa == '24':
                            pos_type_list[day].append(1)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                        if aa == '25':
                            pos_type_list[day].append(2)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                if a == '修改资源位':  # action: modify ad position
                    a, b = b.split('->')
                    aa = a.split('-')[0]
                    bb = a.split('-')[-1]
                    cc = b.split('-')[0]
                    bb = float(bb) / 100.0
                    cc = float(cc) / 100.0
                    if len(pos_type_list[day]) < 100:
                        if aa == '23':
                            pos_type_list[day].append(3)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(cc - bb)
                        if aa == '24':
                            pos_type_list[day].append(4)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(cc - bb)
                        if aa == '25':
                            pos_type_list[day].append(5)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(cc - bb)
                if a == '移除资源位':  # action: remove ad position
                    b = b.split('-')
                    aa = b[0]
                    bb = b[2]
                    bb = float(bb) / 100.0
                    if len(pos_type_list[day]) < 100:
                        if aa == '23':
                            pos_type_list[day].append(6)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                        if aa == '24':
                            pos_type_list[day].append(7)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)
                        if aa == '25':
                            pos_type_list[day].append(8)
                            # pos_val_list[day].append(1)
                            pos_val_list[day].append(bb)

        data.append(direct_type_list)
        data.append(direct_val_list)
        data.append(pos_type_list)
        data.append(pos_val_list)

        data_type = row['version']
        if data_type == 'train':
            train_set.append(data)
        elif data_type == 'test':
            test_set.append(data)

    print("train num: %d." % len(train_set))
    print("test num: %d." % len(test_set))

    train_data = dataIterator(train_set, batch_size=batch_size,
                              shuffle_each_epoch=shuffle_each_epoch)
    test_data = dataIterator(test_set, batch_size=batch_size,
                             shuffle_each_epoch=False)
    return train_data, test_data
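# getData above relies on a get_id helper that is not shown. A minimal sketch,
# assuming it maps a key to its id in the given dictionary and falls back to 0
# for unseen keys; the original code's unknown-id convention may differ.
def get_id(id_dict, key):
    return id_dict[key] if key in id_dict else 0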
print('total chars', len(worddicts))
worddicts_r = [None] * len(worddicts)
for kk, vv in worddicts.items():
    worddicts_r[vv] = kk

reworddicts = load_dict(dictionaries[1])
print('total relations', len(reworddicts))
reworddicts_r = [None] * len(reworddicts)
for kk, vv in reworddicts.items():
    reworddicts_r[vv] = kk

train, train_uid_list = dataIterator(datasets[0], datasets[1], datasets[2],
                                     worddicts, reworddicts,
                                     batch_size=batch_size,
                                     batch_Imagesize=batch_Imagesize,
                                     maxlen=maxlen,
                                     maxImagesize=maxImagesize)
valid, valid_uid_list = dataIterator(valid_datasets[0], valid_datasets[1],
                                     valid_datasets[2],
                                     worddicts, reworddicts,
                                     batch_size=valid_batch_size,
                                     batch_Imagesize=valid_batch_Imagesize,
                                     maxlen=maxlen,
                                     maxImagesize=maxImagesize)

# display
uidx = 0  # count batch