def test(cws,filename,output_path):
    def seg(char_seq,text):
        lens = cws.forward(char_seq)
        res, begin = [], 0
        for wlen in lens:
            res.append(''.join(text[begin:begin+wlen]))
            begin += wlen
        return res
    char_seqs = prepareData(cws.character_idx_map,filename,test=True)
    fo = open(output_path,'wb')
    seq_idx = 0
    for line in open(filename).readlines():
        sent = unicode(line.decode('utf8')).split()
        Left = 0
        output_sent = []
        for idx,word in enumerate(sent):
            if len(re.sub('\W','',word,flags=re.U))==0:
                if idx>Left:
                    words = seg(char_seqs[seq_idx],list(''.join(sent[Left:idx])))
                    seq_idx += 1
                    output_sent.extend(words)
                Left = idx+1
                output_sent.append(word)
        if Left!=len(sent):
            words = seg(char_seqs[seq_idx],list(''.join(sent[Left:])))
            seq_idx += 1
            output_sent.extend(words)
        output_sent = ' '.join(output_sent).encode('utf8')+'\r\n'
        fo.write(output_sent)
    fo.close()

def test(cws, filename, output_path):
    def seg(char_seq, text):
        lens = cws.forward(char_seq)
        res, begin = [], 0
        for wlen in lens:
            res.append(''.join(text[begin:begin + wlen]))
            begin += wlen
        return res
    char_seqs = prepareData(cws.character_idx_map, filename, test=True)
    # fo = open(output_path, 'wb')
    fo = open(output_path, 'w')
    seq_idx = 0
    for line in open(filename).readlines():
        # sent = str(line.decode('utf8')).split()
        sent = line.split()
        Left = 0
        output_sent = []
        for idx, word in enumerate(sent):
            if len(re.sub(r'\W', '', word, flags=re.U)) == 0:
                if idx > Left:
                    words = seg(char_seqs[seq_idx], list(''.join(sent[Left:idx])))
                    seq_idx += 1
                    output_sent.extend(words)
                Left = idx + 1
                output_sent.append(word)
        if Left != len(sent):
            words = seg(char_seqs[seq_idx], list(''.join(sent[Left:])))
            seq_idx += 1
            output_sent.extend(words)
        # output_sent = ' '.join(output_sent).encode('utf8') + '\r\n'
        output_sent = ' '.join(output_sent) + '\r\n'
        fo.write(output_sent)
    fo.close()

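# A minimal, self-contained sketch (toy data only) of what the inner seg()
# helper above does: it maps a list of predicted word lengths back onto the
# character sequence. In the real code the lengths come from cws.forward().
def seg_by_lengths(text, lens):
    res, begin = [], 0
    for wlen in lens:
        res.append(''.join(text[begin:begin + wlen]))
        begin += wlen
    return res

print(seg_by_lengths(list('abcdef'), [2, 1, 3]))  # ['ab', 'c', 'def']
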
def test(character_idx_map, options, params, path, filename, batch_size=512):
    X = tools.prepareData(character_idx_map, path, test=True)
    dropout = (1 - options['dropout_rate']) * np.ones(
        (options['ndims'], ), dtype=theano.config.floatX)
    start, n = 0, len(X)
    idx_list = range(n)
    lens = [len(x) for x in X]
    idx_list = sorted(idx_list, cmp=lambda x, y: cmp(lens[x], lens[y]))
    Y = []
    print 'count_test_sentences', len(X)
    for i in range(n // batch_size):
        batch_idx = idx_list[start:start + batch_size]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params, options, x, x_lens, dropout)
        Y.extend(sY)
        start += batch_size
    if start != n:
        batch_idx = idx_list[start:]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params, options, x, x_lens, dropout)
        Y.extend(sY)
    table = {}
    nb = 0
    for idx in idx_list:
        table[idx] = nb
        nb += 1
    output_result(Y, table, path, filename)

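# sorted(..., cmp=...) above is Python 2 only. A hedged sketch of the same
# length-based ordering with a Python 3 compatible key function (toy data):
toy_X = [[7, 8, 9], [4], [5, 6]]          # toy "sentences" of character ids
toy_lens = [len(x) for x in toy_X]
toy_idx_list = sorted(range(len(toy_X)), key=lambda i: toy_lens[i])
print(toy_idx_list)                       # [1, 2, 0] -- shortest sentences first
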
def main():
    # Model parameters
    degree = 13
    whis = 2.5
    lambda_ = 0.0001

    # Load the training data
    print("Loading the training data...")
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

    # Clean and prepare the training data
    print("Cleaning and preparing the training data...")
    y_train, tX_train, ids_train = prepareData(y, tX, ids, degree, whis)

    # Train our models
    print("Training the models...")
    weights_0, loss_0 = ridge_regression(y_train[0], tX_train[0], lambda_)
    weights_1, loss_1 = ridge_regression(y_train[1], tX_train[1], lambda_)
    weights_2, loss_2 = ridge_regression(y_train[2], tX_train[2], lambda_)
    weights_3, loss_3 = ridge_regression(y_train[3], tX_train[3], lambda_)

    # Load the dataset to predict
    print("Loading the test data...")
    y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    # Prepare the data in the same way as the training dataset
    print("Cleaning and preparing the test data...")
    y_test, tX_test, ids_test = prepareData(y_test, tX_test, ids_test, degree, whis)

    # Predict each class
    print("Predicting the test data...")
    y_pred_0 = predict_labels(weights_0, tX_test[0])
    y_pred_1 = predict_labels(weights_1, tX_test[1])
    y_pred_2 = predict_labels(weights_2, tX_test[2])
    y_pred_3 = predict_labels(weights_3, tX_test[3])

    # Concatenate the results
    y_pred = np.concatenate([y_pred_0, y_pred_1, y_pred_2, y_pred_3])
    ids_test = np.concatenate(
        [ids_test[0], ids_test[1], ids_test[2], ids_test[3]])

    # Write the results to a CSV file
    print("Writing the results...")
    create_csv_submission(ids_test, y_pred, OUTPUT_PATH)
    print("DONE! Your predictions are available in", OUTPUT_PATH)

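# Hedged sketch of the closed-form ridge regression used above. The project's
# own ridge_regression helper is not shown in this snippet, so the name
# ridge_regression_sketch and the 2*N*lambda_ scaling of the regularizer are
# illustrative assumptions, not the original implementation.
import numpy as np

def ridge_regression_sketch(y, tx, lambda_):
    """Solve (tx^T tx + 2*N*lambda_*I) w = tx^T y and return (w, MSE loss)."""
    n, d = tx.shape
    a = tx.T.dot(tx) + 2 * n * lambda_ * np.eye(d)
    b = tx.T.dot(y)
    w = np.linalg.solve(a, b)
    loss = np.mean((y - tx.dot(w)) ** 2) / 2
    return w, loss

# toy usage
tx_demo = np.random.rand(10, 3)
y_demo = tx_demo.dot(np.array([1.0, -2.0, 0.5]))
w_demo, loss_demo = ridge_regression_sketch(y_demo, tx_demo, 0.0001)
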
def test(character_idx_map, options, params, path, filename, batch_size = 512):
    X = tools.prepareData(character_idx_map,path,test=True)
    dropout = (1-options['dropout_rate'])*np.ones((options['ndims'],), dtype=theano.config.floatX)
    start,n = 0,len(X)
    idx_list = range(n)
    lens = [len(x) for x in X]
    idx_list = sorted(idx_list,cmp = lambda x,y: cmp(lens[x],lens[y]))
    Y = []
    print 'count_test_sentences',len(X)
    for i in range(n//batch_size):
        batch_idx = idx_list[start:start+batch_size]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params,options,x,x_lens,dropout)
        Y.extend(sY)
        start += batch_size
    if start!=n:
        batch_idx = idx_list[start:]
        x = [X[t] for t in batch_idx]
        x_lens = [lens[t] for t in batch_idx]
        x = tools.asMatrix(x)
        sY = tools.segment(params,options,x,x_lens,dropout)
        Y.extend(sY)
    table = {}
    nb = 0
    for idx in idx_list:
        table[idx] = nb
        nb += 1
    output_result(Y,table,path,filename)

def dy_train_model(max_epochs=50,
                   batch_size=256,
                   ndims=50,
                   nhiddens=50,
                   dropout_rate=0.2,
                   regularization=0.000001,
                   margin_loss_discount=0.2,
                   max_word_len=4,
                   start_point=1,
                   load_params=None,
                   max_sent_len=60,
                   beam_size=4,
                   shuffle_data=True,
                   train_file='../data/train',
                   dev_file='../data/dev',
                   lr=0.2,
                   pre_training='../w2v/c_vecs_50'):
    options = locals().copy()
    print 'Model options:'
    for kk, vv in options.iteritems():
        print '\t', kk, '\t', vv
    Cemb, character_idx_map = initCemb(ndims, train_file, pre_training)
    cws = CWS(Cemb, character_idx_map, options)
    if load_params is not None:
        cws.load(load_params)
    char_seq, _, truth = prepareData(character_idx_map, train_file)
    if max_sent_len is not None:
        survived = []
        for idx, seq in enumerate(char_seq):
            if len(seq) <= max_sent_len and len(seq) > 1:
                survived.append(idx)
        char_seq = [char_seq[idx] for idx in survived]
        truth = [truth[idx] for idx in survived]
    n = len(char_seq)
    print 'Total number of training instances:', n
    print 'Start training model'
    start_time = time.time()
    nsamples = 0
    for eidx in xrange(max_epochs):
        idx_list = range(n)
        if shuffle_data:
            random.shuffle(idx_list)
        for idx in idx_list:
            loss = cws.backward(char_seq[idx], truth[idx])
            if np.isnan(loss):
                print 'something went wrong, loss is nan.'
                return
            nsamples += 1
            if nsamples % batch_size == 0:
                cws.trainer.update(1. / batch_size)
        cws.trainer.update_epoch(1.)
        end_time = time.time()
        print 'Trained %s epoch(s) (%d samples) took %.1fs per epoch' % (
            eidx + 1, nsamples, (end_time - start_time) / (eidx + 1))
        test(cws, dev_file, '../result/dev_result%d' % (eidx + start_point))

def dy_train_model(max_epochs=30,
                   batch_size=256,
                   char_dims=50,
                   word_dims=100,
                   nhiddens=50,
                   dropout_rate=0.2,
                   margin_loss_discount=0.2,
                   max_word_len=4,
                   load_params=None,
                   max_sent_len=60,
                   shuffle_data=True,
                   train_file='../data/train',
                   dev_file='../data/dev',
                   lr=0.5,
                   edecay=0.1,
                   momentum=0.5,
                   pre_trained='../w2v/char_vecs_100',
                   word_proportion=0.5):
    options = locals().copy()  # Copy the local parameters into options
    print 'Model options:'
    for kk, vv in options.iteritems():
        print '\t', kk, '\t', vv
    # From train_file, take the most frequent characters to build the embedding matrix
    # Cemb: character embedding matrix {index: vector}
    # character_idx_map: {character: index}
    Cemb, character_idx_map = initCemb(char_dims, train_file, pre_trained)
    # Define parameters and trainer
    cws = CWS(Cemb, character_idx_map, options)
    # Load params and test
    if load_params is not None:
        cws.load(load_params)
        test(cws, dev_file, 'result')
    # Convert the word corpus to sentence/index lists
    # char_seq: [sentence[char_idx]]
    # truth: [sentence[char_label]]
    char_seq, _, truth = prepareData(character_idx_map, train_file)
    # Remove sentences that are too long or empty
    if max_sent_len is not None:
        survived = []
        for idx, seq in enumerate(char_seq):
            if len(seq) <= max_sent_len and len(seq) > 1:
                survived.append(idx)
        char_seq = [char_seq[idx] for idx in survived]
        truth = [truth[idx] for idx in survived]
    # Build the frequent-word matrix H
    if word_proportion > 0:
        word_counter = Counter()
        # For each sentence, loop over labels from 1 to n,
        # form each word as a tuple of character indices,
        # and count its occurrences
        for chars, labels in zip(char_seq, truth):
            word_counter.update(
                tuple(chars[idx - label:idx])
                for idx, label in enumerate(labels, 1))
        # Keep the most frequent words
        known_word_count = int(word_proportion * len(word_counter))
        known_words = dict(word_counter.most_common()
                           [:known_word_count])  # {word: occurrence count}
        idx = 0
        # Remap known_words to {word: index}
        for word in known_words:
            known_words[word] = idx
            idx += 1
        # We keep a short list H of the most frequent words and build the parameter matrix H:
        # register known_words and params['word_embed'] as lookup parameters
        cws.use_word_embed(known_words)
    n = len(char_seq)
    print 'Total number of training instances:', n
    print 'Start training model'
    start_time = time.time()
    nsamples = 0
    for eidx in xrange(max_epochs):
        idx_list = range(n)
        # Shuffle the sentences
        if shuffle_data:
            np.random.shuffle(idx_list)
        total_loss = 0
        total_times = 0
        for idx in idx_list:
            loss = cws.backward(char_seq[idx], truth[idx])  # Build the computation graph
            total_loss += loss
            if np.isnan(loss):
                print 'something went wrong, loss is nan.'
                return
            nsamples += 1
            if nsamples % batch_size == 0:
                cws.trainer.update()
                total_times += batch_size
                print '%s/%s, average loss:%s' % (total_times, n, total_loss / batch_size)
                total_loss = 0
        # edecay is not available after DyNet 1.0,
        # so the learning rate is updated manually
        cws.trainer.learning_rate /= 1 + options['edecay']
        total_times = 0
        # Deprecated:
        # cws.trainer.update_epoch(1.)
        end_time = time.time()
        print 'Trained %s epoch(s) (%d samples) took %.1fs per epoch' % (
            eidx + 1, nsamples, (end_time - start_time) / (eidx + 1))
        test(cws, dev_file, '../result/dev_result%d' % (eidx + 1))
        os.system('python score.py %s %d %d' % (dev_file, eidx + 1, eidx + 1))
        cws.save('epoch%d' % (eidx + 1))
        print 'Current model saved'

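# Self-contained sketch of the known_words construction above: keep the top
# word_proportion of distinct words by frequency and give each a dense index.
# The counter contents here are invented for illustration.
from collections import Counter

demo_counter = Counter({('a', 'b'): 5, ('c',): 3, ('d', 'e'): 1})
demo_proportion = 0.5
demo_count = int(demo_proportion * len(demo_counter))          # keeps 1 word
demo_known = dict(demo_counter.most_common()[:demo_count])     # {('a', 'b'): 5}
demo_known = {word: i for i, word in enumerate(demo_known)}    # {('a', 'b'): 0}
print(demo_known)
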
def dy_train_model(
    max_epochs = 50,
    batch_size = 256,
    ndims = 50,
    nhiddens = 50,
    dropout_rate = 0.2,
    regularization = 0.000001,
    margin_loss_discount = 0.2,
    max_word_len = 4,
    start_point = 1,
    load_params = None,
    max_sent_len = 60,
    beam_size = 4,
    shuffle_data = True,
    train_file = '../data/train',
    dev_file = '../data/dev',
    lr = 0.2,
    pre_training = '../w2v/c_vecs_50'
):
    options = locals().copy()
    print 'Model options:'
    for kk,vv in options.iteritems():
        print '\t',kk,'\t',vv
    Cemb, character_idx_map = initCemb(ndims,train_file,pre_training)
    cws = CWS(Cemb,character_idx_map,options)
    if load_params is not None:
        cws.load(load_params)
    char_seq, _, truth = prepareData(character_idx_map,train_file)
    if max_sent_len is not None:
        survived = []
        for idx,seq in enumerate(char_seq):
            if len(seq)<=max_sent_len and len(seq)>1:
                survived.append(idx)
        char_seq = [ char_seq[idx] for idx in survived]
        truth = [ truth[idx] for idx in survived]
    n = len(char_seq)
    print 'Total number of training instances:',n
    print 'Start training model'
    start_time = time.time()
    nsamples = 0
    for eidx in xrange(max_epochs):
        idx_list = range(n)
        if shuffle_data:
            random.shuffle(idx_list)
        for idx in idx_list:
            loss = cws.backward(char_seq[idx],truth[idx])
            if np.isnan(loss):
                print 'something went wrong, loss is nan.'
                return
            nsamples += 1
            if nsamples % batch_size == 0:
                cws.trainer.update(1./batch_size)
        cws.trainer.update_epoch(1.)
        end_time = time.time()
        print 'Trained %s epoch(s) (%d samples) took %.1fs per epoch'%(eidx+1,nsamples,(end_time-start_time)/(eidx+1))
        test(cws,dev_file,'../result/dev_result%d'%(eidx+start_point))

def dy_train_model(max_epochs=30,
                   batch_size=256,
                   char_dims=50,
                   word_dims=100,
                   nhiddens=50,
                   dropout_rate=0.2,
                   margin_loss_discount=0.2,
                   max_word_len=4,
                   load_params=None,
                   max_sent_len=60,
                   shuffle_data=True,
                   train_file='../data/train',
                   dev_file='../data/dev',
                   lr=0.5,
                   edecay=0.1,
                   momentum=0.5,
                   pre_trained='../w2v/char_vecs_100',
                   word_proportion=0.5):
    options = locals().copy()
    print 'Model options:'
    for kk, vv in options.iteritems():
        print '\t', kk, '\t', vv
    Cemb, character_idx_map = initCemb(char_dims, train_file, pre_trained)
    cws = CWS(Cemb, character_idx_map, options)
    if load_params is not None:
        cws.load(load_params)
        test(cws, dev_file, 'result')
    char_seq, _, truth = prepareData(character_idx_map, train_file)
    if max_sent_len is not None:
        survived = []
        for idx, seq in enumerate(char_seq):
            if len(seq) <= max_sent_len and len(seq) > 1:
                survived.append(idx)
        char_seq = [char_seq[idx] for idx in survived]
        truth = [truth[idx] for idx in survived]
    if word_proportion > 0:
        word_counter = Counter()
        for chars, labels in zip(char_seq, truth):
            word_counter.update(
                tuple(chars[idx - label:idx])
                for idx, label in enumerate(labels, 1))
        known_word_count = int(word_proportion * len(word_counter))
        known_words = dict(word_counter.most_common()[:known_word_count])
        idx = 0
        for word in known_words:
            known_words[word] = idx
            idx += 1
        cws.use_word_embed(known_words)
    n = len(char_seq)
    print 'Total number of training instances:', n
    print 'Start training model'
    start_time = time.time()
    nsamples = 0
    for eidx in xrange(max_epochs):
        idx_list = range(n)
        if shuffle_data:
            np.random.shuffle(idx_list)
        for idx in idx_list:
            loss = cws.backward(char_seq[idx], truth[idx])
            if np.isnan(loss):
                print 'something went wrong, loss is nan.'
                return
            nsamples += 1
            if nsamples % batch_size == 0:
                cws.trainer.update(1.)
        cws.trainer.update_epoch(1.)
        end_time = time.time()
        print 'Trained %s epoch(s) (%d samples) took %.1fs per epoch' % (
            eidx + 1, nsamples, (end_time - start_time) / (eidx + 1))
        test(cws, dev_file, '../result/dev_result%d' % (eidx + 1))
        os.system('python score.py %s %d %d' % (dev_file, eidx + 1, eidx + 1))
        cws.save('epoch%d' % (eidx + 1))
        print 'Current model saved'

def train_model(max_epochs=30,
                optimizer=adadelta,
                batch_size=256,
                ndims=100,
                nhiddens=150,
                dropout_rate=0.,
                regularization=0.,
                margin_loss_discount=0.2,
                max_word_len=4,
                start_point=1,
                load_params=None,
                resume_training=False,
                max_sent_len=60,
                beam_size=4,
                shuffle_data=True,
                train_file='../data/train',
                dev_file='../data/dev',
                lr=0.2,
                pre_training='../w2v/c_vecs_100'):
    options = locals().copy()
    print 'model options:', options
    print 'Building model'
    Cemb, character_idx_map = tools.initCemb(ndims, train_file, pre_training)
    print 'saving config file'
    config = {}
    config['options'] = options
    config['options']['optimizer'] = optimizer.__name__
    config['character_idx_map'] = character_idx_map
    f = open('config', 'wb')
    f.write(json.dumps(config))
    f.close()
    print 'resume model building'
    params = initParams(Cemb, options)
    if load_params is not None:
        pp = np.load(load_params)
        for kk, vv in params.iteritems():
            if kk not in pp:
                raise Warning('%s is not in the archive' % kk)
            params[kk] = pp[kk]
    tparams = initTparams(params)
    if optimizer is adadelta:
        ms_up, ms_grad = prepare_adadelta(tparams)
    if optimizer is adagrad:
        if resume_training:
            ss_grad = initTparams(np.load('backup.npz'))
        else:
            ss_grad = prepare_adagrad(tparams)
    T_x, T_dropout, T_y, T_yy, T_y_mask, T_yy_mask, T_cost = build_model(
        tparams, options)
    weight_decay = (tparams['U']**2).sum() + (tparams['Wy']**2).sum()
    weight_decay *= regularization
    T_cost += weight_decay
    if optimizer is adadelta:
        T_updates = optimizer(ms_up, ms_grad, tparams, T_cost)
    elif optimizer is sgd:
        LR, T_updates = optimizer(tparams, T_cost, lr)
    elif optimizer is adagrad:
        T_updates = optimizer(ss_grad, tparams, T_cost, lr)
    f_update = theano.function(
        [T_x, T_dropout, T_y, T_yy, T_y_mask, T_yy_mask],
        T_cost,
        updates=T_updates)
    print 'Loading data'
    seqs, lenss, tagss = tools.prepareData(character_idx_map, train_file)
    if max_sent_len is not None:
        survived = []
        for idx, seq in enumerate(seqs):
            if len(seq) <= max_sent_len and len(seq) > 1:
                survived.append(idx)
        seqs = [seqs[idx] for idx in survived]
        lenss = [lenss[idx] for idx in survived]
        tagss = [tagss[idx] for idx in survived]
    tot_lens = [len(seq) for seq in seqs]
    print 'count_training_sentences', len(seqs)
    print 'Training model'
    start_time = time.time()
    for eidx in xrange(max_epochs):
        batches_idx = get_minibatches_idx(seqs, tot_lens, batch_size,
                                          shuffle=shuffle_data)
        for batch_idx in batches_idx:
            X = [seqs[t] for t in batch_idx]
            Y = [lenss[t] for t in batch_idx]
            Z = [tagss[t] for t in batch_idx]
            X_lens = [tot_lens[t] for t in batch_idx]
            params = get_params(tparams)
            X = tools.asMatrix(X)
            dropout = np.random.binomial(1, 1 - dropout_rate,
                                         (X.shape[1], ndims)).astype(
                                             theano.config.floatX)
            #numpy_start = time.time()
            YY = tools.segment(params, options, X, X_lens, dropout,
                               margin_loss_discount, Z)
            #print 'numpy',time.time()-numpy_start
            Y = tools.asMatrix(Y, transpose=True)
            YY = tools.asMatrix(YY, transpose=True)
            Y_mask = (Y / Y).astype(theano.config.floatX)
            YY_mask = (YY / YY).astype(theano.config.floatX)
            #theano_start = time.time()
            f_update(X, dropout, Y, YY, Y_mask, YY_mask)
            #print 'theano',time.time()-theano_start
        if optimizer is sgd:
            LR.set_value(numpy_floatX(LR.get_value() * 0.9))
        params = get_params(tparams)
        test(config['character_idx_map'], config['options'], params, dev_file,
             '../result/dev_result%s' % (eidx + start_point, ))
        np.savez('epoch_%s' % (eidx + start_point, ), **params)
        if optimizer is adagrad:
            np.savez('backup', **get_params(ss_grad))
        end_time = time.time()
        print 'Trained %s epoch(s) took %.1fs per epoch' % (
            eidx + 1, (end_time - start_time) / (eidx + 1))

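# Minimal sketch (numpy only) of the Bernoulli dropout mask built inside the
# batch loop above; the (3, 5) shape is illustrative, the real code uses
# (X.shape[1], ndims).
import numpy as np

demo_rate = 0.2
demo_mask = np.random.binomial(1, 1 - demo_rate, (3, 5)).astype('float32')
print(demo_mask)  # entries are 1.0 with probability 0.8, else 0.0
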
def train_model(
    max_epochs = 30,
    optimizer = adadelta,
    batch_size = 256,
    ndims = 100,
    nhiddens = 150,
    dropout_rate = 0.,
    regularization = 0.,
    margin_loss_discount = 0.2,
    max_word_len = 4,
    start_point = 1,
    load_params = None,
    resume_training = False,
    max_sent_len = 60,
    beam_size = 4,
    shuffle_data = True,
    train_file = '../data/train',
    dev_file = '../data/dev',
    lr = 0.2,
    pre_training = '../w2v/c_vecs_100'
):
    options = locals().copy()
    print 'model options:',options
    print 'Building model'
    Cemb,character_idx_map = tools.initCemb(ndims,train_file,pre_training)
    print 'saving config file'
    config = {}
    config['options'] = options
    config['options']['optimizer'] = optimizer.__name__
    config['character_idx_map'] = character_idx_map
    f = open('config','wb')
    f.write(json.dumps(config))
    f.close()
    print 'resume model building'
    params = initParams(Cemb,options)
    if load_params is not None:
        pp = np.load(load_params)
        for kk,vv in params.iteritems():
            if kk not in pp:
                raise Warning('%s is not in the archive' % kk)
            params[kk] = pp[kk]
    tparams = initTparams(params)
    if optimizer is adadelta:
        ms_up,ms_grad = prepare_adadelta(tparams)
    if optimizer is adagrad:
        if resume_training:
            ss_grad = initTparams(np.load('backup.npz'))
        else:
            ss_grad = prepare_adagrad(tparams)
    T_x,T_dropout,T_y,T_yy,T_y_mask,T_yy_mask,T_cost = build_model(tparams,options)
    weight_decay = (tparams['U']**2).sum()+(tparams['Wy']**2).sum()
    weight_decay *= regularization
    T_cost += weight_decay
    if optimizer is adadelta:
        T_updates = optimizer(ms_up,ms_grad,tparams,T_cost)
    elif optimizer is sgd:
        LR,T_updates = optimizer(tparams,T_cost,lr)
    elif optimizer is adagrad:
        T_updates = optimizer(ss_grad,tparams,T_cost,lr)
    f_update = theano.function([T_x,T_dropout,T_y,T_yy,T_y_mask,T_yy_mask],T_cost,updates=T_updates)
    print 'Loading data'
    seqs,lenss,tagss = tools.prepareData(character_idx_map,train_file)
    if max_sent_len is not None:
        survived = []
        for idx,seq in enumerate(seqs):
            if len(seq)<=max_sent_len and len(seq)>1:
                survived.append(idx)
        seqs = [ seqs[idx] for idx in survived]
        lenss = [ lenss[idx] for idx in survived]
        tagss = [ tagss[idx] for idx in survived]
    tot_lens = [len(seq) for seq in seqs]
    print 'count_training_sentences',len(seqs)
    print 'Training model'
    start_time = time.time()
    for eidx in xrange(max_epochs):
        batches_idx = get_minibatches_idx(seqs,tot_lens,batch_size,shuffle=shuffle_data)
        for batch_idx in batches_idx:
            X = [seqs[t] for t in batch_idx]
            Y = [lenss[t] for t in batch_idx]
            Z = [tagss[t] for t in batch_idx]
            X_lens = [tot_lens[t] for t in batch_idx]
            params = get_params(tparams)
            X = tools.asMatrix(X)
            dropout = np.random.binomial(1,1-dropout_rate,(X.shape[1],ndims)).astype(theano.config.floatX)
            #numpy_start = time.time()
            YY = tools.segment(params,options,X,X_lens,dropout,margin_loss_discount,Z)
            #print 'numpy',time.time()-numpy_start
            Y = tools.asMatrix(Y,transpose=True)
            YY = tools.asMatrix(YY,transpose=True)
            Y_mask = (Y/Y).astype(theano.config.floatX)
            YY_mask = (YY/YY).astype(theano.config.floatX)
            #theano_start = time.time()
            f_update(X,dropout,Y,YY,Y_mask,YY_mask)
            #print 'theano',time.time()-theano_start
        if optimizer is sgd:
            LR.set_value(numpy_floatX(LR.get_value()*0.9))
        params = get_params(tparams)
        test(config['character_idx_map'],config['options'],params,dev_file,'../result/dev_result%s'%(eidx+start_point,))
        np.savez('epoch_%s'%(eidx+start_point,),**params)
        if optimizer is adagrad:
            np.savez('backup',**get_params(ss_grad))
        end_time = time.time()
        print 'Trained %s epoch(s) took %.1fs per epoch'%(eidx+1,(end_time-start_time)/(eidx+1))