def main(args):
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        # for line in f:
        #     Xc = line.rstrip('\n')
        #     Xt.append(Xc[:MAX_LENGTH])
        # Input dataset with tweet+emoji instead of just tweets
        data = csv.reader(f, delimiter=',', quotechar="|")
        for line in data:
            Xc = line[0].rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK'
                                      for r in ranks[idx, :5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    # Author: jagathshree
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with io.open('%s/predicted_tags.txt' % save_path, 'w') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
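# The encoder and annotation scripts in this section call an `invert` helper
# to map label indices back to hashtag strings; it is not defined here. A
# minimal sketch, assuming it simply flips a {label: index} dictionary
# (behavior inferred from usage, not confirmed against the original module):
def invert(d):
    return {v: k for (k, v) in d.items()}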
def main(train_path, val_path, save_path, num_epochs=50):
    global T1
    print('NUM EPOCHS %i' % num_epochs)

    # save settings
    shutil.copyfile('settings_char.py', '%s/settings_char.txt' % save_path)

    print("Preparing Data...")
    # Training data
    Xt = []
    yt = []
    with io.open(train_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc)
    # Validation data
    Xv = []
    yv = []
    with io.open(val_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xv.append(Xc[:MAX_LENGTH])
            yv.append(yc.split(','))

    print("Building Model...")
    if not RELOAD_MODEL:
        # Build dictionaries from training data
        chardict, charcount = batch.build_dictionary(Xt)
        n_char = len(chardict.keys()) + 1
        batch.save_dictionary(chardict, charcount, '%s/dict.pkl' % save_path)

        # params
        params = init_params(n_chars=n_char)

        labeldict, labelcount = batch.build_label_dictionary(yt)
        batch.save_dictionary(labeldict, labelcount, '%s/label_dict.pkl' % save_path)

        n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

        # classification params
        params['W_cl'] = theano.shared(
            np.random.normal(loc=0., scale=SCALE, size=(WDIM, n_classes)).astype('float32'),
            name='W_cl')
        params['b_cl'] = theano.shared(np.zeros((n_classes)).astype('float32'), name='b_cl')
    else:
        print("Loading model params from base path: %s ..." % save_path)
        params = load_params_shared('%s/model.npz' % save_path)

        print("Loading dictionaries...")
        with open('%s/dict.pkl' % save_path, 'rb') as f:
            chardict = pkl.load(f)
        with open('%s/label_dict.pkl' % save_path, 'rb') as f:
            labeldict = pkl.load(f)
        n_char = len(chardict.keys()) + 1
        n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    train_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH,
                                   max_classes=MAX_CLASSES)
    val_iter = batch.BatchTweets(Xv, yv, labeldict, batch_size=N_BATCH,
                                 max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.ivector()

    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions, net, emb = classify(tweet, t_mask, params, n_classes, n_char)

    # batch loss
    loss = lasagne.objectives.categorical_crossentropy(predictions, targets)
    cost = T.mean(loss) + REGULARIZATION * lasagne.regularization.regularize_network_params(
        net, lasagne.regularization.l2)
    cost_only = T.mean(loss)
    reg_only = REGULARIZATION * lasagne.regularization.regularize_network_params(
        net, lasagne.regularization.l2)

    # params and updates
    print("Computing updates...")
    lr = LEARNING_RATE
    mu = MOMENTUM
    updates = lasagne.updates.nesterov_momentum(
        cost, lasagne.layers.get_all_params(net), lr, momentum=mu)

    # Theano function
    print("Compiling theano functions...")
    inps = [tweet, t_mask, targets]
    predict = theano.function([tweet, t_mask], predictions)
    cost_val = theano.function(inps, [cost_only, emb])
    train = theano.function(inps, cost, updates=updates)
    reg_val = theano.function([], reg_only)

    # Training
    print("Training...")
    uidx = 0
    maxp = 0.
    start = time.time()
    valcosts = []
    try:
        for epoch in range(num_epochs):
            n_samples = 0
            train_cost = 0.
            print("Epoch {}".format(epoch))

            # learning schedule
            if len(valcosts) > 1 and SCHEDULE:
                change = (valcosts[-1] - valcosts[-2]) / abs(valcosts[-2])
                if change < T1:
                    lr, mu = schedule(lr, mu)
                    updates = lasagne.updates.nesterov_momentum(
                        cost, lasagne.layers.get_all_params(net), lr, momentum=mu)
                    train = theano.function(inps, cost, updates=updates)
                    T1 = T1 / 2

            # stopping criterion
            if len(valcosts) > 6:
                deltas = []
                for i in range(5):
                    deltas.append((valcosts[-i - 1] - valcosts[-i - 2]) / abs(valcosts[-i - 2]))
                if sum(deltas) / len(deltas) < T2:
                    break

            ud_start = time.time()
            for xr, y in train_iter:
                n_samples += len(xr)
                uidx += 1
                x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
                if x is None:
                    print("Minibatch with zero samples under maxlength.")
                    uidx -= 1
                    continue

                curr_cost = train(x, x_m, y)
                train_cost += curr_cost * len(xr)
                ud = time.time() - ud_start

                if np.isnan(curr_cost) or np.isinf(curr_cost):
                    print("Nan detected.")
                    return

                if np.mod(uidx, DISPF) == 0:
                    print("Epoch {} Update {} Cost {} Time {}".format(
                        epoch, uidx, curr_cost, ud))

                if np.mod(uidx, SAVEF) == 0:
                    print("Saving...")
                    saveparams = OrderedDict()
                    for kk, vv in params.iteritems():
                        saveparams[kk] = vv.get_value()
                    np.savez('%s/model.npz' % save_path, **saveparams)
                    print("Done.")

            print("Testing on Validation set...")
            preds = []
            targs = []
            for xr, y in val_iter:
                x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
                if x is None:
                    print("Validation: Minibatch with zero samples under maxlength.")
                    continue

                vp = predict(x, x_m)
                ranks = np.argsort(vp)[:, ::-1]
                for idx, item in enumerate(xr):
                    preds.append(ranks[idx, :])
                    targs.append(y[idx])

            validation_cost = precision(np.asarray(preds), targs, 1)
            regularization_cost = reg_val()

            if validation_cost > maxp:
                maxp = validation_cost
                saveparams = OrderedDict()
                for kk, vv in params.iteritems():
                    saveparams[kk] = vv.get_value()
                np.savez('%s/best_model.npz' % save_path, **saveparams)

            print("Epoch {} Training Cost {} Validation Precision {} "
                  "Regularization Cost {} Max Precision {}".format(
                      epoch, train_cost / n_samples, validation_cost,
                      regularization_cost, maxp))
            print("Seen {} samples.".format(n_samples))
            valcosts.append(validation_cost)

            print("Saving...")
            saveparams = OrderedDict()
            for kk, vv in params.iteritems():
                saveparams[kk] = vv.get_value()
            np.savez('%s/model_%d.npz' % (save_path, epoch), **saveparams)
            print("Done.")

    except KeyboardInterrupt:
        pass
    print("Total training time = {}".format(time.time() - start))
def main(args):
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_pred.append(' '.join([inverse_labeldict[r] for r in ranks[idx, :5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    with io.open('%s/predicted_tags.txt' % save_path, 'w') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
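# None of the scripts in this section include their entry point. A minimal
# sketch (an assumption, not the original repo's wrapper) of how the
# argv-style `main(args)` variants above are presumably invoked:
#
#     python encode_char.py <data_file> <model_dir> <save_dir> [<model_num>]
#
import sys

if __name__ == '__main__':
    main(sys.argv[1:])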
def main(train_path, val_path, save_path, num_epochs=NUM_EPOCHS):
    global T1

    # save settings
    shutil.copyfile('settings_char.py', '%s/settings_char.txt' % save_path)

    print("Preparing Data...")
    # Training data
    Xt = []
    yt = []
    with io.open(train_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc)
    # Validation data
    Xv = []
    yv = []
    with io.open(val_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xv.append(Xc[:MAX_LENGTH])
            yv.append(yc.split(','))

    print("Building Model...")
    if not RELOAD_MODEL:
        # Build dictionaries from training data
        chardict, charcount = batch.build_dictionary(Xt)
        n_char = len(chardict.keys()) + 1
        batch.save_dictionary(chardict, charcount, '%s/dict.pkl' % save_path)

        # params
        params = init_params(n_chars=n_char)

        labeldict, labelcount = batch.build_label_dictionary(yt)
        batch.save_dictionary(labeldict, labelcount, '%s/label_dict.pkl' % save_path)

        n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

        # classification params
        params['W_cl'] = theano.shared(
            np.random.normal(loc=0., scale=SCALE, size=(WDIM, n_classes)).astype('float32'),
            name='W_cl')
        params['b_cl'] = theano.shared(np.zeros((n_classes)).astype('float32'), name='b_cl')
    else:
        print("Loading model params...")
        params = load_params_shared('%s/model.npz' % save_path)

        print("Loading dictionaries...")
        with open('%s/dict.pkl' % save_path, 'rb') as f:
            chardict = pkl.load(f)
        with open('%s/label_dict.pkl' % save_path, 'rb') as f:
            labeldict = pkl.load(f)
        n_char = len(chardict.keys()) + 1
        n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    train_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH,
                                   max_classes=MAX_CLASSES)
    val_iter = batch.BatchTweets(Xv, yv, labeldict, batch_size=N_BATCH,
                                 max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.ivector()

    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions, net, emb = classify(tweet, t_mask, params, n_classes, n_char)

    # batch loss
    loss = lasagne.objectives.categorical_crossentropy(predictions, targets)
    cost = T.mean(loss) + REGULARIZATION * lasagne.regularization.regularize_network_params(
        net, lasagne.regularization.l2)
    cost_only = T.mean(loss)
    reg_only = REGULARIZATION * lasagne.regularization.regularize_network_params(
        net, lasagne.regularization.l2)

    # params and updates
    print("Computing updates...")
    lr = LEARNING_RATE
    mu = MOMENTUM
    updates = lasagne.updates.nesterov_momentum(
        cost, lasagne.layers.get_all_params(net), lr, momentum=mu)

    # Theano function
    print("Compiling theano functions...")
    inps = [tweet, t_mask, targets]
    predict = theano.function([tweet, t_mask], predictions)
    cost_val = theano.function(inps, [cost_only, emb])
    train = theano.function(inps, cost, updates=updates)
    reg_val = theano.function([], reg_only)

    # Training
    print("Training...")
    uidx = 0
    maxp = 0.
    start = time.time()
    valcosts = []
    try:
        for epoch in range(num_epochs):
            n_samples = 0
            train_cost = 0.
            print("Epoch {}".format(epoch))

            # learning schedule
            if len(valcosts) > 1 and SCHEDULE:
                change = (valcosts[-1] - valcosts[-2]) / abs(valcosts[-2])
                if change < T1:
                    lr, mu = schedule(lr, mu)
                    updates = lasagne.updates.nesterov_momentum(
                        cost, lasagne.layers.get_all_params(net), lr, momentum=mu)
                    train = theano.function(inps, cost, updates=updates)
                    T1 = T1 / 2

            # stopping criterion
            if len(valcosts) > 6:
                deltas = []
                for i in range(5):
                    deltas.append((valcosts[-i - 1] - valcosts[-i - 2]) / abs(valcosts[-i - 2]))
                if sum(deltas) / len(deltas) < T2:
                    break

            ud_start = time.time()
            for xr, y in train_iter:
                n_samples += len(xr)
                uidx += 1
                x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
                if x is None:
                    print("Minibatch with zero samples under maxlength.")
                    uidx -= 1
                    continue

                curr_cost = train(x, x_m, y)
                train_cost += curr_cost * len(xr)
                ud = time.time() - ud_start

                if np.isnan(curr_cost) or np.isinf(curr_cost):
                    print("Nan detected.")
                    return

                if np.mod(uidx, DISPF) == 0:
                    print("Epoch {} Update {} Cost {} Time {}".format(
                        epoch, uidx, curr_cost, ud))

                if np.mod(uidx, SAVEF) == 0:
                    print("Saving...")
                    saveparams = OrderedDict()
                    for kk, vv in params.iteritems():
                        saveparams[kk] = vv.get_value()
                    np.savez('%s/model.npz' % save_path, **saveparams)
                    print("Done.")

            print("Testing on Validation set...")
            preds = []
            targs = []
            for xr, y in val_iter:
                x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
                if x is None:
                    print("Validation: Minibatch with zero samples under maxlength.")
                    continue

                vp = predict(x, x_m)
                ranks = np.argsort(vp)[:, ::-1]
                for idx, item in enumerate(xr):
                    preds.append(ranks[idx, :])
                    targs.append(y[idx])

            validation_cost = precision(np.asarray(preds), targs, 1)
            regularization_cost = reg_val()

            if validation_cost > maxp:
                maxp = validation_cost
                saveparams = OrderedDict()
                for kk, vv in params.iteritems():
                    saveparams[kk] = vv.get_value()
                np.savez('%s/best_model.npz' % save_path, **saveparams)

            print("Epoch {} Training Cost {} Validation Precision {} "
                  "Regularization Cost {} Max Precision {}".format(
                      epoch, train_cost / n_samples, validation_cost,
                      regularization_cost, maxp))
            print("Seen {} samples.".format(n_samples))
            valcosts.append(validation_cost)

            print("Saving...")
            saveparams = OrderedDict()
            for kk, vv in params.iteritems():
                saveparams[kk] = vv.get_value()
            np.savez('%s/model_%d.npz' % (save_path, epoch), **saveparams)
            print("Done.")

    except KeyboardInterrupt:
        pass
    print("Total training time = {}".format(time.time() - start))
def main(args):
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc.split(','))

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    test_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH,
                                  max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.imatrix()

    # masks
    t_mask = T.fmatrix()

    # network for prediction (build the graph once and reuse both outputs)
    net_out = classify(tweet, t_mask, params, n_classes, n_char)
    predictions, embeddings = net_out[0], net_out[1]

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save
    print("Saving...")
    with open('%s/data.pkl' % save_path, 'wb') as f:
        pkl.dump(out_data, f)
    with open('%s/predictions.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_pred))
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
    with open('%s/targets.pkl' % save_path, 'wb') as f:
        pkl.dump(out_target, f)
def main(args):
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    # predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    print("Num Batches: " + str(numbatches))
    for i in range(numbatches):
        print("processing batch " + str(i))
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        # p = predict(x, x_m)
        e = encode(x, x_m)
        # ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            # out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK'
            #                           for r in ranks[idx, :5]]))
            out_emb.append(e[idx, :])
        # write each batch's embeddings to its own file, then reset the buffer
        print("saving")
        stansFileName = save_path + "/embeddings_w266_" + str(i) + ".npy"
        stansNPArr = np.asarray(out_emb)
        np.save(stansFileName, stansNPArr)
        out_emb = []
    print("DONE")
def annotate_s2s_text():
    """
    Annotate a text file with predicted tags.
    :return:
    """
    # paths
    model_path = MODEL_PATH
    text_path = TEST_INPUT
    save_path = SAVE_PATH

    # regex for seq2seq-formatted input
    pattern = "(.+?)(\n|\r\n)"
    r = re.compile(pattern)

    print("Loading model params...")
    params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)

    # Encoding comments
    cmnts = []
    Xt = []
    for line in io.open(text_path, 'r', encoding='utf-8'):
        m = r.search(line)
        if m is not None:
            cmnts.append(m.group(1))
            # strip half-width spaces (for input to tweet2vec)
            Xc_cmnt = m.group(1).replace(' ', '')
            Xt.append(Xc_cmnt[:MAX_LENGTH])

    out_pred = []
    numbatches = len(Xt) // N_BATCH + 1
    print('number of batches: %d' % numbatches)
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            # keep only the top-ranked tag for each comment
            out_pred.append([inverse_labeldict[r] if r in inverse_labeldict else 'UNK'
                             for r in ranks[idx, :5]][0])
        print('batch %d done...' % i)

    # Save result
    with io.open(save_path, 'w', encoding='utf-8') as f:
        for tag, cmnt in zip(out_pred, cmnts):
            f.write(tag + '\t' + cmnt + '\n')
def generate_embeddings(args):
    data_path = args[0]
    model_path = args[1]
    # save_path = args[2]
    if len(args) > 2:
        m_num = int(args[2])

    print("Preparing Data...")
    # Test data: read tweet texts into an array
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])
    print("Tweets: %d" % len(Xt))
    print("Unique tweets: %d" % len(set(Xt)))

    # Model
    print("Loading model params...")
    # match the m_num parsing above (save_path was dropped from the arg list)
    if len(args) > 2:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_data = []
    out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_data.append(item)
            # print [r for r in ranks[idx, :5]]
            # out_pred.append(' '.join([inverse_labeldict[r] for r in ranks[idx, :5]]))
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])

    return out_emb
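# A hypothetical caller for `generate_embeddings`, stacking the returned
# per-tweet vectors and writing them to disk. The function name and output
# file name are illustrative only, not part of the original scripts:
def save_embeddings(args, out_file='embeddings.npy'):
    emb = generate_embeddings(args)
    np.save(out_file, np.asarray(emb))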
def main(args):
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # JK/QUARTZ: take the string only up to the tab character we added
            # to every trump tweet line. Was:
            # Xc = line.rstrip('\n')
            Xc = re.match(r'^(.*)\t', line).group(1).rstrip('\n')
            print(Xc)
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    # JK/QUARTZ: prediction function disabled, we only need the vectors
    # predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    # JK/QUARTZ: prediction lines disabled, we only need the vectors
    # out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        # p = predict(x, x_m)
        e = encode(x, x_m)
        # ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            # out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK'
            #                           for r in ranks[idx, :5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    # JK/QUARTZ: prediction output disabled, we only need the vectors
    # with io.open('%s/predicted_tags.txt' % save_path, 'w') as f:
    #     for item in out_pred:
    #         f.write(item + '\n')
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))