def main(args):
    """Encode the tweets of a CSV file and write top-5 tags plus embeddings.

    ``args`` is a positional argument list:

    * ``args[0]`` -- input CSV file (``,`` delimiter, ``|`` quote character);
      only the first column of every row is used as the tweet text.
    * ``args[1]`` -- directory holding ``dict.pkl``, ``label_dict.pkl`` and
      the ``.npz`` parameter files.
    * ``args[2]`` -- output directory; created when it does not exist.
    * ``args[3]`` -- optional integer ``N``; ``model_N.npz`` is loaded instead
      of ``best_model.npz``.

    Writes ``predicted_tags.txt`` (one line per tweet with five
    space-separated labels, ``UNK`` for indices missing from the inverted
    label dictionary) and ``embeddings.npy`` into the output directory.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # Parse the optional checkpoint number first so a malformed value fails
    # before any data or model is loaded.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data: dataset rows are "tweet,emoji,..." rather than bare tweets.
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        data = csv.reader(f, delimiter=',', quotechar="|")
        for row in data:
            # csv.reader yields [] for a blank line; treat it as an empty
            # tweet so output rows stay aligned with input rows.
            Xc = row[0].rstrip('\n') if row else u''
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE(review): unpickling runs arbitrary code; only point this at model
    # directories from a trusted source.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    # Network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    # A single compiled function with both outputs: one forward pass per
    # batch instead of one for the predictions and another for the embeddings.
    predict_and_encode = theano.function([tweet, t_mask],
                                         [predictions, embeddings])

    print("Encoding...")
    out_pred = []
    out_emb = []
    # Ceiling division: stays an int on Python 3 and does not produce an
    # empty trailing batch when len(Xt) is an exact multiple of N_BATCH.
    numbatches = (len(Xt) + N_BATCH - 1) // N_BATCH
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p, e = predict_and_encode(x, x_m)
        # Column indices of each row of p, sorted from largest to smallest value.
        ranks = np.argsort(p)[:, ::-1]
        for idx in range(len(xr)):
            out_pred.append(' '.join(
                [inverse_labeldict[r] if r in inverse_labeldict else 'UNK'
                 for r in ranks[idx, :5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # Written as UTF-8 to match the encoding the input was read with.
    with io.open('%s/predicted_tags.txt' % save_path, 'w',
                 encoding='utf-8') as f:
        for item in out_pred:
            f.write(item + '\n')
    # .npy is a binary format, so the file has to be opened in binary mode.
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
def main(args):
    """Run a trained model over a labelled test file and dump the results.

    ``args`` is a positional argument list:

    * ``args[0]`` -- UTF-8 test file; every line is ``labels<TAB>text`` with
      the labels separated by commas.
    * ``args[1]`` -- directory holding ``dict.pkl``, ``label_dict.pkl`` and
      the ``.npz`` parameter files.
    * ``args[2]`` -- existing output directory.
    * ``args[3]`` -- optional integer ``N``; ``model_N.npz`` is loaded instead
      of ``best_model.npz``.

    Writes ``data.pkl`` (tweet texts), ``predictions.npy`` (per-tweet class
    indices sorted by decreasing prediction value), ``embeddings.npy`` and
    ``targets.pkl`` into the output directory.

    Raises ``ValueError`` for an input line that contains no tab.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # Parse the optional checkpoint number first so a malformed value fails
    # before any data or model is loaded.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the first tab only, so a tab inside the tweet text
            # does not break the two-value unpacking.
            yc, Xc = line.rstrip('\n').split('\t', 1)
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc.split(','))

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE(review): unpickling runs arbitrary code; only point this at model
    # directories from a trusted source.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    # Iterator yielding (texts, targets) batches
    test_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH,
                                  max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variable and its mask
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    # Build the network once and take both outputs from the same call,
    # instead of calling classify() separately for each output.
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    # A single compiled function with both outputs: one forward pass per batch.
    predict_and_encode = theano.function([tweet, t_mask],
                                         [predictions, embeddings])

    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p, e = predict_and_encode(x, x_m)
        # Column indices of each row of p, sorted from largest to smallest value.
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save -- pickle and .npy are binary formats, hence mode 'wb'.
    print("Saving...")
    with open('%s/data.pkl' % save_path, 'wb') as f:
        pkl.dump(out_data, f)
    with open('%s/predictions.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_pred))
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
    with open('%s/targets.pkl' % save_path, 'wb') as f:
        pkl.dump(out_target, f)
def main(args):
    """Encode the tweets of a text file and write top-5 tags plus embeddings.

    ``args`` is a positional argument list:

    * ``args[0]`` -- UTF-8 input file with one tweet per line.
    * ``args[1]`` -- directory holding ``dict.pkl``, ``label_dict.pkl`` and
      the ``.npz`` parameter files.
    * ``args[2]`` -- existing output directory.
    * ``args[3]`` -- optional integer ``N``; ``model_N.npz`` is loaded instead
      of ``best_model.npz``.

    Writes ``predicted_tags.txt`` (one line per tweet with five
    space-separated labels, ``UNK`` for indices missing from the inverted
    label dictionary) and ``embeddings.npy`` into the output directory.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # Parse the optional checkpoint number first so a malformed value fails
    # before any data or model is loaded.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data
    with io.open(data_path, 'r', encoding='utf-8') as f:
        Xt = [line.rstrip('\n')[:MAX_LENGTH] for line in f]

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE(review): unpickling runs arbitrary code; only point this at model
    # directories from a trusted source.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    # Network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    # A single compiled function with both outputs: one forward pass per
    # batch instead of one for the predictions and another for the embeddings.
    predict_and_encode = theano.function([tweet, t_mask],
                                         [predictions, embeddings])

    print("Encoding...")
    out_pred = []
    out_emb = []
    # Ceiling division: stays an int on Python 3 and does not produce an
    # empty trailing batch when len(Xt) is an exact multiple of N_BATCH.
    numbatches = (len(Xt) + N_BATCH - 1) // N_BATCH
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p, e = predict_and_encode(x, x_m)
        # Column indices of each row of p, sorted from largest to smallest value.
        ranks = np.argsort(p)[:, ::-1]
        for idx in range(len(xr)):
            # A ranked index that has no entry in the inverted label
            # dictionary is reported as 'UNK' instead of raising KeyError.
            out_pred.append(' '.join(
                [inverse_labeldict[r] if r in inverse_labeldict else 'UNK'
                 for r in ranks[idx, :5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    # Written as UTF-8 to match the encoding the input was read with.
    with io.open('%s/predicted_tags.txt' % save_path, 'w',
                 encoding='utf-8') as f:
        for item in out_pred:
            f.write(item + '\n')
    # .npy is a binary format, so the file has to be opened in binary mode.
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
def main(args):
    """Encode the tweets of a text file, saving one embedding file per batch.

    ``args`` is a positional argument list:

    * ``args[0]`` -- UTF-8 input file with one tweet per line.
    * ``args[1]`` -- directory holding ``dict.pkl``, ``label_dict.pkl`` and
      the ``.npz`` parameter files.
    * ``args[2]`` -- existing output directory.
    * ``args[3]`` -- optional integer ``N``; ``model_N.npz`` is loaded instead
      of ``best_model.npz``.

    For batch ``i`` (``N_BATCH`` tweets each) the embeddings are written to
    ``<args[2]>/embeddings_w266_<i>.npy``. Only embeddings are produced; no
    tag prediction is run.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # Parse the optional checkpoint number first so a malformed value fails
    # before any data or model is loaded.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data
    with io.open(data_path, 'r', encoding='utf-8') as f:
        Xt = [line.rstrip('\n')[:MAX_LENGTH] for line in f]

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE(review): unpickling runs arbitrary code; only point this at model
    # directories from a trusted source.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    # The label dictionary is still needed: it fixes the width of the
    # classification layer that classify() builds.
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    # Only the embedding output of the network is used here.
    _, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    encode = theano.function([tweet, t_mask], embeddings)

    print("Encoding...")
    # Ceiling division: an exact integer batch count, so no empty trailing
    # batch (and no empty output file) when len(Xt) is a multiple of N_BATCH.
    numbatches = (len(Xt) + N_BATCH - 1) // N_BATCH
    print("Num Batches: " + str(numbatches))
    for i in range(numbatches):
        print("processing batch " + str(i))
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        e = encode(x, x_m)
        # One row of e per tweet of the batch.
        batch_emb = np.asarray([e[idx, :] for idx in range(len(xr))])
        print("saving")
        # Each batch goes to its own file, so the embeddings never have to be
        # held in memory all at once.
        np.save(save_path + "/embeddings_w266_" + str(i) + ".npy", batch_emb)
    print("DONE")
def annotate_s2s_text():
    """
    Attach a predicted tag to every line of a text file.

    Reads ``TEST_INPUT`` as UTF-8, predicts the top-ranked label of each
    non-empty line with the model stored under ``MODEL_PATH`` and writes one
    ``tag<TAB>line`` row per input line to ``SAVE_PATH`` (UTF-8). Half-width
    spaces are removed from the text fed to the network; the written line
    keeps them. A rank that has no entry in the inverted label dictionary is
    written as ``UNK``.

    :return: None
    """
    # paths
    model_path = MODEL_PATH
    text_path = TEST_INPUT
    save_path = SAVE_PATH

    print("Loading model params...")
    params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE(review): unpickling runs arbitrary code; only point this at model
    # directories from a trusted source.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    # Only the prediction output of the network is used here.
    predictions, _ = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)

    # Read the comments to tag.
    cmnts = []
    Xt = []
    # The context manager closes the input file, which the bare io.open()
    # inside a for statement never did.
    with io.open(text_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip the line terminator directly rather than matching
            # "text + newline" with a regex, so a final line without a
            # trailing newline is no longer silently dropped.
            cmnt = line.rstrip('\r\n')
            if not cmnt:
                continue
            cmnts.append(cmnt)
            # Remove half-width spaces (required for the tweet2vec input).
            Xt.append(cmnt.replace(' ', '')[:MAX_LENGTH])

    out_pred = []
    # Ceiling division: stays an int on Python 3 and does not produce an
    # empty trailing batch when len(Xt) is an exact multiple of N_BATCH.
    numbatches = (len(Xt) + N_BATCH - 1) // N_BATCH
    print('number of batches %d' % numbatches)
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        # Column indices of each row of p, sorted from largest to smallest value.
        ranks = np.argsort(p)[:, ::-1]
        for idx in range(len(xr)):
            # Only the top-ranked label is kept.
            top = ranks[idx, 0]
            out_pred.append(
                inverse_labeldict[top] if top in inverse_labeldict else 'UNK')
        print('%d batches end...' % i)

    # Save result, in the same encoding the input was read with.
    with io.open(save_path, 'w', encoding='utf-8') as f:
        for tag, cmnt in zip(out_pred, cmnts):
            f.write(tag + '\t' + cmnt + '\n')
def generate_embeddings(args):
    """Return the embedding of every line of a tweet file.

    ``args`` is a positional argument list:

    * ``args[0]`` -- UTF-8 input file with one tweet per line.
    * ``args[1]`` -- directory holding ``dict.pkl``, ``label_dict.pkl`` and
      the ``.npz`` parameter files.
    * ``args[2]`` -- optional integer ``N``; ``model_N.npz`` is loaded instead
      of ``best_model.npz``.

    Returns a list with one embedding row per input line, in input order.
    Nothing is written to disk.
    """
    data_path = args[0]
    model_path = args[1]
    # The checkpoint number is the third argument of this function. The
    # checkpoint is selected from m_num itself so that the parsed value is
    # actually honoured (the old "len(args) > 3" test ignored it).
    m_num = int(args[2]) if len(args) > 2 else None

    print("Preparing Data...")
    # Test data: read the tweet texts into a list.
    with io.open(data_path, 'r', encoding='utf-8') as f:
        Xt = [line.rstrip('\n')[:MAX_LENGTH] for line in f]
    print("Tweets: %d" % len(Xt))
    print("Unique tweets: %d" % len(set(Xt)))

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE(review): unpickling runs arbitrary code; only point this at model
    # directories from a trusted source.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    # The label dictionary is still needed: it fixes the width of the
    # classification layer that classify() builds.
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    # Only the embeddings are returned, so the prediction output is not
    # compiled or evaluated.
    _, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    encode = theano.function([tweet, t_mask], embeddings)

    print("Encoding...")
    out_emb = []
    # Ceiling division: stays an int on Python 3 and does not produce an
    # empty trailing batch when len(Xt) is an exact multiple of N_BATCH.
    numbatches = (len(Xt) + N_BATCH - 1) // N_BATCH
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        e = encode(x, x_m)
        out_emb.extend(e[idx, :] for idx in range(len(xr)))

    return out_emb
def main(args):
    """Encode tab-annotated tweets and write their embeddings.

    ``args`` is a positional argument list:

    * ``args[0]`` -- UTF-8 input file, one tweet per line; only the text
      before the last tab of a line is encoded.
    * ``args[1]`` -- directory holding ``dict.pkl``, ``label_dict.pkl`` and
      the ``.npz`` parameter files.
    * ``args[2]`` -- existing output directory.
    * ``args[3]`` -- optional integer ``N``; ``model_N.npz`` is loaded instead
      of ``best_model.npz``.

    Writes ``embeddings.npy`` into the output directory. Tag prediction is
    disabled in this variant; only the embeddings are produced.
    """
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    # Parse the optional checkpoint number first so a malformed value fails
    # before any data or model is loaded.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data
    # JK/QUARTZ: take the string only up to the tab character that was added
    # to every trump tweet line. Compiled once, outside the loop.
    tab_prefix = re.compile(r'^(.*)\t')
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            m = tab_prefix.match(line)
            # A line without a tab is used whole instead of crashing on
            # None.group().
            Xc = m.group(1) if m is not None else line.rstrip('\n')
            print(Xc)
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if m_num is not None:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    # NOTE(review): unpickling runs arbitrary code; only point this at model
    # directories from a trusted source.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    # The label dictionary is still needed: it fixes the width of the
    # classification layer that classify() builds.
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    # JK/QUARTZ: prediction is disabled, only the vectors are needed.
    _, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    encode = theano.function([tweet, t_mask], embeddings)

    print("Encoding...")
    out_emb = []
    # Ceiling division: stays an int on Python 3 and does not produce an
    # empty trailing batch when len(Xt) is an exact multiple of N_BATCH.
    numbatches = (len(Xt) + N_BATCH - 1) // N_BATCH
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        e = encode(x, x_m)
        out_emb.extend(e[idx, :] for idx in range(len(xr)))

    # Save
    print("Saving...")
    # .npy is a binary format, so the file has to be opened in binary mode.
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
def main(args):
    """Evaluate a trained model on a labelled test file and save the outputs.

    ``args`` is a positional argument list:

    * ``args[0]`` -- UTF-8 test file; every line is ``labels<TAB>text`` with
      the labels separated by commas.
    * ``args[1]`` -- directory holding ``dict.pkl``, ``label_dict.pkl`` and
      the ``.npz`` parameter files.
    * ``args[2]`` -- existing output directory.
    * ``args[3]`` -- optional integer ``N``; ``model_N.npz`` is loaded instead
      of ``best_model.npz``.

    Four files are written to the output directory: ``data.pkl`` (tweet
    texts), ``predictions.npy`` (per-tweet class indices sorted by decreasing
    prediction value), ``embeddings.npy`` and ``targets.pkl``.

    Raises ``ValueError`` for an input line that contains no tab.
    """
    data_path, model_path, save_path = args[0], args[1], args[2]
    # Parse the optional checkpoint number first so a malformed value fails
    # before any data or model is loaded.
    m_num = int(args[3]) if len(args) > 3 else None

    print("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=1 keeps any further tabs as part of the tweet text
            # instead of failing the two-value unpacking.
            labels, text = line.rstrip('\n').split('\t', 1)
            Xt.append(text[:MAX_LENGTH])
            yt.append(labels.split(','))

    # Model
    print("Loading model params...")
    if m_num is None:
        params = load_params('%s/best_model.npz' % model_path)
    else:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))

    print("Loading dictionaries...")
    # NOTE(review): unpickling runs arbitrary code; only point this at model
    # directories from a trusted source.
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict) + 1
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    # Iterator yielding (texts, targets) batches
    test_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH,
                                  max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variable and its mask
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    # classify() returns both outputs; call it once rather than building the
    # network a second time just to pick the other element.
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    print("Compiling theano functions...")
    # A single compiled function with both outputs: one forward pass per batch.
    predict_and_encode = theano.function([tweet, t_mask],
                                         [predictions, embeddings])

    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p, e = predict_and_encode(x, x_m)
        # Column indices of each row of p, sorted from largest to smallest value.
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save -- pickle and .npy are binary formats, hence mode 'wb'.
    print("Saving...")
    with open('%s/data.pkl' % save_path, 'wb') as f:
        pkl.dump(out_data, f)
    with open('%s/predictions.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_pred))
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
    with open('%s/targets.pkl' % save_path, 'wb') as f:
        pkl.dump(out_target, f)