コード例 #1
0
def main(args):
    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args)>3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        # for line in f:
        #     Xc = line.rstrip('\n')
        #     Xt.append(Xc[:MAX_LENGTH])
        ###Input dataset with tweet+emoji instead of just tweets
        data = csv.reader(f, delimiter=',', quotechar="|")
        for line in data:
            Xc = line[0].rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])
    # Model
    print("Loading model params...")
    if len(args)>3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt)/N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH*i:N_BATCH*(i+1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x,x_m)
        e = encode(x,x_m)
        ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK' for r in ranks[idx,:5]]))
            out_emb.append(e[idx,:])

    # Save
    print("Saving...")
    ###Author - jagathshree
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with io.open('%s/predicted_tags.txt'%save_path,'w') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings.npy'%save_path,'w') as f:
        np.save(f,np.asarray(out_emb))
コード例 #2
0
def main(train_path, val_path, save_path, num_epochs=50):
    global T1

    print('NUM EPOCHS %i' % num_epochs)

    # save settings
    shutil.copyfile('settings_char.py', '%s/settings_char.txt' % save_path)

    print("Preparing Data 123")
    # Training data
    Xt = []
    yt = []
    with io.open(train_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc)
    # Validation data
    Xv = []
    yv = []
    with io.open(val_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xv.append(Xc[:MAX_LENGTH])
            yv.append(yc.split(','))

    print("Building Model...")
    if not RELOAD_MODEL:
        # Build dictionaries from training data
        chardict, charcount = batch.build_dictionary(Xt)
        n_char = len(chardict.keys()) + 1
        batch.save_dictionary(chardict, charcount, '%s/dict.pkl' % save_path)
        # params
        params = init_params(n_chars=n_char)

        labeldict, labelcount = batch.build_label_dictionary(yt)
        batch.save_dictionary(labeldict, labelcount,
                              '%s/label_dict.pkl' % save_path)

        n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

        # classification params
        params['W_cl'] = theano.shared(np.random.normal(
            loc=0., scale=SCALE, size=(WDIM, n_classes)).astype('float32'),
                                       name='W_cl')
        params['b_cl'] = theano.shared(np.zeros((n_classes)).astype('float32'),
                                       name='b_cl')

    else:
        print("Loading model params from base path: %s ..." % save_path)
        params = load_params_shared('%s/model.npz' % save_path)

        print("Loading dictionaries...")
        with open('%s/dict.pkl' % save_path, 'rb') as f:
            chardict = pkl.load(f)
        with open('%s/label_dict.pkl' % save_path, 'rb') as f:
            labeldict = pkl.load(f)
        n_char = len(chardict.keys()) + 1
        n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    train_iter = batch.BatchTweets(Xt,
                                   yt,
                                   labeldict,
                                   batch_size=N_BATCH,
                                   max_classes=MAX_CLASSES)
    val_iter = batch.BatchTweets(Xv,
                                 yv,
                                 labeldict,
                                 batch_size=N_BATCH,
                                 max_classes=MAX_CLASSES,
                                 test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.ivector()
    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions, net, emb = classify(tweet, t_mask, params, n_classes, n_char)

    # batch loss
    loss = lasagne.objectives.categorical_crossentropy(predictions, targets)
    cost = T.mean(
        loss
    ) + REGULARIZATION * lasagne.regularization.regularize_network_params(
        net, lasagne.regularization.l2)
    cost_only = T.mean(loss)
    reg_only = REGULARIZATION * lasagne.regularization.regularize_network_params(
        net, lasagne.regularization.l2)

    # params and updates
    print("Computing updates...")
    lr = LEARNING_RATE
    mu = MOMENTUM
    updates = lasagne.updates.nesterov_momentum(
        cost, lasagne.layers.get_all_params(net), lr, momentum=mu)

    # Theano function
    print("Compiling theano functions...")
    inps = [tweet, t_mask, targets]
    predict = theano.function([tweet, t_mask], predictions)
    cost_val = theano.function(inps, [cost_only, emb])
    train = theano.function(inps, cost, updates=updates)
    reg_val = theano.function([], reg_only)

    # Training
    print("Training...")
    uidx = 0
    maxp = 0.
    start = time.time()
    valcosts = []
    try:
        for epoch in range(num_epochs):
            n_samples = 0
            train_cost = 0.
            print("Epoch {}".format(epoch))

            # learning schedule
            if len(valcosts) > 1 and SCHEDULE:
                change = (valcosts[-1] - valcosts[-2]) / abs(valcosts[-2])
                if change < T1:
                    lr, mu = schedule(lr, mu)
                    updates = lasagne.updates.nesterov_momentum(
                        cost,
                        lasagne.layers.get_all_params(net),
                        lr,
                        momentum=mu)
                    train = theano.function(inps, cost, updates=updates)
                    T1 = T1 / 2

    # stopping criterion
            if len(valcosts) > 6:
                deltas = []
                for i in range(5):
                    deltas.append((valcosts[-i - 1] - valcosts[-i - 2]) /
                                  abs(valcosts[-i - 2]))
                if sum(deltas) / len(deltas) < T2:
                    break

            ud_start = time.time()
            for xr, y in train_iter:
                n_samples += len(xr)
                uidx += 1
                x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
                if x is None:
                    print("Minibatch with zero samples under maxlength.")
                    uidx -= 1
                    continue

                curr_cost = train(x, x_m, y)
                train_cost += curr_cost * len(xr)
                ud = time.time() - ud_start

                if np.isnan(curr_cost) or np.isinf(curr_cost):
                    print("Nan detected.")
                    return

                if np.mod(uidx, DISPF) == 0:
                    print("Epoch {} Update {} Cost {} Time {}".format(
                        epoch, uidx, curr_cost, ud))

                if np.mod(uidx, SAVEF) == 0:
                    print("Saving...")
                    saveparams = OrderedDict()
                    for kk, vv in params.iteritems():
                        saveparams[kk] = vv.get_value()
                    np.savez('%s/model.npz' % save_path, **saveparams)
                    print("Done.")

            print("Testing on Validation set...")
            preds = []
            targs = []
            for xr, y in val_iter:
                x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
                if x is None:
                    print(
                        "Validation: Minibatch with zero samples under maxlength."
                    )
                    continue

                vp = predict(x, x_m)
                ranks = np.argsort(vp)[:, ::-1]
                for idx, item in enumerate(xr):
                    preds.append(ranks[idx, :])
                    targs.append(y[idx])

            validation_cost = precision(np.asarray(preds), targs, 1)
            regularization_cost = reg_val()

            if validation_cost > maxp:
                maxp = validation_cost
                saveparams = OrderedDict()
                for kk, vv in params.iteritems():
                    saveparams[kk] = vv.get_value()
                np.savez('%s/best_model.npz' % (save_path), **saveparams)

            print(
                "Epoch {} Training Cost {} Validation Precision {} Regularization Cost {} Max Precision {}"
                .format(epoch, train_cost / n_samples, validation_cost,
                        regularization_cost, maxp))
            print("Seen {} samples.".format(n_samples))
            valcosts.append(validation_cost)

            print("Saving...")
            saveparams = OrderedDict()
            for kk, vv in params.iteritems():
                saveparams[kk] = vv.get_value()
            np.savez('%s/model_%d.npz' % (save_path, epoch), **saveparams)
            print("Done.")

    except KeyboardInterrupt:
        pass
    print("Total training time = {}".format(time.time() - start))
コード例 #3
0
ファイル: encode_char.py プロジェクト: MorLong/tweet2vec
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args)>3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args)>3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt)/N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH*i:N_BATCH*(i+1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x,x_m)
        e = encode(x,x_m)
        ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            out_pred.append(' '.join([inverse_labeldict[r] for r in ranks[idx,:5]]))
            out_emb.append(e[idx,:])

    # Save
    print("Saving...")
    with io.open('%s/predicted_tags.txt'%save_path,'w') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings.npy'%save_path,'w') as f:
        np.save(f,np.asarray(out_emb))
コード例 #4
0
ファイル: char.py プロジェクト: MorLong/tweet2vec
def main(train_path,val_path,save_path,num_epochs=NUM_EPOCHS):
    global T1

    # save settings
    shutil.copyfile('settings_char.py','%s/settings_char.txt'%save_path)

    print("Preparing Data...")
    # Training data
    Xt = []
    yt = []
    with io.open(train_path,'r',encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc)
    # Validation data
    Xv = []
    yv = []
    with io.open(val_path,'r',encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xv.append(Xc[:MAX_LENGTH])
            yv.append(yc.split(','))

    print("Building Model...")
    if not RELOAD_MODEL:
        # Build dictionaries from training data
        chardict, charcount = batch.build_dictionary(Xt)
        n_char = len(chardict.keys()) + 1
        batch.save_dictionary(chardict,charcount,'%s/dict.pkl' % save_path)
        # params
        params = init_params(n_chars=n_char)
        
        labeldict, labelcount = batch.build_label_dictionary(yt)
        batch.save_dictionary(labeldict, labelcount, '%s/label_dict.pkl' % save_path)

        n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

        # classification params
        params['W_cl'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,n_classes)).astype('float32'), name='W_cl')
        params['b_cl'] = theano.shared(np.zeros((n_classes)).astype('float32'), name='b_cl')

    else:
        print("Loading model params...")
        params = load_params_shared('%s/model.npz' % save_path)

        print("Loading dictionaries...")
        with open('%s/dict.pkl' % save_path, 'rb') as f:
            chardict = pkl.load(f)
        with open('%s/label_dict.pkl' % save_path, 'rb') as f:
            labeldict = pkl.load(f)
        n_char = len(chardict.keys()) + 1
        n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    train_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH, max_classes=MAX_CLASSES)
    val_iter = batch.BatchTweets(Xv, yv, labeldict, batch_size=N_BATCH, max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.ivector()
    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions, net, emb = classify(tweet, t_mask, params, n_classes, n_char)

    # batch loss
    loss = lasagne.objectives.categorical_crossentropy(predictions, targets)
    cost = T.mean(loss) + REGULARIZATION*lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2)
    cost_only = T.mean(loss)
    reg_only = REGULARIZATION*lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2)

    # params and updates
    print("Computing updates...")
    lr = LEARNING_RATE
    mu = MOMENTUM
    updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net), lr, momentum=mu)

    # Theano function
    print("Compiling theano functions...")
    inps = [tweet,t_mask,targets]
    predict = theano.function([tweet,t_mask],predictions)
    cost_val = theano.function(inps,[cost_only,emb])
    train = theano.function(inps,cost,updates=updates)
    reg_val = theano.function([],reg_only)

    # Training
    print("Training...")
    uidx = 0
    maxp = 0.
    start = time.time()
    valcosts = []
    try:
	for epoch in range(num_epochs):
	    n_samples = 0
            train_cost = 0.
	    print("Epoch {}".format(epoch))

            # learning schedule
            if len(valcosts) > 1 and SCHEDULE:
                change = (valcosts[-1]-valcosts[-2])/abs(valcosts[-2])
                if change < T1:
                    lr, mu = schedule(lr, mu)
                    updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net), lr, momentum=mu)
                    train = theano.function(inps,cost,updates=updates)
                    T1 = T1/2

            # stopping criterion
            if len(valcosts) > 6:
                deltas = []
                for i in range(5):
                    deltas.append((valcosts[-i-1]-valcosts[-i-2])/abs(valcosts[-i-2]))
                if sum(deltas)/len(deltas) < T2:
                    break

            ud_start = time.time()
	    for xr,y in train_iter:
		n_samples +=len(xr)
		uidx += 1
		x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
		if x==None:
		    print("Minibatch with zero samples under maxlength.")
		    uidx -= 1
		    continue

		curr_cost = train(x,x_m,y)
                train_cost += curr_cost*len(xr)
		ud = time.time() - ud_start

		if np.isnan(curr_cost) or np.isinf(curr_cost):
		    print("Nan detected.")
		    return

		if np.mod(uidx, DISPF) == 0:
		    print("Epoch {} Update {} Cost {} Time {}".format(epoch,uidx,curr_cost,ud))

		if np.mod(uidx,SAVEF) == 0:
		    print("Saving...")
		    saveparams = OrderedDict()
		    for kk,vv in params.iteritems():
			saveparams[kk] = vv.get_value()
		    np.savez('%s/model.npz' % save_path,**saveparams)
		    print("Done.")

            print("Testing on Validation set...")
            preds = []
            targs = []
	    for xr,y in val_iter:
		x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
		if x==None:
                    print("Validation: Minibatch with zero samples under maxlength.")
		    continue

                vp = predict(x,x_m)
                ranks = np.argsort(vp)[:,::-1]
                for idx,item in enumerate(xr):
                    preds.append(ranks[idx,:])
                    targs.append(y[idx])

            validation_cost = precision(np.asarray(preds),targs,1)
            regularization_cost = reg_val()

            if validation_cost > maxp:
                maxp = validation_cost
                saveparams = OrderedDict()
                for kk,vv in params.iteritems():
                    saveparams[kk] = vv.get_value()
                np.savez('%s/best_model.npz' % (save_path),**saveparams)

	    print("Epoch {} Training Cost {} Validation Precision {} Regularization Cost {} Max Precision {}".format(epoch, train_cost/n_samples, validation_cost, regularization_cost, maxp))
	    print("Seen {} samples.".format(n_samples))
            valcosts.append(validation_cost)

            print("Saving...")
            saveparams = OrderedDict()
            for kk,vv in params.iteritems():
                saveparams[kk] = vv.get_value()
            np.savez('%s/model_%d.npz' % (save_path,epoch),**saveparams)
            print("Done.")

    except KeyboardInterrupt:
	pass
    print("Total training time = {}".format(time.time()-start))
コード例 #5
0
ファイル: test_char.py プロジェクト: MorLong/tweet2vec
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args)>3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc.split(','))

    # Model
    print("Loading model params...")
    if len(args)>3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    test_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH, max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.imatrix()

    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions = classify(tweet, t_mask, params, n_classes, n_char)[0]
    embeddings = classify(tweet, t_mask, params, n_classes, n_char)[1]

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr,y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x,x_m)
        e = encode(x,x_m)
        ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx,:])
            out_emb.append(e[idx,:])
            out_target.append(y[idx])

    # Save
    print("Saving...")
    with open('%s/data.pkl'%save_path,'w') as f:
        pkl.dump(out_data,f)
    with open('%s/predictions.npy'%save_path,'w') as f:
        np.save(f,np.asarray(out_pred))
    with open('%s/embeddings.npy'%save_path,'w') as f:
        np.save(f,np.asarray(out_emb))
    with open('%s/targets.pkl'%save_path,'w') as f:
        pkl.dump(out_target,f)
コード例 #6
0
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args)>3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args)>3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
#     predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt)/N_BATCH + 1
    print("Num Batches: "+str(numbatches))
    for i in range(int(numbatches)):
        print("processing batch "+str(i))
        xr = Xt[N_BATCH*i:N_BATCH*(i+1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
#         p = predict(x,x_m)
        e = encode(x,x_m)
#         ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
#             out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK' for r in ranks[idx,:5]]))
            out_emb.append(e[idx,:])
        print("saving")
        stansFileName = save_path+"/embeddings_w266_"+str(i)+".npy"
        stansNPArr = np.asarray(out_emb)
        np.save(stansFileName,stansNPArr)
        out_emb = []
    print("DONE")
def annotate_s2s_text():
    """
    テキストファイルにタグを付与する関数
    :return:
    """
    # path
    model_path = MODEL_PATH
    text_path = TEST_INPUT
    save_path = SAVE_PATH

    # seq2seq用regex
    pattern = "(.+?)(\n|\r\n)"
    r = re.compile(pattern)

    print("Loading model params...")
    params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    tweet = T.itensor3()
    t_mask = T.fmatrix()
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)

    # Encoding cmnts
    cmnts = []
    Xt = []
    for line in io.open(text_path, 'r', encoding='utf-8'):
        m = r.search(line)
        if m is not None:
            cmnts.append(m.group(1))
            Xc_cmnt = m.group(1).replace(' ',
                                         '')  # 半角スペースの除去(tweet2vecに入力するため)
            Xt.append(Xc_cmnt[:MAX_LENGTH])

    out_pred = []
    numbatches = len(Xt) / N_BATCH + 1
    print 'number of batches', numbatches
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_pred.append([
                inverse_labeldict[r] if r in inverse_labeldict else 'UNK'
                for r in ranks[idx, :5]
            ][0])
        print i, 'batches end...'

    # Save result
    with io.open(save_path, 'w') as f:
        for tag, cmnt in zip(out_pred, cmnts):
            f.write(tag + '\t' + cmnt + '\n')
コード例 #8
0
def generate_embeddings(args):

    data_path = args[0]
    model_path = args[1]
    # save_path = args[2]
    if len(args) > 2:
        m_num = int(args[2])

    print("Preparing Data...")
    # Test data
    # read tweet texts into an array
    Xt = []

    # read from file
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc[:MAX_LENGTH])
    print "Tweets:", len(Xt)
    print "Unique tweets:", len(set(Xt))
    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_data = []
    out_pred = []
    out_emb = []
    numbatches = len(Xt) / N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]
        for idx, item in enumerate(xr):
            out_data.append(item)
            # print [r for r in ranks[idx,:5]]
            # out_pred.append(' '.join([inverse_labeldict[r] for r in ranks[idx,:5]]))
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
    return out_emb
コード例 #9
0
ファイル: encode_char.py プロジェクト: Quartz/tweet2vec
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # JK/QUARTZ
            # here taking the string only up to the tab character we added
            # to every trump tweet line. Was:
            # Xc = line.rstrip('\n')
            Xc = re.match(r'^(.*)\t', line).group(1).rstrip('\n')
            print(Xc)
            Xt.append(Xc[:MAX_LENGTH])

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    # Theano function
    print("Compiling theano functions...")
    # JK/QUARTZ Disabling the prediction function, because we just need the vectoring
    # predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    # JK/QUARTZ Disabling the prediction lines, because we just need the vectoring
    # out_pred = []
    out_emb = []
    numbatches = len(Xt) / N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        # JK/QUARTZ Disabling the prediction function, because we just need the vectoring
        # p = predict(x,x_m)
        e = encode(x, x_m)
        # JK/QUARTZ Disabling the prediction lines, because we just need the vectoring
        # ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            # JK/QUARTZ Disabling the prediction lines, because we just need the vectoring
            # out_pred.append(' '.join([inverse_labeldict[r] if r in inverse_labeldict else 'UNK' for r in ranks[idx,:5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    # JK/QUARTZ Disabling the prediction lines, because we just need the vectoring
    # with io.open('%s/predicted_tags.txt'%save_path,'w') as f:
    #     for item in out_pred:
    #         f.write(item + '\n')
    with open('%s/embeddings.npy' % save_path, 'w') as f:
        np.save(f, np.asarray(out_emb))
コード例 #10
0
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc[:MAX_LENGTH])
            yt.append(yc.split(','))

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = len(chardict.keys()) + 1
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    test_iter = batch.BatchTweets(Xt,
                                  yt,
                                  labeldict,
                                  batch_size=N_BATCH,
                                  max_classes=MAX_CLASSES,
                                  test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.imatrix()

    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions = classify(tweet, t_mask, params, n_classes, n_char)[0]
    embeddings = classify(tweet, t_mask, params, n_classes, n_char)[1]

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_chars=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]

        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save
    print("Saving...")
    with open('%s/data.pkl' % save_path, 'w') as f:
        pkl.dump(out_data, f)
    with open('%s/predictions.npy' % save_path, 'w') as f:
        np.save(f, np.asarray(out_pred))
    with open('%s/embeddings.npy' % save_path, 'w') as f:
        np.save(f, np.asarray(out_emb))
    with open('%s/targets.pkl' % save_path, 'w') as f:
        pkl.dump(out_target, f)