Example #1
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print ("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            (yc, Xc) = line.rstrip("\n").split("\t")
            Xt.append(Xc)
            yt.append(yc.split(","))

    # Model
    print ("Loading model params...")
    if len(args) > 3:
        print "Loading %s/model_%d.npz" % (model_path, m_num)
        params = load_params("%s/model_%d.npz" % (model_path, m_num))
    else:
        print "Loading %s/best_model.npz" % model_path
        params = load_params("%s/best_model.npz" % model_path)

    print ("Loading dictionaries...")
    with open("%s/dict.pkl" % model_path, "rb") as f:
        chardict = pkl.load(f)
    with open("%s/label_dict.pkl" % model_path, "rb") as f:
        labeldict = pkl.load(f)
    n_char = min(len(chardict.keys()) + 1, N_WORD)
    n_classes = min(len(labeldict.keys()) + 1, MAX_CLASSES)

    # iterators
    test_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH, max_classes=MAX_CLASSES, test=True)

    print ("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.imatrix()
    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print ("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print ("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_tokens=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]

        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save
    print ("Saving...")
    with open("%s/data.pkl" % save_path, "w") as f:
        pkl.dump(out_data, f)
    with open("%s/predictions.npy" % save_path, "w") as f:
        np.save(f, np.asarray(out_pred))
    with open("%s/embeddings.npy" % save_path, "w") as f:
        np.save(f, np.asarray(out_emb))
    with open("%s/targets.pkl" % save_path, "w") as f:
        pkl.dump(out_target, f)
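
A minimal sketch of how this entry point might be invoked, assuming the module is run as a script (the script name is hypothetical); the argument order mirrors the indexing at the top of main, and the TSV layout is inferred from the parsing loop:

import sys

if __name__ == "__main__":
    # Hypothetical usage:
    #   python test_script.py <data_path> <model_path> <save_path> [model_number]
    # <data_path> is a UTF-8 TSV file, one example per line:
    #   label1,label2<TAB>tweet text
    main(sys.argv[1:])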
Example #2
def main(train_path,val_path,save_path,wordvec=None,num_epochs=NUM_EPOCHS):
    global T1

    # save settings
    shutil.copyfile('src/settings_word.py', '%s/settings_word.txt' % save_path)

    print("Preparing Data...")
    # Training data
    Xt = []
    yt = []
    with io.open(train_path,'r',encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc)
            yt.append(yc)
    # Validation data
    Xv = []
    yv = []
    with io.open(val_path,'r',encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xv.append(Xc)
            yv.append(yc.split(','))

    print("Preparing Model...")
    if not RELOAD_MODEL:
        # Build dictionaries from training data
        tokendict, tokencount = batch.build_dictionary(Xt)
        n_token = min(len(tokendict) + 1, N_WORD)
        batch.save_dictionary(tokendict, tokencount, '%s/dict.pkl' % save_path)
        # params
        if wordvec is None:
            params = init_params(n_chars=n_token)
        else:
            params = init_params_add_wordvec(tokendict, n_chars=n_token, vector_bin=wordvec)
        
        labeldict, labelcount = batch.build_label_dictionary(yt)
        batch.save_dictionary(labeldict, labelcount, '%s/label_dict.pkl' % save_path)

        n_classes = min(len(labeldict) + 1, MAX_CLASSES)

        # classification params
        params['W_cl'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(2*WDIM,n_classes)).astype('float32'), name='W_cl')
        params['b_cl'] = theano.shared(np.zeros((n_classes)).astype('float32'), name='b_cl')

    else:
        print("Loading model params...")
        params = load_params_shared('%s/best_model.npz' % save_path)

        print("Loading dictionaries...")
        with open('%s/dict.pkl' % save_path, 'rb') as f:
            tokendict = pkl.load(f)
        with open('%s/label_dict.pkl' % save_path, 'rb') as f:
            labeldict = pkl.load(f)
        n_token = min(len(tokendict) + 1, N_WORD)
        n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    # iterators
    train_iter = batch.BatchTweets(Xt, yt, labeldict, batch_size=N_BATCH, max_classes=MAX_CLASSES)
    val_iter = batch.BatchTweets(Xv, yv, labeldict, batch_size=N_BATCH, max_classes=MAX_CLASSES, test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.ivector()

    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions, net, emb = classify(tweet, t_mask, params, n_classes, n_token)

    # batch loss
    loss = lasagne.objectives.categorical_crossentropy(predictions, targets)
    cost = T.mean(loss) + REGULARIZATION*lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2)
    cost_only = T.mean(loss)
    reg_only = REGULARIZATION*lasagne.regularization.regularize_network_params(net, lasagne.regularization.l2)

    # params and updates
    print("Computing updates...")
    global SCHEDULE
    lr = LEARNING_RATE
    if ADADELTA:
        SCHEDULE = False
        updates = lasagne.updates.adadelta(cost, lasagne.layers.get_all_params(net), learning_rate=lr)
    else:
        mu = MOMENTUM
        updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net), lr, momentum=mu)

    # Theano function
    print("Compiling theano functions...")
    inps = [tweet,t_mask,targets]
    predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],emb)
    cost_val = theano.function(inps,[cost_only,emb])
    train = theano.function(inps,cost,updates=updates)
    reg_val = theano.function([],reg_only)

    # Training
    print("Training...")
    uidx = 0
    maxp = 0.
    start = time.time()
    valcosts = []
    try:
        for epoch in range(num_epochs):
            n_samples = 0
            train_cost = 0.
            print("Epoch {}".format(epoch))

            # learning schedule
            if len(valcosts) > 1 and SCHEDULE:
                change = (valcosts[-1]-valcosts[-2])/abs(valcosts[-2])
                if change < T1:
                    lr, mu = schedule(lr, mu)
                    updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net), lr, momentum=mu)
                    train = theano.function(inps,cost,updates=updates)
                    T1 = T1/2

            # stopping criterion
            if len(valcosts) > 6:
                deltas = []
                for i in range(5):
                    deltas.append((valcosts[-i-1]-valcosts[-i-2])/abs(valcosts[-i-2]))
                if sum(deltas)/len(deltas) < T2:
                    break

            ud_start = time.time()
            for xr, y in train_iter:
                n_samples += len(xr)
                uidx += 1
                x, x_m = batch.prepare_data(xr, tokendict, n_tokens=n_token)
                if x is None:
                    print("Minibatch with zero samples under maxlength.")
                    uidx -= 1
                    continue

                curr_cost = train(x,x_m,y)
                train_cost += curr_cost*len(xr)
                ud = time.time() - ud_start

                if np.isnan(curr_cost) or np.isinf(curr_cost):
                    print("Nan detected.")
                    return

                if np.mod(uidx, DISPF) == 0:
                    print("Epoch {} Update {} Cost {} Time {}".format(epoch,uidx,curr_cost,ud))
                    sys.stdout.flush()

                if np.mod(uidx,SAVEF) == 0:
                    print("Saving...")
                    saveparams = OrderedDict()
                    for kk, vv in params.items():
                        saveparams[kk] = vv.get_value()
                    np.savez('%s/model.npz' % save_path, **saveparams)
                    print("Done.")

            print("Testing on Validation set...")
            preds = []
            targs = []
            for xr,y in val_iter:
                x, x_m = batch.prepare_data(xr, tokendict, n_tokens=n_token)
                if x is None:
                    print("Validation: Minibatch with zero samples under maxlength.")
                    continue

                vp = predict(x,x_m)
                ranks = np.argsort(vp)[:,::-1]
                for idx,item in enumerate(xr):
                    preds.append(ranks[idx,:])
                    targs.append(y[idx])

            validation_cost = precision(np.asarray(preds),targs,1)
            regularization_cost = reg_val()

            if validation_cost > maxp:
                maxp = validation_cost
                saveparams = OrderedDict()
                for kk, vv in params.items():
                    saveparams[kk] = vv.get_value()
                np.savez('%s/best_model.npz' % save_path, **saveparams)

            print("Epoch {} Training Cost {} Validation Precision {} Regularization Cost {} Max Precision {}".format(epoch, train_cost/n_samples, validation_cost, regularization_cost, maxp))
            print("Seen {} samples.".format(n_samples))
            valcosts.append(validation_cost)

            print("Saving...")
            saveparams = OrderedDict()
            for kk, vv in params.items():
                saveparams[kk] = vv.get_value()
            np.savez('%s/model_%d.npz' % (save_path, epoch), **saveparams)
            print("Done.")

    except KeyboardInterrupt:
        pass
    print("Total training time = {}".format(time.time()-start))
Example #3
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path,'r',encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc)

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path,m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = min(len(chardict) + 1, N_WORD)
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes, n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet,t_mask],predictions)
    encode = theano.function([tweet,t_mask],embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH*i:N_BATCH*(i+1)]
        x, x_m = batch.prepare_data(xr, chardict, n_tokens=n_char)
        p = predict(x,x_m)
        e = encode(x,x_m)
        ranks = np.argsort(p)[:,::-1]

        for idx, item in enumerate(xr):
            out_pred.append(' '.join([inverse_labeldict[r] for r in ranks[idx,:5]]))
            out_emb.append(e[idx,:])

    # Save
    print("Saving...")
    with io.open('%s/predicted_tags.txt'%save_path,'w') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings.npy'%save_path,'w') as f:
        np.save(f,np.asarray(out_emb))
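
Examples 3 and 4 decode ranked label indices back to strings through an invert helper that is not defined here; a minimal sketch, assuming labeldict maps each label string to an integer index:

def invert(d):
    # Flip a label -> index mapping into index -> label so that rows
    # of the np.argsort output can be decoded into hashtag strings.
    return {v: k for k, v in d.items()}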
Example #4
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            Xc = line.rstrip('\n')
            Xt.append(Xc)

    # Model
    print("Loading model params...")
    if len(args) > 3:
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = min(len(chardict) + 1, N_WORD)
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)
    inverse_labeldict = invert(labeldict)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Encoding...")
    out_pred = []
    out_emb = []
    numbatches = len(Xt) // N_BATCH + 1
    for i in range(numbatches):
        xr = Xt[N_BATCH * i:N_BATCH * (i + 1)]
        x, x_m = batch.prepare_data(xr, chardict, n_tokens=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]

        for idx, item in enumerate(xr):
            out_pred.append(' '.join(
                [inverse_labeldict[r] for r in ranks[idx, :5]]))
            out_emb.append(e[idx, :])

    # Save
    print("Saving...")
    with io.open('%s/predicted_tags.txt' % save_path, 'w', encoding='utf-8') as f:
        for item in out_pred:
            f.write(item + '\n')
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
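
One caveat with the fixed len(Xt) // N_BATCH + 1 batch count used in Examples 3 and 4: when len(Xt) is an exact multiple of N_BATCH, the final slice is empty and batch.prepare_data receives no samples. A ceiling division avoids the empty tail; a small sketch, not part of the original code:

def num_batches(n_items, batch_size):
    # Ceiling division: the smallest batch count that covers all items
    # without producing a trailing empty slice when n_items is an
    # exact multiple of batch_size.
    return (n_items + batch_size - 1) // batch_size

# Drop-in usage in the loops above (hypothetical):
#   numbatches = num_batches(len(Xt), N_BATCH)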
Example #5
def main(args):

    data_path = args[0]
    model_path = args[1]
    save_path = args[2]
    if len(args) > 3:
        m_num = int(args[3])

    print("Preparing Data...")
    # Test data
    Xt = []
    yt = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc)
            yt.append(yc.split(','))

    # Model
    print("Loading model params...")
    if len(args) > 3:
        print('Loading %s/model_%d.npz' % (model_path, m_num))
        params = load_params('%s/model_%d.npz' % (model_path, m_num))
    else:
        print('Loading %s/best_model.npz' % model_path)
        params = load_params('%s/best_model.npz' % model_path)

    print("Loading dictionaries...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    with open('%s/label_dict.pkl' % model_path, 'rb') as f:
        labeldict = pkl.load(f)
    n_char = min(len(chardict) + 1, N_WORD)
    n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    # iterators
    test_iter = batch.BatchTweets(Xt,
                                  yt,
                                  labeldict,
                                  batch_size=N_BATCH,
                                  max_classes=MAX_CLASSES,
                                  test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.imatrix()
    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions, embeddings = classify(tweet, t_mask, params, n_classes,
                                       n_char)

    # Theano function
    print("Compiling theano functions...")
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], embeddings)

    # Test
    print("Testing...")
    out_data = []
    out_pred = []
    out_emb = []
    out_target = []
    for xr, y in test_iter:
        x, x_m = batch.prepare_data(xr, chardict, n_tokens=n_char)
        p = predict(x, x_m)
        e = encode(x, x_m)
        ranks = np.argsort(p)[:, ::-1]

        for idx, item in enumerate(xr):
            out_data.append(item)
            out_pred.append(ranks[idx, :])
            out_emb.append(e[idx, :])
            out_target.append(y[idx])

    # Save
    print("Saving...")
    with open('%s/data.pkl' % save_path, 'wb') as f:
        pkl.dump(out_data, f)
    with open('%s/predictions.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_pred))
    with open('%s/embeddings.npy' % save_path, 'wb') as f:
        np.save(f, np.asarray(out_emb))
    with open('%s/targets.pkl' % save_path, 'wb') as f:
        pkl.dump(out_target, f)
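
The artifacts written by Examples 1 and 5 can be read back with matching binary modes; a short sketch (the save/ directory name is a placeholder for the actual save_path):

import pickle as pkl
import numpy as np

with open("save/predictions.npy", "rb") as f:
    preds = np.load(f)       # per-example label indices, best first
with open("save/embeddings.npy", "rb") as f:
    embs = np.load(f)        # one embedding row per example
with open("save/targets.pkl", "rb") as f:
    targets = pkl.load(f)    # gold label-index lists from the iterator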
Example #6
def main(train_path, val_path, save_path, num_epochs=NUM_EPOCHS):
    global T1

    # save settings
    shutil.copyfile('settings_word.py', '%s/settings_word.txt' % save_path)

    print("Preparing Data...")
    # Training data
    Xt = []
    yt = []
    with io.open(train_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xt.append(Xc)
            yt.append(yc)
    # Validation data
    Xv = []
    yv = []
    with io.open(val_path, 'r', encoding='utf-8') as f:
        for line in f:
            (yc, Xc) = line.rstrip('\n').split('\t')
            Xv.append(Xc)
            yv.append(yc.split(','))

    print("Preparing Model...")
    if not RELOAD_MODEL:
        # Build dictionaries from training data
        tokendict, tokencount = batch.build_dictionary(Xt)
        n_token = min(len(tokendict) + 1, N_WORD)
        batch.save_dictionary(tokendict, tokencount, '%s/dict.pkl' % save_path)
        # params
        params = init_params(n_chars=n_token)

        labeldict, labelcount = batch.build_label_dictionary(yt)
        batch.save_dictionary(labeldict, labelcount,
                              '%s/label_dict.pkl' % save_path)

        n_classes = min(len(labeldict) + 1, MAX_CLASSES)

        # classification params
        params['W_cl'] = theano.shared(np.random.normal(
            loc=0., scale=SCALE, size=(WDIM, n_classes)).astype('float32'),
                                       name='W_cl')
        params['b_cl'] = theano.shared(np.zeros((n_classes)).astype('float32'),
                                       name='b_cl')

    else:
        print("Loading model params...")
        params = load_params_shared('%s/best_model.npz' % save_path)

        print("Loading dictionaries...")
        with open('%s/dict.pkl' % save_path, 'rb') as f:
            tokendict = pkl.load(f)
        with open('%s/label_dict.pkl' % save_path, 'rb') as f:
            labeldict = pkl.load(f)
        n_token = min(len(tokendict) + 1, N_WORD)
        n_classes = min(len(labeldict) + 1, MAX_CLASSES)

    # iterators
    train_iter = batch.BatchTweets(Xt,
                                   yt,
                                   labeldict,
                                   batch_size=N_BATCH,
                                   max_classes=MAX_CLASSES)
    val_iter = batch.BatchTweets(Xv,
                                 yv,
                                 labeldict,
                                 batch_size=N_BATCH,
                                 max_classes=MAX_CLASSES,
                                 test=True)

    print("Building network...")
    # Tweet variables
    tweet = T.itensor3()
    targets = T.ivector()

    # masks
    t_mask = T.fmatrix()

    # network for prediction
    predictions, net, emb = classify(tweet, t_mask, params, n_classes, n_token)

    # batch loss
    loss = lasagne.objectives.categorical_crossentropy(predictions, targets)
    cost = T.mean(
        loss
    ) + REGULARIZATION * lasagne.regularization.regularize_network_params(
        net, lasagne.regularization.l2)
    cost_only = T.mean(loss)
    reg_only = REGULARIZATION * lasagne.regularization.regularize_network_params(
        net, lasagne.regularization.l2)

    # params and updates
    print("Computing updates...")
    lr = LEARNING_RATE
    mu = MOMENTUM
    updates = lasagne.updates.nesterov_momentum(
        cost, lasagne.layers.get_all_params(net), lr, momentum=mu)

    # Theano function
    print("Compiling theano functions...")
    inps = [tweet, t_mask, targets]
    predict = theano.function([tweet, t_mask], predictions)
    encode = theano.function([tweet, t_mask], emb)
    cost_val = theano.function(inps, [cost_only, emb])
    train = theano.function(inps, cost, updates=updates)
    reg_val = theano.function([], reg_only)

    # Training
    print("Training...")
    uidx = 0
    maxp = 0.
    start = time.time()
    valcosts = []
    try:
        for epoch in range(num_epochs):
            n_samples = 0
            train_cost = 0.
            print("Epoch {}".format(epoch))

            # learning schedule
            if len(valcosts) > 1 and SCHEDULE:
                change = (valcosts[-1] - valcosts[-2]) / abs(valcosts[-2])
                if change < T1:
                    lr, mu = schedule(lr, mu)
                    updates = lasagne.updates.nesterov_momentum(
                        cost,
                        lasagne.layers.get_all_params(net),
                        lr,
                        momentum=mu)
                    train = theano.function(inps, cost, updates=updates)
                    T1 = T1 / 2

            # stopping criterion
            if len(valcosts) > 6:
                deltas = []
                for i in range(5):
                    deltas.append((valcosts[-i - 1] - valcosts[-i - 2]) /
                                  abs(valcosts[-i - 2]))
                if sum(deltas) / len(deltas) < T2:
                    break

            ud_start = time.time()
            for xr, y in train_iter:
                n_samples += len(xr)
                uidx += 1
                x, x_m = batch.prepare_data(xr, tokendict, n_tokens=n_token)
                if x is None:
                    print("Minibatch with zero samples under maxlength.")
                    uidx -= 1
                    continue

                curr_cost = train(x, x_m, y)
                train_cost += curr_cost * len(xr)
                ud = time.time() - ud_start

                if np.isnan(curr_cost) or np.isinf(curr_cost):
                    print("Nan detected.")
                    return

                if np.mod(uidx, DISPF) == 0:
                    print("Epoch {} Update {} Cost {} Time {}".format(
                        epoch, uidx, curr_cost, ud))

                if np.mod(uidx, SAVEF) == 0:
                    print("Saving...")
                    saveparams = OrderedDict()
                    for kk, vv in params.items():
                        saveparams[kk] = vv.get_value()
                    np.savez('%s/model.npz' % save_path, **saveparams)
                    print("Done.")

            print("Testing on Validation set...")
            preds = []
            targs = []
            for xr, y in val_iter:
                x, x_m = batch.prepare_data(xr, tokendict, n_tokens=n_token)
                if x is None:
                    print("Validation: Minibatch with zero samples under maxlength.")
                    continue

                vp = predict(x, x_m)
                ranks = np.argsort(vp)[:, ::-1]
                for idx, item in enumerate(xr):
                    preds.append(ranks[idx, :])
                    targs.append(y[idx])

            validation_cost = precision(np.asarray(preds), targs, 1)
            regularization_cost = reg_val()

            if validation_cost > maxp:
                maxp = validation_cost
                saveparams = OrderedDict()
                for kk, vv in params.items():
                    saveparams[kk] = vv.get_value()
                np.savez('%s/best_model.npz' % save_path, **saveparams)

            print(
                "Epoch {} Training Cost {} Validation Precision {} Regularization Cost {} Max Precision {}"
                .format(epoch, train_cost / n_samples, validation_cost,
                        regularization_cost, maxp))
            print("Seen {} samples.".format(n_samples))
            valcosts.append(validation_cost)

            print("Saving...")
            saveparams = OrderedDict()
            for kk, vv in params.items():
                saveparams[kk] = vv.get_value()
            np.savez('%s/model_%d.npz' % (save_path, epoch), **saveparams)
            print("Done.")

    except KeyboardInterrupt:
        pass
    print("Total training time = {}".format(time.time() - start))