Example #1
def do_cpu():
    import _dynet as C
    C.init()
    cm = C.Model()
    cpW = cm.add_parameters((1000, 1000))
    C.renew_cg()
    W = C.parameter(cpW)
    W = W * W * W * W * W * W * W
    z = C.squared_distance(W, W)
    z.value()
    z.backward()
Example #2
def do_cpu():
  import _dynet as C
  C.init()
  cm = C.Model()
  cpW = cm.add_parameters((1000,1000))
  s = time.time()
  C.renew_cg()
  W = C.parameter(cpW)
  W = W*W*W*W*W*W*W
  z = C.squared_distance(W,W)
  z.value()
  z.backward()
  print("CPU time:",time.time() - s)
Example #3
def do_cpu():
    import _dynet as C
    C.init()
    cm = C.Model()
    cpW = cm.add_parameters((1000, 1000))
    s = time.time()
    C.renew_cg()
    W = C.parameter(cpW)
    W = W * W * W * W * W * W * W
    z = C.squared_distance(W, W)
    z.value()
    z.backward()
    print("CPU time:", time.time() - s)
Example #4
def do_gpu():
    import _dynet as G
    import sys
    sys.argv.append('--dynet-devices')
    sys.argv.append('GPU:0')
    G.init()
    gm = G.Model()
    gpW = gm.add_parameters((1000, 1000))
    s = time.time()
    G.renew_cg()
    W = G.parameter(gpW)
    W = W * W * W * W * W * W * W
    z = G.squared_distance(W, W)
    z.value()
    z.backward()
    print("GPU time:", time.time() - s)
Example #5
def do_gpu():
  import _dynet as G
  import sys 
  sys.argv.append('--dynet-devices')
  sys.argv.append('GPU:0')
  G.init()
  gm = G.Model()
  gpW = gm.add_parameters((1000,1000))
  s = time.time()
  G.renew_cg()
  W = G.parameter(gpW)
  W = W*W*W*W*W*W*W
  z = G.squared_distance(W,W)
  z.value()
  z.backward()
  print("GPU time:",time.time() - s)
Example #6
    def fit(self,
            train_X,
            train_Y,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            labeled_weight_proportion=1.0):
        """
        train the model
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        """
        if seed:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  #setting random seed

        assert(train_X.shape[0] == len(train_Y)), \
            '# examples %d != # labels %d.' % (train_X.shape[0], len(train_Y))
        train_data = list(zip(train_X, train_Y))

        print('Starting training for %d epochs...' % num_epochs)
        best_val_f1, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print('Using early stopping with patience of %d...' % patience)
        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                      max=len(train_data),
                      flush=True)
            total_loss = 0.0

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            for i, idx in enumerate(random_indices):

                x, y = train_data[idx]
                output = self.predict(x,
                                      train=True,
                                      dropout_rate=word_dropout_rate)
                # in temporal ensembling, we assign a dummy label of -1 for
                # unlabeled sequences; we skip the supervised loss for these
                loss = dynet.scalarInput(0) if y == -1 else self.pick_neg_log(
                    output, y)

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    target = trg_vectors[idx]

                    other_loss = dynet.squared_distance(
                        output, dynet.inputVector(target))

                    if y != -1:
                        other_loss *= labeled_weight_proportion
                    loss += other_loss * unsup_weight
                total_loss += loss.value()

                loss.backward()
                self.trainer.update()
                bar.next()

            print(" iter {2} {0:>12}: {1:.2f}".format(
                "total loss", total_loss / len(train_data), cur_iter),
                  file=sys.stderr)

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best F1 score on the validation set
                val_f1 = self.evaluate(val_X, val_Y)

                if val_f1 > best_val_f1:
                    print('F1 %.4f is better than best val F1 %.4f.' %
                          (val_f1, best_val_f1))
                    best_val_f1 = val_f1
                    epochs_no_improvement = 0
                    save_model(self, model_path)
                else:
                    print('F1 %.4f is worse than best val F1 %.4f.' %
                          (val_f1, best_val_f1))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for %d epochs. Early stopping...' %
                          epochs_no_improvement)
                    break
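The fit() method above delegates the supervised loss to self.pick_neg_log(output, y), which is not shown in this snippet. A minimal sketch of what such a method typically looks like (an assumption about the surrounding class, not code from the original project; dynet is assumed to be imported as in the code above):

    def pick_neg_log(self, pred, gold):
        # negative log-probability of the gold class index, assuming pred is
        # already a probability distribution (e.g. a softmax output)
        return -dynet.log(dynet.pick(pred, gold))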
Example #7
    def fit(self,
            train_dict,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            orthogonality_weight=0.0,
            adversarial=False):
        """
        train the model
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        """
        if seed:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  #setting random seed

        train_data = []
        for task, task_dict in train_dict.items():
            for key in ["X", "Y", "domain"]:
                assert key in task_dict, "Error: %s is not available." % key
            examples, labels, domain_tags = task_dict["X"], task_dict["Y"], \
                                            task_dict["domain"]
            assert examples.shape[0] == len(labels)

            # train data is a list of 4-tuples: (example, label, task_id, domain_id)
            train_data += list(
                zip(examples, labels, [task] * len(labels), domain_tags))

        print('Starting training for %d epochs...' % num_epochs)
        best_val_f1, epochs_no_improvement = 0., 0

        if val_X is not None and val_Y is not None and model_path is not None:
            print('Using early stopping with patience of %d...' % patience)

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                      max=len(train_data),
                      flush=True)
            total_loss, total_constraint, total_adversarial = 0.0, 0.0, 0.0

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            for i, idx in enumerate(random_indices):

                x, y, task_id, domain_id = train_data[idx]
                task_ids = [task_id]

                if task_id == 'src':
                    # we train both F0 and F1 on source data
                    task_ids = ['F0', 'F1']
                elif task_id == 'src_all':
                    # we train F0, F1, and Ft on source data for base training
                    task_ids = ['F0', 'F1', 'Ft']

                loss = 0
                outputs, constraint, adv = self.predict(
                    x,
                    task_ids,
                    train=True,
                    dropout_rate=word_dropout_rate,
                    orthogonality_weight=orthogonality_weight,
                    domain_id=domain_id if adversarial else None)

                # in temporal ensembling, we assign a dummy label of -1 for
                # unlabeled sequences; we skip the supervised loss for these
                for output in outputs:
                    loss += dynet.scalarInput(
                        0) if y == -1 else self.pick_neg_log(output, y)

                    if trg_vectors is not None:
                        # the consistency loss in temporal ensembling is used for
                        # both supervised and unsupervised input
                        target = trg_vectors[idx]

                        other_loss = dynet.squared_distance(
                            output, dynet.inputVector(target))
                        loss += other_loss * unsup_weight

                # the orthogonality weight is the same for every prediction,
                # so we can add it in the end
                if orthogonality_weight != 0.0:
                    # add the orthogonality constraint to the loss
                    loss += constraint * orthogonality_weight
                    total_constraint += constraint.value()
                if adversarial:
                    total_adversarial += adv.value()
                    loss += adv

                total_loss += loss.value()
                loss.backward()
                self.trainer.update()
                bar.next()

            print(
                "\niter {}. Total loss: {:.3f}, total penalty: {:.3f}, adv: {:.3f}"
                .format(cur_iter, total_loss / len(train_data),
                        total_constraint / len(train_data),
                        total_adversarial / len(train_data)),
                file=sys.stderr)

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best F1 score on the validation set
                val_f1 = self.evaluate(val_X, val_Y, 'F0')

                if val_f1 > best_val_f1:
                    print('F1 %.4f is better than best val F1 %.4f.' %
                          (val_f1, best_val_f1))
                    best_val_f1 = val_f1
                    epochs_no_improvement = 0
                    save_mttri_model(self, model_path)
                else:
                    print('F1 %.4f is worse than best val F1 %.4f.' %
                          (val_f1, best_val_f1))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for %d epochs. Early stopping...' %
                          epochs_no_improvement)
                    break
Example #8
    def fit(self,
            train_X,
            train_Y,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            variance_weights=None,
            labeled_weight_proportion=1.0):
        """
        train the tagger
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        :param variance_weights: per-token weights (normalized variances) that
                                 are multiplied with each token's unsupervised loss
        :param labeled_weight_proportion: proportion of the unsupervised weight
                                          that should be assigned to labeled
                                          examples
        """
        print("read training data", file=sys.stderr)

        if variance_weights is not None:
            print('First 20 variance weights:', variance_weights[:20])

        if seed:
            print(">>> using seed: ", seed, file=sys.stderr)
            random.seed(seed)  #setting random seed

        # if we use word dropout keep track of counts
        if word_dropout_rate > 0.0:
            widCount = Counter()
            for sentence, _ in train_X:
                widCount.update([w for w in sentence])

        assert (len(train_X) == len(train_Y))
        train_data = list(zip(train_X, train_Y))

        # if we use target vectors, keep track of the targets per sentence
        if trg_vectors is not None:
            trg_start_id = 0
            sentence_trg_vectors = []
            sentence_var_weights = []
            for i, (example, y) in enumerate(train_data):
                sentence_trg_vectors.append(
                    trg_vectors[trg_start_id:trg_start_id +
                                len(example[0]), :])
                if variance_weights is not None:
                    sentence_var_weights.append(
                        variance_weights[trg_start_id:trg_start_id +
                                         len(example[0])])
                trg_start_id += len(example[0])
            assert trg_start_id == len(trg_vectors),\
                'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))
            assert len(sentence_trg_vectors) == len(train_X)
            if variance_weights is not None:
                assert trg_start_id == len(variance_weights)
                assert len(sentence_var_weights) == len(train_X)

        print('Starting training for {} epochs...'.format(num_epochs))
        best_val_acc, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1,
                                                       num_epochs),
                      max=len(train_data),
                      flush=True)
            total_loss = 0.0
            total_tagged = 0.0

            total_other_loss, total_other_loss_weighted = 0.0, 0.0

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            for i, idx in enumerate(random_indices):
                (word_indices, char_indices), y = train_data[idx]

                if word_dropout_rate > 0.0:
                    word_indices = [
                        self.w2i["_UNK"] if
                        (random.random() >
                         (widCount.get(w) /
                          (word_dropout_rate + widCount.get(w)))) else w
                        for w in word_indices
                    ]
                output = self.predict(word_indices, char_indices, train=True)

                if len(y) == 1 and y[0] == 0:
                    # in temporal ensembling, we assign a dummy label of [0] for
                    # unlabeled sequences; we skip the supervised loss for these
                    loss = dynet.scalarInput(0)
                else:
                    loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    targets = sentence_trg_vectors[idx]
                    assert len(output) == len(targets)
                    if variance_weights is not None:
                        var_weights = sentence_var_weights[idx]
                        assert len(output) == len(var_weights)
                        # multiply the normalized mean variance with each loss
                        other_loss = dynet.esum([
                            v * dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t, v in zip(output, targets, var_weights)
                        ])
                    else:
                        other_loss = dynet.esum([
                            dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t in zip(output, targets)
                        ])

                    total_other_loss += other_loss.value()
                    if len(y) == 1 and y[0] == 0:  #unlab_ex
                        other_loss += other_loss * unsup_weight
                    else:  #lab_ex
                        # assign the unsupervised weight for labeled examples
                        other_loss += other_loss * unsup_weight * labeled_weight_proportion
                    # keep track for logging
                    total_loss += loss.value()  # main loss
                    total_tagged += len(word_indices)
                    total_other_loss_weighted += other_loss.value()

                    # combine losses
                    loss += other_loss

                else:
                    # keep track for logging
                    total_loss += loss.value()
                    total_tagged += len(word_indices)

                loss.backward()
                self.trainer.update()
                bar.next()

            if trg_vectors is None:
                print("iter {2} {0:>12}: {1:.2f}".format(
                    "total loss", total_loss / total_tagged, cur_iter),
                      file=sys.stderr)
            else:
                print(
                    "iter {2} {0:>12}: {1:.2f} unsupervised loss: {3:.2f} (weighted: {4:.2f})"
                    .format("supervised loss", total_loss / total_tagged,
                            cur_iter, total_other_loss / total_tagged,
                            total_other_loss_weighted / total_tagged),
                    file=sys.stderr)

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best accuracy on the validation set
                val_correct, val_total = self.evaluate(val_X, val_Y)
                val_accuracy = val_correct / val_total

                if val_accuracy > best_val_acc:
                    print(
                        'Accuracy {:.4f} is better than best val accuracy {:.4f}'
                        .format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save_tagger(self, model_path)
                else:
                    print(
                        'Accuracy {:.4f} is worse than best val accuracy {:.4f}.'.
                        format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.
                          format(epochs_no_improvement))
                    break
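Examples #8 and #9 apply frequency-dependent word dropout before prediction: a word w is replaced by the unknown-word symbol with probability rate / (rate + count(w)), so rare words are dropped more often than frequent ones. The same rule, pulled out as a standalone helper for clarity (a sketch under the assumptions visible in the code above: widCount maps word ids to training-set counts and w2i["_UNK"] is the unknown-word id):

import random

def apply_word_dropout(word_indices, widCount, w2i, rate=0.25):
    # keep w with probability count(w) / (rate + count(w)), otherwise map it to _UNK
    return [
        w2i["_UNK"]
        if random.random() > widCount[w] / (rate + widCount[w])
        else w
        for w in word_indices
    ]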
Example #9
    def fit(self,
            train_dict,
            num_epochs,
            val_X=None,
            val_Y=None,
            patience=2,
            model_path=None,
            seed=None,
            word_dropout_rate=0.25,
            trg_vectors=None,
            unsup_weight=1.0,
            clip_threshold=5.0,
            orthogonality_weight=0.0,
            adversarial=False,
            adversarial_weight=1.0,
            ignore_src_Ft=False):
        """
        train the tagger
        :param trg_vectors: the prediction targets used for the unsupervised loss
                            in temporal ensembling
        :param unsup_weight: weight for the unsupervised consistency loss
                                    used in temporal ensembling
        :param adversarial: if adversarial training is used, add_adversarial_loss
                            must have been called beforehand
        :param adversarial_weight: 1 by default (do not weigh the adversarial loss)
        :param ignore_src_Ft: in the 2nd stage of asymmetric tri-training, do not
                              further train Ft on 'src'
        :param train_dict: a dictionary mapping tasks ("F0", "F1", and "Ft")
                           to a dictionary
                           {"X": list of examples,
                            "Y": list of labels,
                            "domain": list of domain tag (0,1) of example}
        Three tasks are indexed as "F0", "F1" and "Ft"

        Note: if a task 'src' is given than a single model with three heads is trained where
        all data is given to all outputs
        """
        print("read training data")

        widCount = Counter()
        train_data = []
        for task, task_dict in train_dict.items():  #task: eg. "F0"
            for key in ["X", "Y", "domain"]:
                assert key in task_dict, "Error: %s is not available." % key
            examples, labels, domain_tags = task_dict["X"], task_dict[
                "Y"], task_dict["domain"]
            assert len(examples) == len(labels)
            if word_dropout_rate > 0.0:
                # keep track of the counts for word dropout
                for sentence, _ in examples:
                    widCount.update([w for w in sentence])

            # train data is a list of 4-tuples: (example, label, task_id, domain_id)
            train_data += list(
                zip(examples, labels, [task] * len(labels), domain_tags))

        # if we use target vectors, keep track of the targets per sentence
        if trg_vectors is not None:
            trg_start_id = 0
            sentence_trg_vectors = []
            for i, (example, y) in enumerate(train_data):
                sentence_trg_vectors.append(
                    trg_vectors[trg_start_id:trg_start_id +
                                len(example[0]), :])
                trg_start_id += len(example[0])
            assert trg_start_id == len(trg_vectors),\
                'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))

        print('Starting training for {} epochs...'.format(num_epochs))
        best_val_acc, epochs_no_improvement = 0., 0
        if val_X is not None and val_Y is not None and model_path is not None:
            print(
                'Using early stopping with patience of {}...'.format(patience))

        if seed:
            random.seed(seed)

        for cur_iter in range(num_epochs):
            bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1,
                                                       num_epochs),
                      max=len(train_data),
                      flush=True)

            random_indices = np.arange(len(train_data))
            random.shuffle(random_indices)

            total_loss, total_tagged, total_constraint, total_adversarial = 0.0, 0.0, 0.0, 0.0
            total_orth_constr = 0  # count how many updates

            # log separate losses
            log_losses = {}
            log_total = {}
            for task_id in self.task_ids:
                log_losses[task_id] = 0.0
                log_total[task_id] = 0

            for i, idx in enumerate(random_indices):
                (word_indices,
                 char_indices), y, task_id, domain_id = train_data[idx]

                if word_dropout_rate > 0.0:
                    word_indices = [
                        self.w2i["_UNK"] if
                        (random.random() >
                         (widCount.get(w) /
                          (word_dropout_rate + widCount.get(w)))) else w
                        for w in word_indices
                    ]

                output, constraint, adv = self.predict(
                    word_indices,
                    char_indices,
                    task_id,
                    train=True,
                    orthogonality_weight=orthogonality_weight,
                    domain_id=domain_id if adversarial else None)

                if task_id not in ['src', 'trg']:

                    if len(y) == 1 and y[0] == 0:
                        # in temporal ensembling, we assign a dummy label of [0] for
                        # unlabeled sequences; we skip the supervised loss for these
                        loss = dynet.scalarInput(0)
                    else:
                        loss = dynet.esum([
                            self.pick_neg_log(pred, gold)
                            for pred, gold in zip(output, y)
                        ])

                    if trg_vectors is not None:
                        # the consistency loss in temporal ensembling is used for
                        # both supervised and unsupervised input
                        targets = sentence_trg_vectors[idx]
                        assert len(output) == len(targets)
                        other_loss = unsup_weight * dynet.average([
                            dynet.squared_distance(o, dynet.inputVector(t))
                            for o, t in zip(output, targets)
                        ])
                        loss += other_loss

                    if orthogonality_weight != 0.0 and task_id != 'Ft':
                        # add the orthogonality constraint to the loss
                        total_constraint += constraint.value(
                        ) * orthogonality_weight
                        total_orth_constr += 1
                        loss += constraint * orthogonality_weight

                    if adversarial:
                        total_adversarial += adv.value() * adversarial_weight
                        loss += adv * adversarial_weight

                    total_loss += loss.value()  # for output

                    log_losses[task_id] += total_loss
                    total_tagged += len(word_indices)
                    log_total[task_id] += total_tagged

                    loss.backward()
                    self.trainer.update()
                    bar.next()
                else:
                    # bootstrap=False, the output contains list of outputs one for each task
                    assert trg_vectors is None, 'temporal ensembling not implemented for bootstrap=False'
                    loss = dynet.scalarInput(1)  # initialize
                    if ignore_src_Ft:
                        # ignore the last output head (Ft) when further training with 'src'
                        output = output[:-1]

                    for t_i, output_t in enumerate(
                            output):  # get loss for each task
                        loss += dynet.esum([
                            self.pick_neg_log(pred, gold)
                            for pred, gold in zip(output_t, y)
                        ])
                        task_id = self.task_ids[t_i]
                        log_losses[task_id] += total_loss
                        log_total[task_id] += total_tagged

                    if orthogonality_weight != 0.0:
                        # add the orthogonality constraint to the loss
                        total_constraint += constraint.value(
                        ) * orthogonality_weight
                        total_orth_constr += 1
                        loss += constraint * orthogonality_weight

                    if adversarial:
                        total_adversarial += adv.value() * adversarial_weight
                        loss += adv * adversarial_weight

                    total_loss += loss.value()  # for output
                    total_tagged += len(word_indices)

                    loss.backward()
                    self.trainer.update()
                    bar.next()

            if adversarial and orthogonality_weight:
                print(
                    "iter {}. Total loss: {:.3f}, total penalty: {:.3f}, total weighted adv loss: {:.3f}"
                    .format(cur_iter, total_loss / total_tagged,
                            total_constraint / total_orth_constr,
                            total_adversarial / total_tagged),
                    file=sys.stderr)
            elif orthogonality_weight:
                print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}".
                      format(cur_iter, total_loss / total_tagged,
                             total_constraint / total_orth_constr),
                      file=sys.stderr)
            else:
                print("iter {}. Total loss: {:.3f} ".format(
                    cur_iter, total_loss / total_tagged),
                      file=sys.stderr)

            for task_id in self.task_ids:
                if log_total[task_id] > 0:
                    print("{0}: {1:.3f}".format(
                        task_id, log_losses[task_id] / log_total[task_id]))

            if val_X is not None and val_Y is not None and model_path is not None:
                # get the best accuracy on the validation set
                val_correct, val_total = self.evaluate(val_X, val_Y)
                val_accuracy = val_correct / val_total

                if val_accuracy > best_val_acc:
                    print(
                        'Accuracy {:.4f} is better than best val accuracy {:.4f}.'
                        .format(val_accuracy, best_val_acc))
                    best_val_acc = val_accuracy
                    epochs_no_improvement = 0
                    save_tagger(self, model_path)
                else:
                    print(
                        'Accuracy {:.4f} is worse than best val accuracy {:.4f}.'.
                        format(val_accuracy, best_val_acc))
                    epochs_no_improvement += 1
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.
                          format(epochs_no_improvement))
                    break
Example #10
    def eval_loss(self, case_x, case_class):
        output = self.evaluate_network(case_x)
        y_param = dy.vecInput(2)
        y_param.set(one_hot(2, case_class))
        loss = dy.squared_distance(output, y_param)
        return loss
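Example #10 depends on a one_hot() helper that is not part of the snippet. A plausible minimal version (an assumption, not the original project's code) builds the dense target vector that y_param.set() expects:

def one_hot(size, index):
    # a list of `size` zeros with a 1.0 at position `index`
    vec = [0.0] * size
    vec[index] = 1.0
    return vec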