    def translate_maxibatch(maxibatch,
                            num_to_target,
                            num_prev_translated,
                            mask=0):
        """Translates an individual maxibatch.

        Args:
            maxibatch: a list of sentences.
            num_to_target: dictionary mapping target vocabulary IDs to strings.
            num_prev_translated: the number of previously translated sentences.
            mask: number of trailing tokens to strip from each best hypothesis
                before the terminating <EOS> symbol is re-appended.
        """

        # Sort the maxibatch by length and split into minibatches.
        try:
            minibatches, idxs = util.read_all_lines(config, maxibatch,
                                                    minibatch_size)
        except exception.Error as x:
            logging.error(x.msg)
            sys.exit(1)

        # Translate the minibatches and store the resulting beam (i.e.
        # translations and scores) for each sentence.
        beams = []
        for x in minibatches:
            y_dummy = numpy.zeros(shape=(len(x), 1))
            x, x_mask, _, _ = util.prepare_data(x,
                                                y_dummy,
                                                config.factors,
                                                maxlen=None)
            sample = translate_batch(session, sampler, x, x_mask,
                                     max_translation_len, normalization_alpha)
            beams.extend(sample)
            num_translated = num_prev_translated + len(beams)
            logging.info('Translated {} sents'.format(num_translated))

        # Put beams into the same order as the input maxibatch.
        tmp = numpy.array(beams, dtype=object)
        ordered_beams = tmp[idxs.argsort()]

        # Write the translations to the output file.
        for i, beam in enumerate(ordered_beams):
            if nbest:
                num = num_prev_translated + i
                for sent, cost in beam:
                    translation = util.seq2words(sent, num_to_target)
                    line = "{} ||| {} ||| {}\n".format(num, translation,
                                                       str(cost))
                    output_file.write(line)
            else:
                best_hypo, cost = beam[0]
                # Truncate the hypothesis at the first <EOS> symbol (ID 0).
                best_hypo = list(best_hypo)
                eos_idx = best_hypo.index(0) if 0 in best_hypo else len(best_hypo)
                best_hypo = best_hypo[:eos_idx]
                # Drop the last `mask` tokens (no-op when mask == 0), then
                # re-append <EOS> so seq2words sees a terminated sequence.
                best_hypo = best_hypo[:len(best_hypo) - mask] if len(best_hypo) > mask else []
                best_hypo = best_hypo + [0]
                line = util.seq2words(best_hypo, num_to_target) + '\n'
                output_file.write(line)
Example #2
def main():
    # prepare data
    train, valid = util.prepare_data()

    # get model
    model = get_model()
    model.summary()

    # generators for training and validation
    BATCH = 128
    train_gen = util.next_train_batch(train, BATCH)
    valid_gen = util.next_valid_batch(valid, BATCH)

    # training
    EPOCHS = 5
    TRAINS = 20480
    VALIDS = 4096
    model.compile(optimizer=Adam(1e-2), loss="mse")
    history = model.fit_generator(train_gen,
                                  samples_per_epoch=TRAINS,
                                  nb_epoch=EPOCHS,
                                  validation_data=valid_gen,
                                  nb_val_samples=VALIDS,
                                  verbose=1)

    # save model, weights
    model.save_weights('model.h5')
    with open('model.json', 'w') as f:
        f.write(model.to_json())
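Note that fit_generator with samples_per_epoch, nb_epoch, and nb_val_samples is the Keras 1 API. A minimal sketch of the equivalent Keras 2 call, assuming each generator yields one batch per step:

    history = model.fit_generator(train_gen,
                                  steps_per_epoch=TRAINS // BATCH,
                                  epochs=EPOCHS,
                                  validation_data=valid_gen,
                                  validation_steps=VALIDS // BATCH,
                                  verbose=1)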
Example #3
    def _translate(self, process_id, input_item, get_sampler, sess):
        """
        Actual translation (model sampling).
        """

        # unpack input item attributes
        k = input_item.k
        x = input_item.batch
        alpha = input_item.normalization_alpha
        #max_ratio = input_item.max_ratio

        y_dummy = numpy.zeros(shape=(len(x),1))
        x, x_mask, _, _ = util.prepare_data(x, y_dummy,
                                            self._options[0].factors,
                                            maxlen=None)

        sample = translate_utils.translate_batch(
            session=sess,
            sampler=get_sampler(k),
            x=x,
            x_mask=x_mask,
            max_translation_len=self._options[0].translation_maxlen,
            normalization_alpha=alpha)

        return sample
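The input_item unpacked above is not defined in this snippet; a minimal sketch of a compatible container (a hypothetical namedtuple, with invented values) would be:

    from collections import namedtuple

    # Only the attribute names (k, batch, normalization_alpha) come from the
    # code above; the values here are purely illustrative.
    InputItem = namedtuple('InputItem', ['k', 'batch', 'normalization_alpha'])
    item = InputItem(k=5, batch=[[[4], [7], [0]]], normalization_alpha=0.6)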
Example #4
def minimal_steiner_vary_with_keywords_num(times=20):
    '''
    Test how the number of returned query results and the running time of the
    minimal steiner tree algorithm differ with the number of keywords searched.
    :return:
    '''
    graph, categories, category_list = prepare_data()

    count_options = [2, 3, 4, 5, 6]
    num_of_options = len(count_options)

    num_nodes = np.zeros((num_of_options, times))
    costs = np.zeros((num_of_options, times))

    results = {}

    for (i, count) in enumerate(count_options):
        for j in range(times):
            # keywords = generateNeighborKeywords(graph, categories, count, i*10 + j ) #seed = j
            keywords = generateKeywords(categories, count, i * 100 + j)
            num_nodes[i, j], costs[i, j] = testWithMinimalSteinerTree(
                graph, categories, keywords)
            if (j + 1) % 10 == 0:
                print('>')
            else:
                print('>', end='')
        print('================== %d keywords finished ===============' % count)

    results['num_nodes'] = num_nodes.tolist()
    results['costs'] = costs.tolist()

    with open(
            '../outputs/minmal_steiner_vary_with_random_neighbor_keywords.json',
            'w') as f:
        json.dump(results, f)
Example #5
def calc_loss_per_sentence(config, sess, text_iterator, model,
                           normalization_alpha=0):
    losses = []
    loss_per_sentence = model.get_loss()
    for x_v, y_v in text_iterator:
        if len(x_v[0][0]) != config.factors:
            logging.error('Mismatch between number of factors in settings ({0}), and number in validation corpus ({1})\n'.format(config.factors, len(x_v[0][0])))
            sys.exit(1)
        x, x_mask, y, y_mask = util.prepare_data(x_v, y_v, config.factors,
                                                 maxlen=None)
        feeds = {model.inputs.x: x,
                 model.inputs.x_mask: x_mask,
                 model.inputs.y: y,
                 model.inputs.y_mask: y_mask,
                 model.inputs.training: False}
        loss_per_sentence_out = sess.run(loss_per_sentence, feed_dict=feeds)

        # normalize scores according to output length
        if normalization_alpha:
            adjusted_lengths = numpy.array([numpy.count_nonzero(s) ** normalization_alpha for s in y_mask.T])
            loss_per_sentence_out /= adjusted_lengths

        losses += list(loss_per_sentence_out)
        logging.info( "Seen {0}".format(len(losses)))
    return losses
Example #6
def main():
    torch.manual_seed(0)

    # generate data
    w_true = torch.tensor([1, 2, 3], dtype=torch.float)
    N = 100
    X, y = prepare_data(N, w_true)

    # Initialize the weights; with requires_grad=True the computation graph is kept, so gradients can be backpropagated.
    w = torch.randn(w_true.size(0), requires_grad=True)

    # training hyperparameters
    learning_rate = 0.1
    num_epochs = 20
    loss_list = []

    for epoch in range(1, num_epochs + 1):
        w.grad = None

        y_pred = torch.mv(X, w)
        loss = torch.mean((y_pred - y)**2)
        loss.backward()
        loss_list.append(loss.item())

        # print(w.grad)
        w.data = w - learning_rate * w.grad.data
        print(
            f'Epoch{epoch}: loss={loss.item():.4f} w={w.data} dL/dw={w.grad.data}'
        )

    plot_loss(loss_list)
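For reference, updating the weights through .data as above is the older PyTorch idiom; a minimal equivalent using torch.no_grad() (same w and learning_rate) would be:

    with torch.no_grad():
        w -= learning_rate * w.grad
    w.grad = None  # clear the gradient before the next backward pass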
Example #7
def compare_vary_with_keywords_num(times=50):
    '''
    Test how the number of returned query results and the running time of the
    minimal steiner tree algorithm differ with the number of keywords searched.
    :return:
    '''
    graph, categories, category_list = prepare_data()

    count_options = [2, 3, 4, 5, 6]
    num_of_options = len(count_options)

    num_nodes = np.zeros((3, num_of_options, times))
    costs = np.zeros((3, num_of_options, times))

    for (i, count) in enumerate(count_options):
        for j in range(times):
            keywords = generateKeywords(category_list, count,
                                        i * 10 + j)  #seed = j
            num_nodes[0, i, j], costs[0, i, j] = testWithMinimalSteinerTree(
                graph, categories, keywords)
            num_nodes[1, i, j], costs[1, i, j] = testWithRandomSteinerTree(
                graph, categories, keywords)
            num_nodes[2, i, j], costs[2, i, j] = testWithGreedySteinerTree(
                graph, categories, keywords)

            if (j + 1) % 10 == 0:
                print('>')
            else:
                print('>', end='')
        print('================== %d keywords finished ===============' % count)

    print(np.average(num_nodes, axis=2))
    print(np.average(costs, axis=2))
Example #8
def main():
    torch.manual_seed(0)

    # generate data
    w_true = torch.tensor([1, 2, 3], dtype=torch.float)
    N = 100
    X, y = prepare_data(N, w_true)

    # build the model
    model = nn.Linear(in_features=3, out_features=1, bias=False)
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    # print(list(model.parameters()))  # the weights are initialized automatically
    num_epochs = 10
    loss_list = []

    for epoch in range(1, num_epochs + 1):
        optimizer.zero_grad()

        y_pred = model(X)
        loss = criterion(y_pred.view_as(y), y)
        loss.backward()
        loss_list.append(loss.item())

        optimizer.step()

    # plt.plot(y)
    # plt.plot(model(X).detach().numpy())
    plot_loss(loss_list)
Example #9
def calc_cross_entropy_per_sentence(session, model, config, text_iterator,
                                    normalization_alpha=0.0):
    """Calculates cross entropy values for a parallel corpus.

    By default (when normalization_alpha is 0.0), the sentence-level cross
    entropy is calculated. If normalization_alpha is 1.0 then the per-token
    cross entropy is calculated. Other values of normalization_alpha may be
    useful if the cross entropy value will be used as a score for selecting
    between translation candidates (e.g. in reranking an n-best list). Using
    a different (empirically determined) alpha value can help correct a model
    bias toward too-short / too-long sentences.

    TODO Support for multiple GPUs

    Args:
        session: TensorFlow session.
        model: a RNNModel object.
        config: model config.
        text_iterator: TextIterator.
        normalization_alpha: length normalization hyperparameter.

    Returns:
        A pair of lists. The first contains the (possibly normalized) cross
        entropy value for each sentence pair. The second contains the
        target-side token count for each pair (including the terminating
        <EOS> symbol).
    """
    ce_vals, token_counts = [], []
    for xx, yy in text_iterator:
        if len(xx[0][0]) != config.factors:
            logging.error('Mismatch between number of factors in settings ' \
                          '({0}) and number present in data ({1})'.format(
                          config.factors, len(xx[0][0])))
            sys.exit(1)
        x, x_mask, y, y_mask = util.prepare_data(xx, yy, config.factors,
                                                 maxlen=None)

        # Run the minibatch through the model to get the sentence-level cross
        # entropy values.
        feeds = {model.inputs.x: x,
                 model.inputs.x_mask: x_mask,
                 model.inputs.y: y,
                 model.inputs.y_mask: y_mask,
                 model.inputs.training: False}
        batch_ce_vals = session.run(model.loss_per_sentence, feed_dict=feeds)

        # Optionally, do length normalization.
        batch_token_counts = [numpy.count_nonzero(s) for s in y_mask.T]
        if normalization_alpha:
            adjusted_lens = [n**normalization_alpha for n in batch_token_counts]
            batch_ce_vals /= numpy.array(adjusted_lens)

        ce_vals += list(batch_ce_vals)
        token_counts += batch_token_counts
        logging.info("Seen {}".format(len(ce_vals)))

    assert len(ce_vals) == len(token_counts)
    return ce_vals, token_counts
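As a quick arithmetic check of the normalization above (invented numbers): a sentence pair with cross entropy 12.0 and 6 target tokens keeps the value 12.0 at alpha 0.0 and becomes the per-token value 2.0 at alpha 1.0:

    ce, n = 12.0, 6  # invented sentence-level cross entropy and token count
    for alpha in (0.0, 0.5, 1.0):
        print(alpha, ce / n ** alpha)  # 12.0, ~4.9, 2.0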
Example #11
def reuse_encoders():
    counter = 0
    train_x, train_y, test_x, test_y = util.prepare_data()

    for k, f in encoder_funcs.items():
        print('Start reusing the ' + k + ' autoencoder!')
        # train the model
        train_and_evaluate(k, train_x, train_y, test_x, test_y)
        sys.exit(0)
Example #12
    def translate_maxibatch(maxibatch, model_set, num_to_target,
                            num_prev_translated):
        """Translates an individual maxibatch.

        Args:
            maxibatch: a list of sentences.
            model_set: an InferenceModelSet object.
            num_to_target: dictionary mapping target vocabulary IDs to strings.
            num_prev_translated: the number of previously translated sentences.
        """

        # Sort the maxibatch by length and split into minibatches.
        try:
            pre_minibatches, minibatches, idxs = util.read_all_lines(
                configs[0], maxibatch, minibatch_size)
        except exception.Error as x:
            logging.error(x.msg)
            sys.exit(1)

        # Translate the minibatches and store the resulting beam (i.e.
        # translations and scores) for each sentence.
        beams = []
        for px, x in zip(pre_minibatches, minibatches):
            y_dummy = numpy.zeros(shape=(len(x), 1))
            px, x, x_mask, _, _ = util.prepare_data(x,
                                                    y_dummy,
                                                    configs[0].factors,
                                                    px,
                                                    maxlen=None)
            sample = model_set.decode(session=session,
                                      px=px,
                                      x=x,
                                      x_mask=x_mask,
                                      beam_size=beam_size,
                                      normalization_alpha=normalization_alpha)
            beams.extend(sample)
            num_translated = num_prev_translated + len(beams)
            logging.info('Translated {} sents'.format(num_translated))

        # Put beams into the same order as the input maxibatch.
        tmp = numpy.array(beams, dtype=object)
        ordered_beams = tmp[idxs.argsort()]

        # Write the translations to the output file.
        for i, beam in enumerate(ordered_beams):
            if nbest:
                num = num_prev_translated + i
                for sent, cost in beam:
                    translation = util.seq2words(sent, num_to_target)
                    line = "{} ||| {} ||| {}\n".format(num, translation,
                                                       str(cost))
                    output_file.write(line)
            else:
                best_hypo, cost = beam[0]
                line = util.seq2words(best_hypo, num_to_target) + '\n'
                output_file.write(line)
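For reference, each line written by the n-best branch above is a Moses-style triple (sentence number, translation, cost) separated by ' ||| '. A toy illustration with an invented beam:

    beam = [('a toy translation', 1.42), ('another hypothesis', 1.87)]
    num = 0  # index of the sentence in the input
    for translation, cost in beam:
        print('{} ||| {} ||| {}'.format(num, translation, cost))
    # 0 ||| a toy translation ||| 1.42
    # 0 ||| another hypothesis ||| 1.87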
Example #13
    def forward(self, data):
        data = prepare_data(data)
        full_p_states, p_mask, full_q_states, q_mask = self.encode(data)
        logits1, logits2, has_log = self.decode(full_p_states, p_mask,
                                                full_q_states, q_mask)
        loss = compute_loss(logits1, logits2, has_log, data['y1'], data['y2'],
                            data['has_ans'])

        self.train_loss.update(loss.data.item())
        del full_p_states, p_mask, full_q_states, q_mask, logits1, logits2, has_log
        return loss
Example #14
    def SelfEvaluate(self,
                     batches,
                     eval_file=None,
                     answer_file=None,
                     drop_file=None,
                     dev=None):
        print('Starting evaluation')

        with open(eval_file, 'r', encoding='utf-8') as f:
            eval_file = json.load(f)
        with open(dev, 'r', encoding='utf-8') as f:
            dev = json.load(f)

        answer_dict = {}
        mapped_dict = {}

        for batch in batches:
            data = prepare_data(batch)
            full_p_states, p_mask, full_q_states, q_mask = self.encode(data)
            logits1, logits2, ans_log = self.decode(full_p_states, p_mask,
                                                    full_q_states, q_mask)
            y1, y2, has_ans = get_predictions(logits1, logits2, ans_log)
            qa_id = data['id']
            answer_dict_, mapped_dict_ = convert_tokens(
                eval_file, qa_id, y1, y2, has_ans)
            answer_dict.update(answer_dict_)
            mapped_dict.update(mapped_dict_)

            del full_p_states, p_mask, full_q_states, q_mask, y1, y2, answer_dict_, mapped_dict_, has_ans, ans_log, logits1, logits2

        with open(drop_file, 'r', encoding='utf-8') as f:
            drop = json.load(f)
        for i in drop['drop_ids']:
            uuid = eval_file[str(i)]["uuid"]
            answer_dict[str(i)] = ''
            mapped_dict[uuid] = ''

        with open(answer_file, 'w', encoding='utf-8') as f:
            json.dump(mapped_dict, f)
        metrics = evaluate(dev, mapped_dict)

        # sub_path = join('./result/', "submit.csv")
        # #log.info('Writing submission file to {}...'.format(sub_path))
        # with open(sub_path, 'w') as csv_fh:
        #     csv_writer = csv.writer(csv_fh, delimiter=',')
        #     csv_writer.writerow(['Id', 'Predicted'])
        #     for uuid in sorted(mapped_dict):
        #         csv_writer.writerow([uuid, mapped_dict[uuid]])

        print("EM: {}, F1: {}, Has answer: {}, No answer: {}".format(
            metrics['exact'], metrics['f1'], metrics['HasAns_f1'],
            metrics['NoAns_f1']))

        return metrics['exact'], metrics['f1']
Example #15
    def translate_maxibatch(maxibatch, model_set, num_to_target,
                            num_prev_translated):
        """Translates an individual maxibatch.

        Args:
            maxibatch: a list of sentences.
            model_set: an InferenceModelSet object.
            num_to_target: dictionary mapping target vocabulary IDs to strings.
            num_prev_translated: the number of previously translated sentences.
        """

        # Sort the maxibatch by length and split into minibatches.
        try:
            minibatches, idxs = util.read_all_lines(configs[0], maxibatch,
                                                    minibatch_size)
        except exception.Error as x:
            logging.error(x.msg)
            sys.exit(1)

        # Translate the minibatches and store the resulting beam (i.e.
        # translations and scores) for each sentence.
        beams = []
        for x in minibatches:
            y_dummy = numpy.zeros(shape=(len(x),1))
            x, x_mask, _, _ = util.prepare_data(x, y_dummy, configs[0].factors,
                                                maxlen=None)
            sample = model_set.beam_search(
                session=session,
                x=x,
                x_mask=x_mask,
                beam_size=beam_size,
                normalization_alpha=normalization_alpha)
            beams.extend(sample)
            num_translated = num_prev_translated + len(beams)
            logging.info('Translated {} sents'.format(num_translated))

        # Put beams into the same order as the input maxibatch.
        tmp = numpy.array(beams, dtype=object)
        ordered_beams = tmp[idxs.argsort()]

        # Write the translations to the output file.
        for i, beam in enumerate(ordered_beams):
            if nbest:
                num = num_prev_translated + i
                for sent, cost in beam:
                    translation = util.seq2words(sent, num_to_target)
                    line = "{} ||| {} ||| {}\n".format(num, translation,
                                                       str(cost))
                    output_file.write(line)
            else:
                best_hypo, cost = beam[0]
                line = util.seq2words(best_hypo, num_to_target) + '\n'
                output_file.write(line)
Example #16
    def InitNet(self, netParam):
        netscheme = empty()
        netscheme.train_net, netscheme.test_net = net.NetSolve(netParam)
        message = "Training Network:" + "\n"
        message += str(netscheme.train_net)
        self.logger.info(message)

        message = "Testing Network:" + "\n"
        message += str(netscheme.test_net)
        self.logger.info(message)
        self.cnf.netscheme = netscheme
        self.ff = util.prepare_cnf(self.cnf, self.logger)
        self.inputs, self.targets, self.test = util.prepare_data(self.cnf, self.logger)
Example #17
    def translate_worker(in_queue, out_queue, model, sess, config):
        while True:
            job = in_queue.get()
            if job is None:
                break
            idx, x = job
            y_dummy = numpy.zeros(shape=(len(x), 1))
            x, x_mask, _, _ = util.prepare_data(x, y_dummy, config.factors,
                                                maxlen=None)
            try:
                samples = model.beam_search(sess, x, x_mask, config.beam_size)
                out_queue.put((idx, samples))
            except Exception:
                # Re-queue the job so another worker can retry it.
                in_queue.put(job)
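A sketch of how such a worker might be driven, assuming standard-library queues and an already-built model, sess, and config (the job tuple and sentinel follow the code above):

    import queue
    import threading

    in_q, out_q = queue.Queue(), queue.Queue()
    worker = threading.Thread(target=translate_worker,
                              args=(in_q, out_q, model, sess, config))
    worker.start()
    in_q.put((0, minibatch))  # job: (index, batch of sentences)
    in_q.put(None)            # sentinel: tells the worker to exit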
Example #18
    def _translate(self, process_id, input_item, models, sess):
        """
        Actual translation (model sampling).
        """

        # unpack input item attributes
        k = input_item.k
        x = input_item.batch
        #max_ratio = input_item.max_ratio

        y_dummy = numpy.zeros(shape=(len(x), 1))
        x, x_mask, _, _ = prepare_data(x, y_dummy, maxlen=None)

        sample = inference.beam_search(models, sess, x, x_mask, k)
        return sample
Example #19
    def _translate(self, process_id, input_item, ensemble, sess):
        """
        Actual translation (model sampling).
        """

        # unpack input item attributes
        k = input_item.k
        x = input_item.batch
        #max_ratio = input_item.max_ratio

        y_dummy = numpy.zeros(shape=(len(x), 1))
        x, x_mask, _, _ = util.prepare_data(x,
                                            y_dummy,
                                            self._options[0].factors,
                                            maxlen=None)

        sample = ensemble.beam_search(sess, x, x_mask, k)

        return sample
Example #20
def compare_head_tail_keyword_in_mashup(sample_count=50):
    '''
    Use the categories of the APIs called in each mashup (one category per API)
    as keywords, and measure the number of returned nodes and the elapsed time.
    For each group of API categories in a mashup we query in two forms: if the
    keywords to search are ABCD, we search both ABCD and AD, and compare the
    returned node counts and elapsed times.
    Because apiGraph is built from the mashups, the number of returned nodes
    should be less than or equal to the number of APIs the mashup calls.
    :param sample_count:
    :return:
    '''
    sample_dict = mashup.sample_categories_of_mashup(sample_count)

    graph, categories, _ = prepare_data()

    count_options = [2, 3, 4, 5, 6]
    num_of_options = len(count_options)

    num_nodes = np.zeros((3, num_of_options, sample_count))
    costs = np.zeros((3, num_of_options, sample_count))

    for (i, count) in enumerate(count_options):
        keywords_array = sample_dict[count]
        for j in range(sample_count):
            keywords = [keywords_array[j][0], keywords_array[j][-1]]
            num_nodes[0, i, j], costs[0, i, j] = testWithMinimalSteinerTree(
                graph, categories, keywords)
            num_nodes[1, i, j], costs[1, i, j] = testWithRandomSteinerTree(
                graph, categories, keywords)
            num_nodes[2, i, j], costs[2, i, j] = testWithGreedySteinerTree(
                graph, categories, keywords)
            if (j + 1) % 10 == 0:
                print('>')
            else:
                print('>', end='')
        print('================== %d keywords finished ===============' % count)

    results = {}
    results['num_nodes'] = num_nodes.tolist()
    results['costs'] = costs.tolist()

    with open('../outputs/compare_head_tail_keywords_in_mashup.json',
              'w') as f:
        json.dump(results, f)
Example #21
from util import prepare_data, submit
from models.lgb import get_lgb_predictions
from models.nn import get_nn_predictions

if __name__ == "__main__":
    (train_gal, train_exgal, test_gal, test_exgal, gal_class_list,
     exgal_class_list, test_df) = prepare_data()
    lgb_oof_gal, lgb_oof_exgal, lgb_test_gal, lgb_test_exgal = get_lgb_predictions(
        train_gal, train_exgal, test_gal, test_exgal)
    lgb_gal_preds = []
    for i in range(lgb_oof_gal.shape[1]):
        lgb_gal_preds.append("lgb_pred" + str(i))
        train_gal["lgb_pred" + str(i)] = lgb_oof_gal[:, i]
        test_gal["lgb_pred" + str(i)] = lgb_test_gal[:, i]

    lgb_exgal_preds = []
    for i in range(lgb_oof_exgal.shape[1]):
        lgb_exgal_preds.append("lgb_pred" + str(i))
        train_exgal["lgb_pred" + str(i)] = lgb_oof_exgal[:, i]
        test_exgal["lgb_pred" + str(i)] = lgb_test_exgal[:, i]

    oof_preds_gal, oof_preds_exgal, test_preds_gal, test_preds_exgal = get_nn_predictions(
        train_gal, train_exgal, test_gal, test_exgal)

    submit(test_df, test_preds_gal, test_preds_exgal, gal_class_list,
           exgal_class_list, "submissions/stacking.csv")
Example #22
def train(config, sess):
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
    "MAP training requires a prior model file: Use command-line option --prior_model"

    logging.info('Building model...')
    model = StandardModel(config)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
    else:
        logging.error('No valid optimizer defined: {}'.format(
            config.optimizer))
        sys.exit(1)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [],
                                  initializer=init,
                                  trainable=False)

    if config.summaryFreq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, model, optimizer, global_step, writer)

    saver, progress = init_or_restore_variables(config, sess, train=True)

    global_step.load(progress.uidx, sess)

    #save model options
    config_as_dict = OrderedDict(sorted(vars(config).items()))
    with open('%s.json' % config.saveto, 'w') as f:
        json.dump(config_as_dict, f, indent=2)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            print("\n\n\n########## Source Sents ############")
            print(source_sents)
            print("\n\n\n########## Target Sents ############")
            print(target_sents)
            if len(source_sents[0][0]) != config.factors:
                logging.error(
                    'Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'
                    .format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                logging.info(
                    'Minibatch with zero sample under length {0}'.format(
                        config.maxlen))
                continue
            write_summary_for_this_batch = config.summaryFreq and (
                (progress.uidx % config.summaryFreq == 0) or
                (config.finish_after
                 and progress.uidx % config.finish_after == 0))
            (factors, seqLen, batch_size) = x_in.shape

            loss = updater.update(sess, x_in, x_mask_in, y_in, y_mask_in,
                                  write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.dispFreq and progress.uidx % config.dispFreq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info(
                    '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'
                    .format(disp_time, progress.eidx, progress.uidx,
                            total_loss / n_words, n_words / duration,
                            n_sents / duration))
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            if config.sampleFreq and progress.uidx % config.sampleFreq == 0:
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:, :10]
                samples = model.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            if config.beamFreq and progress.uidx % config.beamFreq == 0:
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:, :10]
                samples = model.beam_search(sess, x_small, x_mask_small,
                                            config.beam_size)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost / len(sample))
                        logging.info(msg)

            if config.validFreq and progress.uidx % config.validFreq == 0:
                costs = validate(config, sess, valid_text_iterator, model)
                # validation loss is mean of normalized sentence log probs
                valid_loss = sum(costs) / len(costs)
                if (len(progress.history_errs) == 0
                        or valid_loss < min(progress.history_errs)):
                    progress.history_errs.append(valid_loss)
                    progress.bad_counter = 0
                    saver.save(sess, save_path=config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_loss)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                if config.valid_script is not None:
                    score = validate_with_script(sess, model, config,
                                                 valid_text_iterator)
                    need_to_save = (
                        score is not None
                        and (len(progress.valid_script_scores) == 0
                             or score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        save_path = config.saveto + ".best-valid-script"
                        saver.save(sess, save_path=save_path)
                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.saveFreq and progress.uidx % config.saveFreq == 0:
                saver.save(sess,
                           save_path=config.saveto,
                           global_step=progress.uidx)
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess,
                           save_path=config.saveto,
                           global_step=progress.uidx)
                progress.estop = True
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
Example #23
def player_list():
    entries = prepare_data()
    players = get_player_info(entries)
    return json.dumps({'players': players})
Example #24
def train(config, sess):
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
    "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU

    num_gpus = len(tf_utils.get_available_gpus())
    num_replicas = max(1, num_gpus)

    if config.loss_function == 'MRT':
        assert config.gradient_aggregation_steps == 1
        assert config.max_sentences_per_device == 0, \
            "MRT mode does not support sentence-based split"
        if config.max_tokens_per_device != 0:
            assert (config.samplesN * config.maxlen <= config.max_tokens_per_device), \
                "need to make sure the candidates for a sentence can be fed into the model"
        else:
            assert num_replicas == 1, \
                "MRT mode does not support sentence-based split"
            assert (config.samplesN * config.maxlen <= config.token_batch_size), \
                "need to make sure the candidates for a sentence can be fed into the model"

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i>0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init, trainable=False)

    if config.learning_schedule == "constant":
        schedule = learning_schedule.ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = learning_schedule.TransformerSchedule(
            global_step=global_step,
            dim=config.state_size,
            warmup_steps=config.warmup_steps)
    elif config.learning_schedule == "warmup-plateau-decay":
        schedule = learning_schedule.WarmupPlateauDecaySchedule(
            global_step=global_step,
            peak_learning_rate=config.learning_rate,
            warmup_steps=config.warmup_steps,
            plateau_steps=config.plateau_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=schedule.learning_rate,
                                           beta1=config.adam_beta1,
                                           beta2=config.adam_beta2,
                                           epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step,
                           writer)

    if config.exponential_smoothing > 0.0:
        smoothing = ExponentialSmoothing(config.exponential_smoothing)

    saver, progress = model_loader.init_or_restore_variables(
        config, sess, train=True)

    global_step.load(progress.uidx, sess)

    if config.sample_freq:
        random_sampler = RandomSampler(
            models=[replicas[0]],
            configs=[config],
            beam_size=1)

    if config.beam_freq or config.valid_script is not None:
        beam_search_sampler = BeamSearchSampler(
            models=[replicas[0]],
            configs=[config],
            beam_size=config.beam_size)

    #save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    # Run at most one more epoch when printing per-token probabilities.
    if config.print_per_token_pro:
        config.max_epochs = progress.eidx + 1
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            if len(source_sents[0][0]) != config.factors:
                logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                logging.info('Minibatch with zero sample under length {0}'.format(config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and ((progress.uidx % config.summary_freq == 0) or (config.finish_after and progress.uidx % config.finish_after == 0))
            (factors, seqLen, batch_size) = x_in.shape

            output = updater.update(
                sess, x_in, x_mask_in, y_in, y_mask_in, num_to_target,
                write_summary_for_this_batch)

            if not config.print_per_token_pro:
                total_loss += output
            else:
                # Append the per-token probabilities to the output file.
                with open(config.print_per_token_pro, 'a') as f:
                    for pro in output:
                        f.write(str(pro) + '\n')

            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            # Update the smoothed version of the model variables.
            # To reduce the performance overhead, we only do this once every
            # N steps (the smoothing factor is adjusted accordingly).
            if config.exponential_smoothing > 0.0 and progress.uidx % smoothing.update_frequency == 0:
                sess.run(fetches=smoothing.update_ops)

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info('{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'.format(disp_time, progress.eidx, progress.uidx, total_loss/n_words, n_words/duration, n_sents/duration))
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:, :10]
                samples = translate_utils.translate_batch(
                    sess, random_sampler, x_small, x_mask_small,
                    config.translation_maxlen, 0.0)
                assert len(samples) == len(x_small.T) == len(y_small.T), \
                    (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss[0][0], num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:,:10]
                samples = translate_utils.translate_batch(
                    sess, beam_search_sampler, x_small, x_mask_small,
                    config.translation_maxlen, config.normalization_alpha)
                assert len(samples) == len(x_small.T) == len(y_small.T), \
                    (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost/len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                if config.exponential_smoothing > 0.0:
                    sess.run(fetches=smoothing.swap_ops)
                    valid_ce = validate(sess, replicas[0], config,
                                        valid_text_iterator)
                    sess.run(fetches=smoothing.swap_ops)
                else:
                    valid_ce = validate(sess, replicas[0], config,
                                        valid_text_iterator)
                if (len(progress.history_errs) == 0 or
                    valid_ce < min(progress.history_errs)):
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                if config.valid_script is not None:
                    if config.exponential_smoothing > 0.0:
                        sess.run(fetches=smoothing.swap_ops)
                        score = validate_with_script(sess, beam_search_sampler)
                        sess.run(fetches=smoothing.swap_ops)
                    else:
                        score = validate_with_script(sess, beam_search_sampler)
                    need_to_save = (score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)

                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess, save_path=config.saveto, global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))

                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess, save_path=config.saveto, global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))

                progress.estop=True
                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
Example #25
            # use the gradient to update the model parameters
            optimizer.step()
            iters += 1
            if iters % 300 == 0:
                for test_val, test_target in test_loader:
                    test_outputs = Mymodel(test_val)
                    loss2 = loss_function(test_outputs, test_target)
                print('Iteration: {}. TrainLoss: {}. TestLoss: {}'.format(
                    iters, loss.item(), loss2.item()))
                torch.save(
                    Mymodel.state_dict(),
                    'Trained_model/trained_model_' + str(iters) + '.pkl')

    plt.plot(hisloss)
    plt.xlabel('Iteration')
    plt.ylabel('Training loss')
    plt.title('Training process')
    plt.grid(True)
    plt.savefig('Trained_model/loss.png')
    return Mymodel


def prediction(Mymodel, seq):
    return


if __name__ == "__main__":
    data = get_ts_dxy(1)
    train, test = prepare_data(data, 8)
    Mymodel = training_model(train, test, num_epochs=10)
Example #26
def train(in_args):
    """Starts the training process of the neural network.
    """
    ## Preparing data
    train_loader, test_loader, validation_loader, number_of_classes, data_labels = util.prepare_data(
        in_args.data_directory)
    # Preparing the model, criterion, and optimizer
    if in_args.checkpoint:
        model, data_labels, criterion, optimizer = load_checkpoint(
            in_args.checkpoint, in_args.gpu, True)
    else:
        model = create_model(gpu=in_args.gpu,
                             arch=in_args.arch,
                             hidden_layer_size=in_args.hidden_units,
                             output_size=number_of_classes)
        criterion = nn.NLLLoss()
        optimizer = optim.Adam(model.classifier.parameters(),
                               lr=in_args.learning_rate)
    if model:
        device = 'cuda' if in_args.gpu else 'cpu'
        running_loss = 0
        for epoch in range(in_args.epochs):
            steps = 0
            for inputs, labels in train_loader:
                steps += 1
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                output = model(inputs)
                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()

                if steps % 5 == 0:
                    test_loss = 0
                    accuracy = 0
                    model.eval()

                    with torch.no_grad():
                        for inputs, labels in test_loader:
                            inputs, labels = inputs.to(device), labels.to(
                                device)
                            output = model(inputs)
                            batch_loss = criterion(output, labels)
                            test_loss += batch_loss.item()
                            probability = torch.exp(output)
                            top_p, top_class = probability.topk(1, dim=1)
                            equals = top_class == labels.view(*top_class.shape)
                            accuracy += torch.mean(
                                equals.type(torch.FloatTensor)).item()
                    print("---------")
                    print("Epoch {}/{}. Step {}.".format(
                        (epoch + 1), in_args.epochs, steps))
                    print("Train loss: {}".format("%.3f" % (running_loss / 5)))
                    print("Test loss: {}".format(
                        "%.3f" % (test_loss / len(test_loader))))
                    print("Test accuracy: {}".format(
                        "%.3f" % (accuracy / len(test_loader))))
                    print("---------")
                    running_loss = 0
                    model.train()

        save_directory = in_args.save_dir
        print(
            "Network training finished.\nPlease wait while progress is being saved to file {}."
            .format(save_directory))
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'hidden_layer_size': in_args.hidden_units,
            'output_size': number_of_classes,
            'architecture': in_args.arch,
            'data_labels': data_labels
        }
        torch.save(checkpoint, save_directory)
        print(
            "The neural network will perform a validation test. Please wait.")
        validation_loss = 0
        accuracy = 0
        model.eval()
        with torch.no_grad():
            for inputs, labels in validation_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                output = model(inputs)
                batch_loss = criterion(output, labels)
                validation_loss += batch_loss.item()
                probability = torch.exp(output)
                top_p, top_class = probability.topk(1, dim=1)
                equals = top_class == labels.view(*top_class.shape)
                accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

        print("Validation loss: {}".format(
            "%.3f" % (validation_loss / len(validation_loader))))
        print("Validation accuracy: {}".format(
            "%.3f" % (accuracy / len(validation_loader))))
        print("Above results show expected performance during inference.")
Example #27
def train(config, sess):
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \
    "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU

    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i>0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init, trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=schedule.learning_rate,
                                           beta1=config.adam_beta1,
                                           beta2=config.adam_beta2,
                                           epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step,
                           writer)

    saver, progress = model_loader.init_or_restore_variables(
        config, sess, train=True)

    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    #save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            if len(source_sents[0][0]) != config.factors:
                logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                logging.info('Minibatch with zero sample under length {0}'.format(config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and ((progress.uidx % config.summary_freq == 0) or (config.finish_after and progress.uidx % config.finish_after == 0))
            (factors, seqLen, batch_size) = x_in.shape

            loss = updater.update(sess, x_in, x_mask_in, y_in, y_mask_in,
                                  write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info('{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'.format(disp_time, progress.eidx, progress.uidx, total_loss/n_words, n_words/duration, n_sents/duration))
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :10], x_mask_in[:, :10], y_in[:, :10]
                samples = model_set.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                x_small, x_mask_small, y_small = x_in[:, :, :10], x_mask_in[:, :10], y_in[:,:10]
                samples = model_set.beam_search(sess, x_small, x_mask_small,
                                               config.beam_size,
                                               normalization_alpha=config.normalization_alpha)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost/len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                valid_ce = validate(sess, replicas[0], config,
                                    valid_text_iterator)
                if (len(progress.history_errs) == 0 or
                    valid_ce < min(progress.history_errs)):
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                if config.valid_script is not None:
                    score = validate_with_script(sess, replicas[0], config)
                    need_to_save = (score is not None and
                        (len(progress.valid_script_scores) == 0 or
                         score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)

                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess, save_path=config.saveto, global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))

                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess, save_path=config.saveto, global_step=progress.uidx)
                write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx))

                progress.estop=True
                progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
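
For reference, a minimal sketch of the warmup formula that TransformerSchedule
typically implements (an assumption based on the standard schedule from
"Attention Is All You Need", not code taken from this repository):

def transformer_lr(step, dim, warmup_steps):
    # lr = dim^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    step = max(step, 1)  # guard against division by zero at step 0
    return dim ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
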
Example #28
def training_2d(train, test):
    config = Config(sampling_rate=44100, audio_duration=2, n_folds=n_folds, 
                    learning_rate=0.001, use_mfcc=True, n_mfcc=40)
    if DEBUG:
        config = Config(sampling_rate=44100, audio_duration=2, n_folds=n_folds, 
                        max_epochs=1, use_mfcc=True, n_mfcc=40)

    X_train = prepare_data(train, config, '../data/audio_train/')
    X_test = prepare_data(test, config, '../data/audio_test/')
    y_train = to_categorical(train.label_idx, num_classes=config.n_classes)
    
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    
    X_train = (X_train - mean)/std
    X_test = (X_test - mean)/std
    
    PREDICTION_FOLDER = "predictions_2d_conv"
    if not os.path.exists(PREDICTION_FOLDER):
        os.mkdir(PREDICTION_FOLDER)
    if os.path.exists('logs/' + PREDICTION_FOLDER):
        shutil.rmtree('logs/' + PREDICTION_FOLDER)
    
    # StratifiedKFold(y, n_folds=...) is the pre-0.18 scikit-learn API;
    # the modern interface takes the features and labels in split().
    skf = StratifiedKFold(n_splits=config.n_folds)
    for i, (train_split, val_split) in enumerate(
            skf.split(X_train, train.label_idx)):
        K.clear_session()
        X, y, X_val, y_val = X_train[train_split], y_train[train_split], X_train[val_split], y_train[val_split]
        checkpoint = ModelCheckpoint('../model/best2d_%d.h5'%i, monitor='val_acc', verbose=1, save_best_only=True)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
        tb = TensorBoard(log_dir='./logs/' + PREDICTION_FOLDER + '/fold_%i'%i, write_graph=True)
        callbacks_list = [checkpoint, early, tb]
        print("#"*50)
        print("Fold: ", i)
        model = get_2d_conv_model_advance(config)
        history = model.fit(X, y, validation_data=(X_val, y_val), callbacks=callbacks_list, 
                            batch_size=64, epochs=config.max_epochs)
        model.load_weights('../model/best2d_%d.h5'%i)
    
        # Save train predictions
        #predictions = model.predict(X_train, batch_size=64, verbose=1)
        #np.save(PREDICTION_FOLDER + "/train_predictions_%d.npy"%i, predictions)
    
        # Save test predictions
        predictions = model.predict(X_test, batch_size=64, verbose=1)
        np.save(PREDICTION_FOLDER + "/test_predictions_%d.npy"%i, predictions)
    
        K.clear_session()
        
    pred_list = []
    for i in range(n_folds):
        pred_list.append(np.load("predictions_2d_conv/test_predictions_%d.npy"%i))
    prediction = np.ones_like(pred_list[0])
    for pred in pred_list:
        prediction = prediction*pred
    prediction = prediction**(1./len(pred_list))
    # Make a submission file
    top_3 = np.array(LABELS)[np.argsort(-prediction, axis=1)[:, :3]]
    predicted_labels = [' '.join(list(x)) for x in top_3]
    test = pd.read_csv('../data/sample_submission.csv')
    test['label'] = predicted_labels
    
    test[['fname', 'label']].to_csv("../result/2d_conv.csv", index=False)
    if not DEBUG:
        command = ('/home/kownse/anaconda3/bin/kaggle competitions submit '
                   '-c freesound-audio-tagging -f ../result/2d_conv.csv '
                   '-m "submit"')
        os.system(command)
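
The fold predictions above are blended with a geometric mean: multiply the
per-fold probabilities, then take the n-th root. A compact equivalent, as a
sketch (assumes strictly positive entries such as softmax outputs):

import numpy as np

def geometric_mean(pred_list):
    # (folds, samples, classes) -> (samples, classes); identical to
    # multiplying the fold predictions and taking the len(pred_list)-th
    # root, which damps the influence of any single outlier fold.
    stacked = np.stack(pred_list)
    return np.exp(np.log(stacked).mean(axis=0))
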
Example #29
def train(config, sess):
    ####################################################
    assert ((config.prior_model is not None and
             tf.train.checkpoint_exists(os.path.abspath(config.prior_model)))
            or config.map_decay_c == 0.0), \
        "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU

    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [],
                                  initializer=init,
                                  trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(
            config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step,
                           writer)

    saver, progress = model_loader.init_or_restore_variables(config,
                                                             sess,
                                                             train=True)

    ############################################################
    # added: pre-training stage
    if config.pretrain:
        logging.info("Start pre-training")
        # pre-training hyperparameters
        pre_batch_size = 1000
        epochs = 20
        pre_learning_rate = 0.001
        pre_optimizer = tf.train.GradientDescentOptimizer(
            pre_learning_rate).minimize(replicas[0].loss_pre_train)
        # load the pre-training data and associated dictionaries
        gvocab, gvectors = util.pre_load_data(config.pretrain_vocab,
                                              config.pretrain_vectors)
        pre_vocab_list = list(gvocab.keys())
        # oversampling by token frequency
        pre_train_list = []
        with open('/media/ntfs-3/EXP/MULTI/mix/zh-en/data3/glove/vocab.txt',
                  'r',
                  encoding='utf-8') as f:
            for line in f:
                k, v = line.strip().split()
                pre_train_list.extend([k] * int(v))
        utf8_dict = json.load(
            open(config.source_dicts[0], 'r', encoding='utf-8'))
        embedding_list = []
        # start training
        for i in range(epochs):
            logging.info("epoch:{}".format(i))
            if i == epochs - 1:
                source_x, source_y, _vocab = util.get_data(pre_vocab_list,
                                                           pre_batch_size,
                                                           gvocab,
                                                           gvectors,
                                                           utf8_dict,
                                                           shuffle=False)
            else:
                source_x, source_y, _vocab = util.get_data(pre_train_list,
                                                           pre_batch_size,
                                                           gvocab,
                                                           gvectors,
                                                           utf8_dict,
                                                           shuffle=True)
            for idx, [s_x, s_y] in enumerate(zip(source_x, source_y)):
                assert len(s_x) == len(s_y), "{}, {}".format(
                    len(s_x), len(s_y))
                sx, sy = util.pre_prepare_data(s_x, s_y)
                feed_dict = {}
                feed_dict[replicas[0].pre_inputs.x] = sx
                feed_dict[replicas[0].pre_inputs.y] = sy
                _, loss, embedding = sess.run(
                    [pre_optimizer, replicas[0].loss_pre_train,
                     replicas[0].pre_embedding],
                    feed_dict=feed_dict)
                if idx % 100 == 0:
                    logging.info("loss:{}".format(loss))
                if i == epochs - 1:
                    embedding_list.append(embedding)
        assert _vocab == pre_vocab_list
        emb = embedding_list[0]
        for e in embedding_list[1:]:
            emb = numpy.concatenate((emb, e))
        numpy.save("pre_emb/pre_emb.npy", emb)
        with open("pre_emb/vocab", "w", encoding="utf-8") as f:
            f.write("\n".join(pre_vocab_list))
        # t-SNE visualization
        tsne = util.get_tsne(emb, "pre_emb/tsne.npy")
        gtsne = numpy.load(config.pretrain_tsne)
        #util.plot_tsne(_vocab, tsne, gvocab, gtsne, top=20)
        #exit(0)
    ##################################################################################

    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    #save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for pre_source_sents, source_sents, target_sents in text_iterator:
            #if len(source_sents[0][0]) != config.factors:
            #logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0])))
            #sys.exit(1)

            px_in, x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents,
                target_sents,
                config.factors,
                pre_source_sents,
                maxlen=None)

            if x_in is None:
                logging.info(
                    'Minibatch with zero sample under length {0}'.format(
                        config.maxlen))
                continue

            write_summary_for_this_batch = config.summary_freq and (
                (progress.uidx % config.summary_freq == 0) or
                (config.finish_after
                 and progress.uidx % config.finish_after == 0))
            (factors, seqLen, uLen, batch_size) = x_in.shape

            loss = updater.update(sess, px_in, x_in, x_mask_in, y_in,
                                  y_mask_in, write_summary_for_this_batch)

            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info(
                    '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'
                    .format(disp_time, progress.eidx, progress.uidx,
                            total_loss / n_words, n_words / duration,
                            n_sents / duration))
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                x_small = x_in[:, :, :, :10]
                x_mask_small = x_mask_in[:, :, :10]
                y_small = y_in[:, :10]
                samples = model_set.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    #source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    #logging.info('SOURCE: {}'.format(source))
                    #logging.info('SOURCE: {}'.format(xx))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                x_small = x_in[:, :, :, :10]
                x_mask_small = x_mask_in[:, :, :10]
                y_small = y_in[:, :10]
                samples = model_set.beam_search(
                    sess,
                    x_small,
                    x_mask_small,
                    config.beam_size,
                    normalization_alpha=config.normalization_alpha)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    #source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    #logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost / len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                valid_ce = validate(sess, replicas[0], config,
                                    valid_text_iterator)
                if (len(progress.history_errs) == 0
                        or valid_ce < min(progress.history_errs)):
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                if config.valid_script is not None:
                    score = validate_with_script(sess, replicas[0], config)
                    need_to_save = (
                        score is not None
                        and (len(progress.valid_script_scores) == 0
                             or score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)

                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess,
                           save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))

                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess,
                           save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))

                progress.estop = True
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
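
The oversampling block above builds pre_train_list by repeating each token
once per corpus occurrence, so shuffled pre-training batches draw tokens
roughly in proportion to their frequency. As a standalone sketch (names are
illustrative, not from the original):

def frequency_oversample(vocab_counts):
    # vocab_counts: dict mapping token -> occurrence count
    pool = []
    for token, count in vocab_counts.items():
        pool.extend([token] * count)
    return pool
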
Example #30
def train(config, sess):
    assert ((config.prior_model is not None and
             tf.train.checkpoint_exists(os.path.abspath(config.prior_model)))
            or config.map_decay_c == 0.0), \
        "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU

    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [],
                                  initializer=init,
                                  trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(
            config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step,
                           writer)

    saver, progress = model_loader.init_or_restore_variables(config,
                                                             sess,
                                                             train=True)

    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    #save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()
    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for source_sents, target_sents in text_iterator:
            if len(source_sents[0][0]) != config.factors:
                logging.error(
                    'Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'
                    .format(config.factors, len(source_sents[0][0])))
                sys.exit(1)
            x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, maxlen=None)
            if x_in is None:
                logging.info(
                    'Minibatch with zero sample under length {0}'.format(
                        config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and (
                (progress.uidx % config.summary_freq == 0) or
                (config.finish_after
                 and progress.uidx % config.finish_after == 0))
            (factors, seqLen, batch_size) = x_in.shape

            loss = updater.update(sess, x_in, x_mask_in, y_in, y_mask_in,
                                  write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info(
                    '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'
                    .format(disp_time, progress.eidx, progress.uidx,
                            total_loss / n_words, n_words / duration,
                            n_sents / duration))
                last_time = time.time()
                total_loss = 0.
                n_sents = 0
                n_words = 0

            if config.sample_freq and progress.uidx % config.sample_freq == 0:
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:, :10]
                samples = model_set.sample(sess, x_small, x_mask_small)
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    sample = util.seq2words(ss, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    logging.info('SAMPLE: {}'.format(sample))

            if config.beam_freq and progress.uidx % config.beam_freq == 0:
                x_small = x_in[:, :, :10]
                x_mask_small = x_mask_in[:, :10]
                y_small = y_in[:, :10]
                samples = model_set.beam_search(
                    sess,
                    x_small,
                    x_mask_small,
                    config.beam_size,
                    normalization_alpha=config.normalization_alpha)
                # samples is a list with shape batch x beam x len
                assert len(samples) == len(x_small.T) == len(
                    y_small.T), (len(samples), x_small.shape, y_small.shape)
                for xx, yy, ss in zip(x_small.T, y_small.T, samples):
                    source = util.factoredseq2words(xx, num_to_source)
                    target = util.seq2words(yy, num_to_target)
                    logging.info('SOURCE: {}'.format(source))
                    logging.info('TARGET: {}'.format(target))
                    for i, (sample_seq, cost) in enumerate(ss):
                        sample = util.seq2words(sample_seq, num_to_target)
                        msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format(
                            i, sample, cost, len(sample), cost / len(sample))
                        logging.info(msg)

            if config.valid_freq and progress.uidx % config.valid_freq == 0:
                valid_ce = validate(sess, replicas[0], config,
                                    valid_text_iterator)
                if (len(progress.history_errs) == 0
                        or valid_ce < min(progress.history_errs)):
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter = 0
                    save_non_checkpoint(sess, saver, config.saveto)
                    progress_path = '{0}.progress.json'.format(config.saveto)
                    progress.save_to_json(progress_path)
                else:
                    progress.history_errs.append(valid_ce)
                    progress.bad_counter += 1
                    if progress.bad_counter > config.patience:
                        logging.info('Early Stop!')
                        progress.estop = True
                        break
                if config.valid_script is not None:
                    score = validate_with_script(sess, replicas[0], config)
                    need_to_save = (
                        score is not None
                        and (len(progress.valid_script_scores) == 0
                             or score > max(progress.valid_script_scores)))
                    if score is None:
                        score = 0.0  # ensure a valid value is written
                    progress.valid_script_scores.append(score)
                    if need_to_save:
                        progress.bad_counter = 0
                        save_path = config.saveto + ".best-valid-script"
                        save_non_checkpoint(sess, saver, save_path)
                        write_config_to_json_file(config, save_path)

                        progress_path = '{}.progress.json'.format(save_path)
                        progress.save_to_json(progress_path)

            if config.save_freq and progress.uidx % config.save_freq == 0:
                saver.save(sess,
                           save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))

                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)

            if config.finish_after and progress.uidx % config.finish_after == 0:
                logging.info("Maximum number of updates reached")
                saver.save(sess,
                           save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))

                progress.estop = True
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break
        if progress.estop:
            break
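
The validation blocks in the train() variants above implement patience-based
early stopping: the best validation cross-entropy is tracked, and training
stops once no improvement is seen for more than config.patience consecutive
validations. The core logic, isolated as a sketch:

def should_stop(history_errs, valid_ce, bad_counter, patience):
    # Returns the updated bad_counter and whether to stop.
    if not history_errs or valid_ce < min(history_errs):
        return 0, False  # new best: reset the counter
    bad_counter += 1
    return bad_counter, bad_counter > patience
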
Example #31
def max_sim(simulation, arguments, group):
    dataset = [simulation(*arguments) for i in range(NUM_SIM)]
    data = prepare_data(dataset, group)
    _, _, _, _, _, m = merge_curves(data, len(dataset), 1)
    return m
Example #32
from util import get_data

df = get_data()
from util import prepare_data

cat_df_list = list(df.select_dtypes(include=['object']))
num_df_list = list(df.select_dtypes(include=['float64', 'int64']))
X = df[num_df_list]
range_n_clusters = [2, 3]  # earlier trials also used [2, 3, 4, 5, 6] and [4, 5, 6]
from sklearn.preprocessing import StandardScaler

y = df["readmitted"]
X = prepare_data(df)
X.drop("readmitted", inplace=True, axis=1)
scaler = StandardScaler()
X = scaler.fit_transform(X)

from imblearn.under_sampling import (RandomUnderSampler, ClusterCentroids,
                                     TomekLinks, NeighbourhoodCleaningRule,
                                     NearMiss)

sampler = NearMiss(n_jobs=2)
X_rs, y_rs = sampler.fit_sample(X, y)  # renamed fit_resample in newer imbalanced-learn

from sklearn.manifold import TSNE

transformer = TSNE(n_components=2)  # the default barnes_hut method supports at most 3 components
X_std = transformer.fit_transform(X_rs)
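
A short usage sketch (not in the original) for inspecting the 2-D embedding
computed above; pd.factorize is used because the readmitted labels may be
strings rather than integers:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

codes = pd.factorize(np.asarray(y_rs))[0]  # map class labels to integer codes
plt.scatter(X_std[:, 0], X_std[:, 1], c=codes, s=5, cmap='coolwarm')
plt.title('t-SNE of NearMiss-undersampled data')
plt.show()
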
Example #33
import os

import numpy as np
import tensorflow as tf
import helpers
import util
import config as cfg
import matplotlib.pyplot as plt
from keras import backend as K

K.tensorflow_backend._get_available_gpus()

# this needs to get generalized
class_names_list, label_values = helpers.get_label_info(
    os.path.join("CamVid", "class_dict.csv"))

num_classes = len(label_values)

# Load the data
print("Loading the data ...")
train_input_names, train_output_names, val_input_names, val_output_names, test_input_names, test_output_names = util.prepare_data(
    cfg.DATASET_DIR)

input_data = []
output_labels = []
val_data = []
val_labels = []

for img_name in train_input_names:
    input_image = util.load_image(img_name)
    with tf.device('/cpu:0'):
        input_image = np.float32(input_image) / 255.0

        #input_data.append(np.expand_dims(input_image, axis=0))
        input_data.append(input_image)
        print(img_name)