Example #1
        # flatten the nested list of labels into a single list
        data.y = [item for sublist in data.y for item in sublist]
        assert len(data.x[0]) == len(data.y)

    # save the data for the CNN since it takes forever to generate
    # also save the concept dict order for faster prediction
    import pickle
    concept_order = uniq(concept.ids)
    data = [tr_data, val_data, concept_order]
    with open('gitig_new_data.pickle', 'wb') as f:
        pickle.dump(data, f, protocol=4)
    logger.info('Mentions and concepts saved.')
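# Sketch (assumption, not part of the original script): a later run can
# restore the cached objects, in the same order they were dumped above,
# instead of regenerating them.
import os, pickle
if os.path.exists('gitig_new_data.pickle'):
    with open('gitig_new_data.pickle', 'rb') as f:
        tr_data, val_data, concept_order = pickle.load(f)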


# CNN
if not int(config['model']['use_saved_model']):    # train new model
    import cnn, model_tools
    cnn.print_input(tr_data)
    model = cnn.build_model(config,tr_data,vocabulary,pretrained)


    # select hardest training samples from preliminary training
    if config.getint('training','sample_hard'):
        import sp_training
        from datetime import datetime
        # from callback import EarlyStoppingRankingAccuracy
        # evaluation_function_1 = EarlyStoppingRankingAccuracy(config,val_data)
        from callback import EarlyStoppingRankingAccuracySpedUp
        evaluation_function = EarlyStoppingRankingAccuracySpedUp(config,val_data,concept.padded,corpus_dev.padded,pretrained)
        
        try:
            with open('gitig_new_tr_data_ratio.pickle', 'rb') as f:
                new_tr_data = pickle.load(f)
            logger.info('Using saved subsampled data')
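        # Assumed completion (the example is cut off here): fall back to
        # regenerating the subsample when no cached file exists. The
        # placeholder below is hypothetical, not the original sp_training call.
        except FileNotFoundError:
            logger.info('No saved subsampled data; regenerating with sp_training')
            new_tr_data = None  # hypothetical placeholder for the sp_training step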
Example #2
        import pickle
        data = [corpus_train.elmo, corpus_dev.elmo, can_list.elmo]
        with open(config['embedding']['cache_elmo_emb'], 'wb') as f:
            pickle.dump(data, f, protocol=4)
        logger.info('Elmo embedding for mentions and candidates saved.')
    import numpy as np
    # append the ELMo-based features for each corpus to its dataset inputs
    for corpus, data in zip([corpus_train, corpus_dev],
                            [training_data, val_data]):
        data.x.extend(
            np.array(vectorizer_elmo.elmo_format_x(corpus.elmo,
                                                   can_list.elmo)))
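    # Sketch (assumption, not part of the original script): on a later run,
    # the cached ELMo embeddings could be restored before this point, in the
    # same order they were dumped above:
    #   with open(config['embedding']['cache_elmo_emb'], 'rb') as f:
    #       corpus_train.elmo, corpus_dev.elmo, can_list.elmo = pickle.load(f)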

if not int(config['model']['use_saved_model']):  # train new model
    import cnn, model_tools
    from callback import EarlyStoppingRankingAccuracy
    evaluation_function = EarlyStoppingRankingAccuracy(config, val_data)
    cnn.print_input(training_data)
    model = cnn.build_model(config, training_data, vocabulary, pretrained)

    if int(config['settings']['imp_tr']):
        # importance sampling: bias batch selection toward informative samples
        from importance_sampling.training import ImportanceTraining
        logger.warning('Using truncated data!')
        # cap each input array at the first 1,000,000 samples
        fake_data_x = [a[:1000000] for a in training_data.x]
        hist = ImportanceTraining(model).fit(fake_data_x,
                                             training_data.y[:1000000],
                                             epochs=int(
                                                 config['training']['epoch']),
                                             batch_size=100,
                                             callbacks=[evaluation_function])
    else:
        logger.warning('Using truncated data!')
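        # Assumed continuation (the example is cut off here): without
        # importance sampling, the same truncated data would go through a
        # plain Keras fit; ImportanceTraining.fit mirrors model.fit's signature.
        fake_data_x = [a[:1000000] for a in training_data.x]
        hist = model.fit(fake_data_x,
                         training_data.y[:1000000],
                         epochs=int(config['training']['epoch']),
                         batch_size=100,
                         callbacks=[evaluation_function])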