parser.add_argument('--top100_labels', action='store_true', default=False)
    parser.add_argument('--for_rnn', action='store_true', default=False)
    args = parser.parse_args()

    db = DatabaseManager()

    start = datetime.datetime.now()
    time_str = start.strftime("%m%d_%H%M%S")
    config = vars(args)
    experiment_id = db.bag_of_words_generator_experiment_create(config, start)

    log_filename = '{}_bag_of_words_generator.log'.format(experiment_id)
    db.bag_of_words_generator_experiment_insert_log_file(
        experiment_id, log_filename)

    logger = logging_utils.build_logger(log_filename).getLogger(
        'bag_of_words_generator')
    logger.info('Program start, bag of words generator experiment id = %s',
                experiment_id)
    logger.info(config)

    vocabulary = db.load_vocabulary(args.vocabulary_experiment)
    logger.info('Vocabulary loaded')
    logger.info('Vocabulary length = %s', len(vocabulary))

    table_name = get_table_name(args, experiment_id)

    # Get the corpus and prepare the bag of words generator.
    db = DatabaseManager()
    subject_ids, corpus, chart_dates = db.get_corpus(
        toy_set=args.toy_set,
        top100_labels=args.top100_labels)  # The excerpt is truncated here; the call is closed for valid syntax.
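# get_table_name(), called above, is not shown in this excerpt. A minimal
# hypothetical sketch, assuming the table name encodes the experiment id and
# the --for_rnn flag (the naming scheme is an assumption, not this repo's code):
def get_table_name(args, experiment_id):
    suffix = 'rnn' if args.for_rnn else 'bow'
    return 'bag_of_words_{}_{}'.format(experiment_id, suffix)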
Example #2
    args = parser.parse_args()

    db = DatabaseManager()

    start = datetime.datetime.now()
    time_str = start.strftime("%m%d_%H%M%S")
    config = vars(args)
    experiment_id = db.classifier_experiment_create(config, start,
                                                    'logistic_regression',
                                                    args.train_table_name,
                                                    None, args.test_table_name)

    log_filename = '{}_logistic_regression.log'.format(experiment_id)
    db.classifier_experiment_insert_log_file(experiment_id, log_filename)

    logger = logging_utils.build_logger(log_filename).getLogger(
        'logistic_regression')
    logger.info('Program start, classifier experiment id = %s', experiment_id)
    logger.info(args)

    X_train, Y_train = load_X_Y(
        args.train_table_name,
        top100_labels=args.top100_labels,
        normalize_by_npatients=not args.dont_normalize)
    # TODO: This is correct, but it would be nicer if we knew the vocabulary
    # length beforehand and provided it to load_X_Y().
    n_features = X_train.shape[1]
    logger.info('X_train, Y_train loaded')

    classifiers = train_classifiers(X_train, Y_train)

    logger.info('Building result matrix for training set')
    number_of_patients_training_set = Y_train[0].shape[0]
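# train_classifiers(), called above, is not shown in this excerpt. A minimal
# hypothetical sketch, assuming Y_train is a sequence of binary label vectors
# (consistent with Y_train[0].shape above) and that one independent
# scikit-learn LogisticRegression is fit per label; the one-vs-rest layout
# and the max_iter value are assumptions:
from sklearn.linear_model import LogisticRegression

def train_classifiers(X_train, Y_train):
    classifiers = []
    for y in Y_train:  # One binary label vector per target label.
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, y)
        classifiers.append(clf)
    return classifiers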
Example #3
    args = parser.parse_args()

    db = DatabaseManager()

    start = datetime.datetime.now()
    time_str = start.strftime("%m%d_%H%M%S")
    config = vars(args)
    experiment_id = db.classifier_experiment_create(config, start, 'nnff',
                                                    args.train_table_name,
                                                    args.val_table_name,
                                                    args.test_table_name)

    log_filename = '{}_nnff.log'.format(experiment_id)
    db.classifier_experiment_insert_log_file(experiment_id, log_filename)

    logger = logging_utils.build_logger(log_filename).getLogger('feed_forward')
    logger.info('Program start, classifier experiment id = %s', experiment_id)
    logger.info(args)

    X_train, Y_train = tensor_loader.load_X_Y(logger, args.train_table_name,
                                              args.no_gpu)
    X_val, Y_val = tensor_loader.load_X_Y(logger,
                                          args.val_table_name,
                                          args.no_gpu,
                                          validation_set=True)

    N, D_in = X_train.shape  # Number of samples, number of features.
    # Dimensions of the first and second hidden layers, and of the output vector.
    if args.top100_labels:
        H1, H2, D_out = 1000, 1000, 100
    else:
        H1, H2, D_out = 300, 100, 10
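    # A minimal sketch (an assumption, not this repo's code) of a feed-forward
    # model matching the dimensions chosen above, in PyTorch:
    import torch

    model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H1),   # Input features to first hidden layer.
        torch.nn.ReLU(),
        torch.nn.Linear(H1, H2),     # First to second hidden layer.
        torch.nn.ReLU(),
        torch.nn.Linear(H2, D_out),  # Second hidden layer to one logit per label.
    )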
Example #4
    parser.add_argument('--toy_set',  # Flag name inferred from args.toy_set below; the opening of this call is cut off in the excerpt.
                        nargs='?',
                        const=700,
                        help='how many rows to fetch from the corpus table')
    parser.add_argument('--top100_labels', action='store_true', default=False)
    args = parser.parse_args()

    db = DatabaseManager()

    start = datetime.datetime.now()
    time_str = start.strftime("%m%d_%H%M%S")
    config = vars(args)
    experiment_id = db.vocabulary_experiment_create(config, start)

    log_filename = '{}_vocabulary_generator.log'.format(experiment_id)
    db.vocabulary_experiment_insert_log_file(experiment_id, log_filename)

    logger = logging_utils.build_logger(log_filename).getLogger(
        'vocabulary_generator')
    logger.info('Program start, vocabulary experiment id = %s', experiment_id)
    logger.info(config)

    _, corpus, _ = db.get_corpus(toy_set=args.toy_set,
                                 top100_labels=args.top100_labels)

    vocabulary_generator = VocabularyGenerator(corpus, logger)
    vocabulary = vocabulary_generator.build_vocabulary()

    end = datetime.datetime.now()
    db.vocabulary_experiment_insert_vocabulary(experiment_id, end, vocabulary)
    logger.info('Vocabulary inserted into database')
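# VocabularyGenerator is not shown in this excerpt. A minimal hypothetical
# sketch, assuming build_vocabulary() keeps tokens that occur at least
# min_count times in a whitespace-tokenized corpus; the tokenization and the
# frequency cutoff are assumptions:
from collections import Counter

class VocabularyGenerator:
    def __init__(self, corpus, logger, min_count=5):
        self.corpus = corpus
        self.logger = logger
        self.min_count = min_count

    def build_vocabulary(self):
        counts = Counter(token for note in self.corpus
                         for token in note.lower().split())
        vocabulary = sorted(t for t, c in counts.items() if c >= self.min_count)
        self.logger.info('Kept %d of %d distinct tokens',
                         len(vocabulary), len(counts))
        return vocabulary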
Example #5
    parser.add_argument('test_table_name')
    parser.add_argument('--top100_labels', action='store_true', default=False)
    parser.add_argument('--no_gpu', action='store_true', default=False)
    args = parser.parse_args()

    db = DatabaseManager()

    start = datetime.datetime.now()
    time_str = start.strftime("%m%d_%H%M%S")
    config = vars(args)
    experiment_id = db.classifier_experiment_create(config, start, 'rnn',
                                                    args.train_table_name,
                                                    args.val_table_name,
                                                    args.test_table_name)

    log_filename = '{}_rnn.log'.format(experiment_id)
    db.classifier_experiment_insert_log_file(experiment_id, log_filename)

    logger = logging_utils.build_logger(log_filename).getLogger('rnn')
    logger.info('Program start, classifier experiment id = %s', experiment_id)
    logger.info(args)

    # We can't fit all of the notes into memory, so split the patients into
    # chunks. Ensure (number of patients / total_chunks) > 1 so no chunk is empty.
    total_chunks = 20  # TODO: move to program args.

    # Load the first chunk to get number of input features.
    X_train, Y_train = tensor_loader.load_X_Y_rnn(
        logger, args.train_table_name, chunk=0,
        total_chunks=total_chunks, no_gpu=args.no_gpu)
    X_val, Y_val = tensor_loader.load_X_Y_rnn(
        logger, args.val_table_name, chunk=0,
        total_chunks=total_chunks, no_gpu=args.no_gpu, validation_set=True)

    N, seq_length, D_in = X_train.shape  # Number of samples, sequence length, number of features.
    # Dimensions of the hidden units and of the output vector.
    if args.top100_labels:
        H, D_out = 1000, 100
    else:
        H, D_out = 300, 10  # Assumption: the excerpt ends mid-branch; values chosen by analogy with the feed-forward example.
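    # Hypothetical sketch of the chunked training pass implied by the comment
    # above: each remaining chunk of patients is loaded and processed in turn.
    # Only load_X_Y_rnn's chunk/total_chunks interface comes from the excerpt;
    # the loop itself is an assumption.
    for chunk in range(1, total_chunks):  # Chunk 0 is already loaded above.
        X_chunk, Y_chunk = tensor_loader.load_X_Y_rnn(
            logger, args.train_table_name, chunk=chunk,
            total_chunks=total_chunks, no_gpu=args.no_gpu)
        # ... run the forward/backward pass on (X_chunk, Y_chunk) here ...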