parser.add_argument('--top100_labels', action='store_true', default=False)
parser.add_argument('--for_rnn', action='store_true', default=False)
args = parser.parse_args()

db = DatabaseManager()
start = datetime.datetime.now()
time_str = start.strftime("%m%d_%H%M%S")
config = vars(args)
experiment_id = db.bag_of_words_generator_experiment_create(config, start)

log_filename = '{}_bag_of_words_generator.log'.format(experiment_id)
db.bag_of_words_generator_experiment_insert_log_file(experiment_id,
                                                     log_filename)
logger = logging_utils.build_logger(log_filename).getLogger(
    'bag_of_words_generator')
logger.info('Program start, bag of words generator experiment id = %s',
            experiment_id)
logger.info(config)

vocabulary = db.load_vocabulary(args.vocabulary_experiment)
logger.info('Vocabulary loaded')
logger.info('Vocabulary length = %s', len(vocabulary))

table_name = get_table_name(args, experiment_id)

# Get the corpus and prepare the bag of words generator.
db = DatabaseManager()
subject_ids, corpus, chart_dates = db.get_corpus(
    toy_set=args.toy_set, top100_labels=args.top100_labels,
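
# --- Illustrative sketch (not part of this script) ---
# The step above feeds the corpus to a bag-of-words generator. As a minimal
# sketch of that transformation, assuming whitespace tokenization (the
# helper below and its names are hypothetical, not the repo's
# BagOfWordsGenerator implementation):
from collections import Counter

def bag_of_words_vector(note, vocabulary):
    """Map one note to a vector of token counts over the vocabulary.

    Out-of-vocabulary tokens are dropped.
    """
    index = {word: i for i, word in enumerate(vocabulary)}
    vector = [0] * len(vocabulary)
    for token, count in Counter(note.lower().split()).items():
        if token in index:
            vector[index[token]] = count
    return vector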
args = parser.parse_args()

db = DatabaseManager()
start = datetime.datetime.now()
time_str = start.strftime("%m%d_%H%M%S")
config = vars(args)
experiment_id = db.classifier_experiment_create(
    config, start, 'logistic_regression', args.train_table_name, None,
    args.test_table_name)

log_filename = '{}_logistic_regression.log'.format(experiment_id)
db.classifier_experiment_insert_log_file(experiment_id, log_filename)
logger = logging_utils.build_logger(log_filename).getLogger(
    'logistic_regression')
logger.info('Program start, classifier experiment id = %s', experiment_id)
logger.info(args)

X_train, Y_train = load_X_Y(
    args.train_table_name,
    top100_labels=args.top100_labels,
    normalize_by_npatients=(not args.dont_normalize))
# TODO This is correct, but it would be nicer if we knew the vocabulary
# length beforehand and provided it to load_X_Y().
n_features = X_train.shape[1]
logger.info('X_train, Y_train loaded')

classifiers = train_classifiers(X_train, Y_train)

logger.info('Building result matrix for training set')
number_of_patients_training_set = Y_train[0].shape[0]
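
# --- Illustrative sketch (not part of this script) ---
# train_classifiers() is defined elsewhere in the repo. Given that
# Y_train[0].shape[0] is used above, Y_train appears to be a list of
# per-label binary target arrays; a plausible sketch is one independent
# binary classifier per label, e.g. with scikit-learn (an assumption, not
# the repo's implementation):
from sklearn.linear_model import LogisticRegression

def train_classifiers_sketch(X_train, Y_train):
    """Fit one binary logistic regression per label (one-vs-rest style)."""
    classifiers = []
    for y in Y_train:  # One binary target vector per label.
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, y)
        classifiers.append(clf)
    return classifiers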
args = parser.parse_args()

db = DatabaseManager()
start = datetime.datetime.now()
time_str = start.strftime("%m%d_%H%M%S")
config = vars(args)
experiment_id = db.classifier_experiment_create(
    config, start, 'nnff', args.train_table_name, args.val_table_name,
    args.test_table_name)

log_filename = '{}_nnff.log'.format(experiment_id)
db.classifier_experiment_insert_log_file(experiment_id, log_filename)
logger = logging_utils.build_logger(log_filename).getLogger('feed_forward')
logger.info('Program start, classifier experiment id = %s', experiment_id)
logger.info(args)

X_train, Y_train = tensor_loader.load_X_Y(logger, args.train_table_name,
                                          args.no_gpu)
X_val, Y_val = tensor_loader.load_X_Y(logger, args.val_table_name,
                                      args.no_gpu, validation_set=True)

N, D_in = X_train.shape  # Number of samples, number of features.
if args.top100_labels:
    # Dimensions of the first and second hidden layers, and of the output
    # vector.
    H1, H2, D_out = 1000, 1000, 100
else:
    H1, H2, D_out = 300, 100, 10
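
# --- Illustrative sketch (not part of this script) ---
# A minimal PyTorch model matching the dimensions above: two hidden layers
# of sizes H1 and H2, with a sigmoid output suiting multi-label targets.
# The actual model definition lives elsewhere in the script; the layer
# choices (ReLU, Sigmoid, BCELoss) here are assumptions.
import torch

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H1),
    torch.nn.ReLU(),
    torch.nn.Linear(H1, H2),
    torch.nn.ReLU(),
    torch.nn.Linear(H2, D_out),
    torch.nn.Sigmoid(),
)
loss_fn = torch.nn.BCELoss()  # One independent probability per label.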
                    nargs='?', const=700,
                    help='how many rows to fetch from the corpus table')
parser.add_argument('--top100_labels', action='store_true', default=False)
args = parser.parse_args()

db = DatabaseManager()
start = datetime.datetime.now()
time_str = start.strftime("%m%d_%H%M%S")
config = vars(args)
experiment_id = db.vocabulary_experiment_create(config, start)

log_filename = '{}_vocabulary_generator.log'.format(experiment_id)
db.vocabulary_experiment_insert_log_file(experiment_id, log_filename)
logger = logging_utils.build_logger(log_filename).getLogger(
    'vocabulary_generator')
logger.info('Program start, vocabulary experiment id = %s', experiment_id)
logger.info(config)

_, corpus, _ = db.get_corpus(toy_set=args.toy_set,
                             top100_labels=args.top100_labels)

vocabulary_generator = VocabularyGenerator(corpus, logger)
vocabulary = vocabulary_generator.build_vocabulary()

end = datetime.datetime.now()
db.vocabulary_experiment_insert_vocabulary(experiment_id, end, vocabulary)
logger.info('Vocabulary inserted into database')
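
# --- Illustrative sketch (not part of this script) ---
# VocabularyGenerator.build_vocabulary() is defined elsewhere in the repo.
# A minimal sketch of the usual approach: tokenize every note, count token
# frequencies across the corpus, and keep tokens above a minimum count.
# The tokenization and the min_count threshold are assumptions.
from collections import Counter

def build_vocabulary_sketch(corpus, min_count=5):
    counts = Counter(token for note in corpus
                     for token in note.lower().split())
    return sorted(token for token, count in counts.items()
                  if count >= min_count)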
parser.add_argument('test_table_name')
parser.add_argument('--top100_labels', action='store_true', default=False)
parser.add_argument('--no_gpu', action='store_true', default=False)
args = parser.parse_args()

db = DatabaseManager()
start = datetime.datetime.now()
time_str = start.strftime("%m%d_%H%M%S")
config = vars(args)
experiment_id = db.classifier_experiment_create(
    config, start, 'rnn', args.train_table_name, args.val_table_name,
    args.test_table_name)

log_filename = '{}_rnn.log'.format(experiment_id)
db.classifier_experiment_insert_log_file(experiment_id, log_filename)
logger = logging_utils.build_logger(log_filename).getLogger('rnn')
logger.info('Program start, classifier experiment id = %s', experiment_id)
logger.info(args)

# We can't fit all of the notes into memory, so split the patients into
# chunks. Ensure 1 < (number of patients / total_chunks).
total_chunks = 20  # TODO move to program args.

# Load the first chunk to get the number of input features.
X_train, Y_train = tensor_loader.load_X_Y_rnn(logger, args.train_table_name,
                                              chunk=0,
                                              total_chunks=total_chunks,
                                              no_gpu=args.no_gpu)
X_val, Y_val = tensor_loader.load_X_Y_rnn(logger, args.val_table_name,
                                          chunk=0,
                                          total_chunks=total_chunks,
                                          no_gpu=args.no_gpu,
                                          validation_set=True)

# Number of samples, sequence length, number of features.
N, seq_length, D_in = X_train.shape
if args.top100_labels:
    # Dimension of the hidden units, and dimension of the output vector.
    H, D_out = 1000, 100
else:
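
# --- Illustrative sketch (not part of this script) ---
# A minimal PyTorch RNN classifier matching the (N, seq_length, D_in)
# batches above: an LSTM over the note sequence, with the final hidden
# state mapped to D_out sigmoid outputs for multi-label prediction. The
# actual model definition lives elsewhere; LSTM and sigmoid are assumptions.
import torch

class RNNClassifierSketch(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.lstm = torch.nn.LSTM(D_in, H, batch_first=True)
        self.linear = torch.nn.Linear(H, D_out)

    def forward(self, x):  # x: (N, seq_length, D_in)
        _, (h_n, _) = self.lstm(x)  # h_n: (num_layers, N, H)
        return torch.sigmoid(self.linear(h_n[-1]))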