def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)

    Y_array = np.array(Y)
    # print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    assert num_examples == num_y_examples

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    train_x, valid_x, train_y, valid_y = train_test_split(X_array, Y_array, test_size=0.2, random_state=18)
    optim = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs),
    )
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
Example #2
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(
        working_dir)

    Y_array = np.array(Y)
    #print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X_array.shape
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    assert num_examples == num_y_examples

    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    train_x, valid_x, train_y, valid_y = train_test_split(X_array,
                                                          Y_array,
                                                          test_size=0.2,
                                                          random_state=18)
    optim = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(working_dir)
    
    X_segments, dimensions = split_entity_data(X_array, feature_alphabet)
    Y_array = np.array(Y)
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)
    
    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)
    
    train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split(X_segments[0], X_segments[1], X_segments[2], Y_array, test_size=0.2, random_state=18)
    train_x = [train_x0, train_x1, train_x2]
    valid_x = [valid_x0, valid_x1, valid_x2]
    
    optim = RandomSearch(lambda: get_random_config(), lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feature_alphabet), num_outputs ) )
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required arguments: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, label_alphabet, X_array, feature_alphabet = ctk_io.read_token_sequence_data(
        working_dir)

    X_segments, dimensions = split_entity_data(X_array, feature_alphabet)
    Y_array = np.array(Y)
    Y_adj, indices = ctk_io.flatten_outputs(Y_array)

    num_outputs = 1 if len(label_alphabet) == 2 else len(label_alphabet)
    num_y_examples = len(Y)

    train_x0, valid_x0, train_x1, valid_x1, train_x2, valid_x2, train_y, valid_y = train_test_split(
        X_segments[0],
        X_segments[1],
        X_segments[2],
        Y_array,
        test_size=0.2,
        random_state=18)
    train_x = [train_x0, train_x1, train_x2]
    valid_x = [valid_x0, valid_x1, valid_x2]

    optim = RandomSearch(
        lambda: get_random_config(),
        lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y,
                                  len(feature_alphabet), num_outputs))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)
def main(args):
    #np.random.seed(1337)
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)
    working_dir = args[0]
    data_file = os.path.join(working_dir, 'training-data.liblinear')

    # learn alphabet from training data
    provider = dataset.DatasetProvider(data_file)
    # now load training examples and labels
    train_x, train_y = provider.load(data_file)
    # turn x and y into numpy array among other things
    maxlen = max([len(seq) for seq in train_x])
    classes = len(set(train_y))

    train_x = pad_sequences(train_x, maxlen=maxlen)
    train_y = to_categorical(np.array(train_y), classes)

    #loading pre-trained embedding file:
    embeddings_index = {}
    f = open(os.path.join(working_dir, 'mimic.txt'))
    values = f.readline().split()
    EMBEDDING_WORDNUM = int(values[0])
    EMBEDDING_DIM = int(values[1])
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('loaded embeddings for %s of %s words.' %
          (len(embeddings_index), EMBEDDING_WORDNUM))

    # prepare embedding matrix
    nb_words = len(provider.word2int)
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in provider.word2int.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:  # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    print('train_x shape:', train_x.shape)
    print('train_y shape:', train_y.shape)

    #train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.1, random_state=18)

    optim = RandomSearch(
        lambda: get_random_config(), lambda x, y: run_one_eval(
            x, y, train_x, train_y, maxlen, len(provider.word2int), classes,
            embedding_matrix, EMBEDDING_DIM))
    best_config = optim.optimize()

    print("Best config: %s" % best_config)

    sys.exit(0)
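# A hedged sketch (not shown in the original script) of how the
# embedding_matrix built above could be consumed: the standard Keras 1.x
# pattern is to seed a frozen Embedding layer with the pre-trained
# vectors, which is presumably what run_one_eval does internally.
def build_embedding_layer(nb_words, embedding_dim, embedding_matrix, maxlen):
    # Hypothetical helper; argument names mirror the variables in main().
    from keras.layers import Embedding
    return Embedding(nb_words,
                     embedding_dim,
                     weights=[embedding_matrix],  # initialize from mimic.txt vectors
                     input_length=maxlen,
                     trainable=False)             # freeze pre-trained weights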
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]
   
    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = ctk_io.read_multitask_token_sequence_data(working_dir)
    start_ind = feature_alphabet[start_symbol]
    end_ind = feature_alphabet[end_symbol]
    
    train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=0.2, random_state=7)

#    X_distance = get_distance_features(X, start_ind, end_ind)
    
    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))
    
    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples
    
    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feature_alphabet)
    
    train_y_adj, train_indices = ctk_io.flatten_outputs(train_y)
    valid_y_adj, valid_indices = ctk_io.flatten_outputs(valid_y)
    if train_indices != valid_indices:
        print("Error: training and valid sets have different index sets -- may be missing some labels in one set or the other")
        sys.exit(-1)
           
    output_dims_list = []
    train_y_list = []
    valid_y_list = []
    indices = train_indices
    for i in range(len(indices)-1):
        label_dims = indices[i+1] - indices[i]
        output_dims_list.append(label_dims)
        if label_dims == 1:
            train_y_list.append(train_y_adj[:, indices[i]])
            valid_y_list.append(valid_y_adj[:, indices[i]])
        else:
            train_y_list.append(train_y_adj[:, indices[i]:indices[i+1]])
            valid_y_list.append(valid_y_adj[:, indices[i]:indices[i+1]])
        
        print("Dimensions of label %d are %s" % (i, str(train_y_list[-1].shape) ) )
    
    ## pass a function to the search that it uses to get a random config
    ## and a function that it will get an eval given (e)pochs and (c)onfig file:
    optim = RandomSearch(lambda: get_random_config(weights), lambda e, c: run_one_eval(e, c, train_x, train_y_list, valid_x, valid_y_list, len(feature_alphabet), output_dims_list, weights ) )
    best_config = optim.optimize(max_iter=27)

    open(os.path.join(working_dir, 'model_0.config'), 'w').write( str(best_config) )
    print("Best config returned by optimizer is %s" % str(best_config) )
def main(args):
    
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory> [(optional) weights file]\n")
        sys.exit(-1)

    working_dir = args[0]
    
    (labels, label_alphabet, feats, feats_alphabet) = ctk_io.read_bio_sequence_data(working_dir)
    
    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feats_alphabet)
        
    maxlen = max([len(seq) for seq in feats])
    all_x = pad_sequences(feats, maxlen=maxlen)
    all_y = ctk_io.expand_labels(pad_sequences(labels, maxlen=maxlen), label_alphabet)

    train_x, valid_x, train_y, valid_y = train_test_split(all_x, all_y, test_size=0.2, random_state=7)
    
    optim = RandomSearch(lambda: get_random_config(weights), lambda x, y: run_one_eval(x, y, train_x, train_y, valid_x, valid_y, len(feats_alphabet), len(label_alphabet), weights ) )
    best_config = optim.optimize()
    
    open(os.path.join(working_dir, 'model_0.config'), 'w').write( str(best_config) )
    print("Best config returned by optimizer is %s" % str(best_config) )
    
    if not best_config['pretrain']:
        weights = None
        
    model = get_model_for_config(train_x.shape, len(feats_alphabet), len(label_alphabet), best_config, weights=weights)

    model.fit(all_x,
            all_y,
            nb_epoch=40,
            batch_size=best_config['batch_size'],
            verbose=1,
            validation_split=0.1)

    model.summary()
    
    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)
    
    fn = open(os.path.join(working_dir, 'alphabets.pkl'), 'w')
    pickle.dump( (feats_alphabet, label_alphabet), fn)
    fn.close()

    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'), 'alphabets.pkl')
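# For completeness, a hedged sketch (not part of the original listing) of
# how the script.model archive written above could be unpacked and the
# model restored at prediction time, using standard Keras 1.x and pickle
# calls:
def load_packaged_model(working_dir):
    from zipfile import ZipFile
    from keras.models import model_from_json
    with ZipFile(os.path.join(working_dir, 'script.model')) as myzip:
        myzip.extractall(working_dir)
    model = model_from_json(
        open(os.path.join(working_dir, 'model_0.json')).read())
    model.load_weights(os.path.join(working_dir, 'model_0.h5'))
    with open(os.path.join(working_dir, 'alphabets.pkl'), 'rb') as fn:
        feats_alphabet, label_alphabet = pickle.load(fn)
    return model, feats_alphabet, label_alphabet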
Example #8
def main(args):
    # NOTE: this snippet begins mid-function in the original listing; a
    # ConfigParser-style header is assumed here to make it self-contained.
    cfg = ConfigParser.ConfigParser()
    cfg.read(args[0])

    base = os.environ['DATA_ROOT']
    train_dir = os.path.join(base, cfg.get('data', 'train'))
    code_file = os.path.join(base, cfg.get('data', 'codes'))

    provider = dataset.DatasetProvider(train_dir,
                                       code_file,
                                       cfg.getint('args', 'min_token_freq'),
                                       cfg.getint('args',
                                                  'max_tokens_in_file'),
                                       cfg.getint('args',
                                                  'min_examples_per_code'),
                                       use_cuis=False)
    x, y = provider.load(tokens_as_set=False)

    maxlen = max([len(seq) for seq in x])
    x = pad_sequences(x, maxlen=maxlen)
    y = np.array(y)

    print('x shape:', x.shape)
    print('y shape:', y.shape)
    print('max seq len:', maxlen)
    print('vocab size:', x.max() + 1)
    print('number of features:', len(provider.token2int))
    print('number of labels:', len(provider.code2int))

    model = CnnCodePredictionModel()
    search = RandomSearch(model, x, y)
    best_config = search.optimize(max_iter=64)
    print('best config:', best_config)
def main(args):

    if len(args) < 1:
        sys.stderr.write(
            "Error - one required argument: <data directory> [(optional) weights file]\n"
        )
        sys.exit(-1)

    working_dir = args[0]

    (labels, label_alphabet, feats,
     feats_alphabet) = ctk_io.read_bio_sequence_data(working_dir)

    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feats_alphabet)

    maxlen = max([len(seq) for seq in feats])
    all_x = pad_sequences(feats, maxlen=maxlen)
    all_y = ctk_io.expand_labels(pad_sequences(labels, maxlen=maxlen),
                                 label_alphabet)

    train_x, valid_x, train_y, valid_y = train_test_split(all_x,
                                                          all_y,
                                                          test_size=0.2,
                                                          random_state=7)

    optim = RandomSearch(
        lambda: get_random_config(weights), lambda x, y: run_one_eval(
            x, y, train_x, train_y, valid_x, valid_y, len(feats_alphabet),
            len(label_alphabet), weights))
    best_config = optim.optimize()

    open(os.path.join(working_dir, 'model_0.config'),
         'w').write(str(best_config))
    print("Best config returned by optimizer is %s" % str(best_config))

    if not best_config['pretrain']:
        weights = None

    model = get_model_for_config(train_x.shape,
                                 len(feats_alphabet),
                                 len(label_alphabet),
                                 best_config,
                                 weights=weights)

    model.fit(all_x,
              all_y,
              nb_epoch=40,
              batch_size=best_config['batch_size'],
              verbose=1,
              validation_split=0.1)

    model.summary()

    json_string = model.to_json()
    open(os.path.join(working_dir, 'model_0.json'), 'w').write(json_string)
    model.save_weights(os.path.join(working_dir, 'model_0.h5'), overwrite=True)

    fn = open(os.path.join(working_dir, 'alphabets.pkl'), 'w')
    pickle.dump((feats_alphabet, label_alphabet), fn)
    fn.close()

    with ZipFile(os.path.join(working_dir, 'script.model'), 'w') as myzip:
        myzip.write(os.path.join(working_dir, 'model_0.json'), 'model_0.json')
        myzip.write(os.path.join(working_dir, 'model_0.h5'), 'model_0.h5')
        myzip.write(os.path.join(working_dir, 'alphabets.pkl'),
                    'alphabets.pkl')
Example #10
def main(args):
    if len(args) < 1:
        sys.stderr.write("Error - one required argument: <data directory>\n")
        sys.exit(-1)

    working_dir = args[0]

    print("Reading data...")
    Y, outcome_map, outcome_list, X, feature_alphabet = ctk_io.read_multitask_token_sequence_data(
        working_dir)
    start_ind = feature_alphabet[start_symbol]
    end_ind = feature_alphabet[end_symbol]

    train_x, valid_x, train_y, valid_y = train_test_split(X,
                                                          Y,
                                                          test_size=0.2,
                                                          random_state=7)

    #    X_distance = get_distance_features(X, start_ind, end_ind)

    print("Shape of X is %s and Y is %s" % (str(X.shape), str(Y.shape)))

    num_examples, dimension = X.shape
    num_y_examples, num_labels = Y.shape
    assert num_examples == num_y_examples

    weights = None
    if len(args) > 1:
        weights = ctk_io.read_embeddings(args[1], feature_alphabet)

    train_y_adj, train_indices = ctk_io.flatten_outputs(train_y)
    valid_y_adj, valid_indices = ctk_io.flatten_outputs(valid_y)
    if train_indices != valid_indices:
        print(
            "Error: training and valid sets have different index sets -- may be missing some labels in one set or the other"
        )
        sys.exit(-1)

    output_dims_list = []
    train_y_list = []
    valid_y_list = []
    indices = train_indices
    for i in range(len(indices) - 1):
        label_dims = indices[i + 1] - indices[i]
        output_dims_list.append(label_dims)
        if label_dims == 1:
            train_y_list.append(train_y_adj[:, indices[i]])
            valid_y_list.append(valid_y_adj[:, indices[i]])
        else:
            train_y_list.append(train_y_adj[:, indices[i]:indices[i + 1]])
            valid_y_list.append(valid_y_adj[:, indices[i]:indices[i + 1]])

        print("Dimensions of label %d are %s" %
              (i, str(train_y_list[-1].shape)))

    ## pass a function to the search that it uses to get a random config
    ## and a function that it will get an eval given (e)pochs and (c)onfig file:
    optim = RandomSearch(
        lambda: get_random_config(weights), lambda e, c: run_one_eval(
            e, c, train_x, train_y_list, valid_x, valid_y_list,
            len(feature_alphabet), output_dims_list, weights))
    best_config = optim.optimize(max_iter=27)

    open(os.path.join(working_dir, 'model_0.config'),
         'w').write(str(best_config))
    print("Best config returned by optimizer is %s" % str(best_config))