Example #1
def run_search(search_param, args, params):
    """Train and evaluate a model for one set of parameters during hyperparameter search."""
    # Define unique job name
    job_name = "{}_{}".format(args.hyperparameter, search_param)

    # Set the logger
    set_logger(os.path.join(args.parent_dir, job_name + '.log'))

    # Train the model
    corpus, dictionary, _ = process_data(path=args.train_path,
                                         params=params,
                                         dictionary=None)
    lda = train_lda(corpus, params, dictionary)

    # Save results
    log_results(lda, params, corpus, dictionary, True, args)

    if not args.test_mode:
        # Evaluate the model on dev set
        eval_results(dev_path=args.dev_path,
                     lda=lda,
                     dictionary=dictionary,
                     params=params)
    else:
        # Evaluate the model on the test set
        eval_results(dev_path='data/processed/test/test_200.csv',
                     lda=lda,
                     dictionary=dictionary,
                     params=params)
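
Every example here calls a set_logger helper before training or evaluation so that console output is mirrored into a log file. For reference, a minimal sketch of such a helper, assuming the single-path signature most of these examples use (a few examples, #13, #14, and #17, call a two-argument variant taking a directory and a filename); the actual implementation in each project may differ:

import logging

def set_logger(log_path):
    """Log to the terminal and to the file at log_path (assumed behavior)."""
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        # Write log records to a file
        file_handler = logging.FileHandler(log_path)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        logger.addHandler(file_handler)
        # Echo the same records to the console
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(stream_handler)
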
Example #2
def main():

    # Load the parameters from json file
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    params = Params(json_path)

    # Set the logger 
    set_logger(os.path.join(args.model_dir, 'train.log'))

    # Create the input data pipeline
    logging.info('Creating the dataset...')
    data_dir = args.data_dir
    valid_data_dir = os.path.join(data_dir, 'valid')
    
    # Get the filenames and labels from the validation set
    valid_filenames, valid_labels = get_filenames_and_labels(
        valid_data_dir, params)

    params.valid_size = len(valid_filenames)
    params.num_labels = len(set(valid_labels))

    # Create the iterator over the validation dataset
    valid_inputs = input_fn(False, valid_filenames,
                            valid_labels, params)

    # Define the model
    logging.info("Creating the model...")
    model_spec = model_fn('eval', valid_inputs, params,
                          reuse=False)

    logging.info("Starting evaluation")
    evaluate(model_spec, args.model_dir, params,
             args.restore_from)
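
Most of these examples wrap hyperparameters in a Params object built from params.json. A minimal sketch of such a class, matching how the call sites use it (attribute access like params.num_epochs, an update() that merges dataset_params.json, and a dict view as in params.dict['loss_fn']); the real class in each project may differ:

import json

class Params:
    """Load hyperparameters from a JSON file and expose them as attributes."""

    def __init__(self, json_path):
        self.update(json_path)

    def update(self, json_path):
        # Merge parameters from another JSON file (e.g. dataset_params.json)
        with open(json_path) as f:
            self.__dict__.update(json.load(f))

    @property
    def dict(self):
        # Dict-style access to the parameters, as in params.dict['loss_fn']
        return self.__dict__
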
Example #3
    def __init__(self):
        # Load the parameters
        args = EvaluatePointConfig()
        json_path = os.path.join(args.model_dir, 'params.json')
        assert os.path.isfile(
            json_path), "No json configuration file found at {}".format(
                json_path)
        params = Params(json_path)
        if params.mlp_sizes is None or len(params.mlp_sizes) == 0:
            logging.error(
                'mlp_sizes are not set correctly, at least one MLP layer is required'
            )
        params.dict['loss_fn'] = args.loss_fn

        # Load dataset parameters (sizes, etc.) into params
        json_path = os.path.join(args.data_dir, 'dataset_params.json')
        assert os.path.isfile(
            json_path), "No json file found at {}, run build.py".format(
                json_path)
        params.update(json_path)
        # Set the logger
        set_logger(os.path.join(args.model_dir, 'evaluate.log'))
        # Get paths for tfrecords
        path_eval_tfrecords = os.path.join(args.data_dir,
                                           'test_' + args.tfrecords_filename)
        # Create the input data pipeline
        logging.info("Creating the dataset...")
        eval_dataset = load_dataset_from_tfrecords(path_eval_tfrecords)
        # Create iterator over the test set
        # eval_inputs = input_fn('test', eval_dataset, params)
        eval_inputs = online_input_fn()
        logging.info("- done.")
        # print(type(eval_inputs))

        # Define the model
        logging.info("Creating the model...")
        weak_learner_id = load_best_ndcgs(
            os.path.join(args.model_dir, args.restore_from, 'learner.json'))[0]
        self.model_spec = model_fn('test',
                                   eval_inputs,
                                   params,
                                   reuse=False,
                                   weak_learner_id=int(weak_learner_id))
        # node_names = [n.name for n in tf.get_default_graph().as_graph_def().node]
        # print(node_names)
        logging.info("- done.")
        logging.info("Starting evaluation")
        logging.info("Optimized using {} learners".format(weak_learner_id))
        self.saver = tf.train.Saver()
        self.sess = tf.Session()
        self.params = params
        self.sess.run(self.model_spec['variable_init_op'])
        save_path = os.path.join(args.model_dir, args.restore_from)
        if os.path.isdir(save_path):
            save_path = tf.train.latest_checkpoint(save_path)
        self.saver.restore(self.sess, save_path)
Example #4
def main():

    # Load the parameters from json file
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), 'No json configuration file found at {}'.format(json_path)
    params = Params(json_path)

    # Set the logger 
    set_logger(os.path.join(args.model_dir, 'train.log'))
    
    if not os.path.exists(args.restore_from):
        os.makedirs(args.restore_from)

    # Create the input data pipeline
    logging.info('Creating the datasets...')
    data_dir = args.data_dir
    train_data_dir = os.path.join(data_dir, 'train')
    valid_data_dir = os.path.join(data_dir, 'valid')

    # Get the filenames and labels from the train and valid sets
    train_filenames, train_labels = get_filenames_and_labels(
        train_data_dir, params)
    valid_filenames, valid_labels = get_filenames_and_labels(
        valid_data_dir, params)

    params.train_size = len(train_filenames)
    params.valid_size = len(valid_filenames)
    params.num_labels = len(set(train_labels))

    # Create the two iterators over the two datasets
    train_inputs = input_fn(True, train_filenames,
                            train_labels, params)
    valid_inputs = input_fn(False, valid_filenames,
                            valid_labels, params)

    # Define the model
    logging.info('Creating the model...')
    train_model_spec = model_fn('train', train_inputs,
                                params)
    valid_model_spec = model_fn('eval', valid_inputs,
                                params, reuse=True)
    # Train the model
    logging.info('Starting training for {} epoch(s)'.format(
        params.num_epochs))
    train_and_evaluate(train_model_spec, valid_model_spec,
                       args.model_dir, params, args.restore_from)
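
A note on the reuse flags above: the training graph is built first, and the evaluation graph is built with reuse=True, so the second model_fn call shares the training graph's variables (standard TF1 variable-scope reuse) instead of allocating a second copy of the weights.
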
Example #5
def train():
    # Set the logger
    set_logger(os.path.join(params['model_dir'], 'train.log'))
    # log params
    logging.info(params)

    # Load vocabulary
    vocab = tf.contrib.lookup.index_table_from_file(vocab_path,
                                                    num_oov_buckets=1)

    # Create the input data pipeline
    logging.info('Creating the datasets...')
    train_input_words = load_dataset_from_text(data_dir, train_input_filename,
                                               vocab)
    train_context_words = load_dataset_from_text(data_dir,
                                                 train_context_filename, vocab)

    # Create the iterator over the dataset
    train_inputs = input_fn('train', train_input_words, train_context_words,
                            params)
    eval_inputs = input_fn('eval', train_input_words, train_context_words,
                           params)
    logging.info("- done")

    # Define the model
    logging.info('Creating the model...')
    train_model_spec = model_fn('train',
                                train_inputs,
                                params,
                                reuse=tf.AUTO_REUSE)
    eval_model_spec = model_fn('eval', eval_inputs, params, reuse=True)
    logging.info('- done.')

    # Train the model
    logging.info('Starting training for {} epochs'.format(
        params['num_epochs']))
    normalized_embedding_matrix = train_and_evaluate(train_model_spec,
                                                     eval_model_spec, params)

    save_dict_to_json(params, params['model_dir'] + '/params.json')
    pd.DataFrame(normalized_embedding_matrix).to_csv(os.path.join(
        params['model_dir'], 'normalized_embedding_matrix.tsv'),
                                                     index=False,
                                                     header=None,
                                                     sep='\t')
Example #6
def funct(x):
    # Set the random seed for the whole graph
    tf.set_random_seed(230)

    # Load the parameters
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)

    # Set the logger
    set_logger(os.path.join(args.data_dir, 'predict.log'))

    # Create the input data pipeline
    data_dir = args.data_dir
    test_data_dir = data_dir

    # Get the filename and label for the single test image
    test_filenames = [os.path.join(test_data_dir, 'predict.jpg')]
    test_labels = [x]
    # print(test_labels)

    # Specify the size of the evaluation set
    params.eval_size = len(test_filenames)

    # Create the iterator over the dataset
    test_inputs = input_fn(False, test_filenames, test_labels, params)

    # Define the model
    model_spec = model_fn('eval', test_inputs, params, reuse=tf.AUTO_REUSE)

    # Evaluate the model
    evaluate(model_spec, args.model_dir, params, args.restore_from)
Example #7
def main():
    # Load parameters from json file
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)

    # Use the GPU if available
    params.cuda = torch.cuda.is_available()
    if params.cuda:
        params.device = torch.device('cuda:0')
    else:
        params.device = torch.device('cpu')

    # Set a seed for reproducible experiments
    torch.manual_seed(141)
    if params.cuda:
        torch.cuda.manual_seed(141)

    # Set the training logger for updates
    set_logger(os.path.join(args.model_dir, 'train.log'))

    logging.info("Creating input pipelines...")

    data_pipelines = fetch_pipeline(['train', 'validation'], args.data_dir,
                                    params)
    train_pipeline = data_pipelines['train']
    logging.info("Completed (Training Dataset)!")
    valid_pipeline = data_pipelines['validation']
    logging.info("Completed (Validation Dataset)!")

    logging.info("Building network model...")
    model_spec = model_fn(params)
    logging.info("Building completed!")

    logging.info("Initiate training procedure!")
    train_and_validate(model_spec, train_pipeline, valid_pipeline,
                       args.model_dir, params, args.restore_from)
    logging.info("Training completed!")
Example #8
    def __init__(self, params):
        logging.info("Initializing dataset ...")
        self.dataset_path = params.dataset_path
        self.params = params

        if not os.path.isdir(params.experiment_path):
            os.mkdir(params.experiment_path)
        tf.set_random_seed(100)
        set_logger(os.path.join(params.experiment_path, 'experiment.log'))

        # Get the file paths for the data
        self.get_data_path()

        # Split the train data and verify all data contents
        self.read_and_verify_data()

        # Build tf datasets for each set of inputs
        self.train_dataset = self.build_tf_dataset(self.train_filenames,
                                                   self.train_labels,
                                                   is_training=True)
        self.eval_dataset = self.build_tf_dataset(self.eval_filenames,
                                                  self.eval_labels,
                                                  is_training=False)
        self.test_dataset = self.build_tf_dataset(self.test_filenames,
                                                  self.test_labels,
                                                  is_training=False)

        # Build a single reinitializable dataset iterator
        self.dataset_iterator = tf.data.Iterator.from_structure(
            self.train_dataset.output_types, self.train_dataset.output_shapes)
        self.inputs, self.labels, self.is_training = self.dataset_iterator.get_next()

        # Build init ops for the train, eval, and test datasets
        self.train_init_op = self.dataset_iterator.make_initializer(
            self.train_dataset)
        self.eval_init_op = self.dataset_iterator.make_initializer(
            self.eval_dataset)
        self.test_init_op = self.dataset_iterator.make_initializer(
            self.test_dataset)
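
A note on the pattern above: a single reinitializable iterator is created from the structure of the train dataset, and each init op re-points that same iterator at a different dataset. A sketch of the intended usage, where data stands for an instance of this class (an assumed name, not from the original):

with tf.Session() as sess:
    sess.run(data.train_init_op)   # the shared iterator now yields training batches
    inputs, labels, is_training = sess.run(
        [data.inputs, data.labels, data.is_training])
    sess.run(data.eval_init_op)    # re-point the same iterator at the eval set
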
Example #9
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)
    params.dict['loss_fn'] = args.loss_fn

    # Load dataset parameters (sizes, etc.) into params
    json_path = os.path.join(args.data_dir, 'dataset_params.json')
    assert os.path.isfile(
        json_path), "No json file found at {}, run prepare_data.py".format(
            json_path)
    params.update(json_path)

    # Set the logger
    set_logger(os.path.join(args.model_dir, 'train.log'))

    path_train_tfrecords = os.path.join(args.data_dir,
                                        'train_' + args.tfrecords_filename)
    path_eval_tfrecords = os.path.join(args.data_dir,
                                       'eval_' + args.tfrecords_filename)

    # Create the input data pipeline
    logging.info("Creating the datasets...")
    train_dataset = load_dataset_from_tfrecords(path_train_tfrecords)
    eval_dataset = load_dataset_from_tfrecords(path_eval_tfrecords)

    # Specify other parameters for the dataset and the model

    # Create the two iterators over the two datasets
    train_inputs = input_fn('train', train_dataset, params)
Example #10
    # Load the parameters
    args = parser.parse_args()
    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)
    params.dict['loss_fn'] = args.loss_fn
    params.dict['collect'] = False
    params.dict['use_kfac'] = args.use_kfac
    params.dict['finetune'] = args.finetune
    params.dict['training_keep_prob'] = 1.0

    # Load dataset parameters (sizes, etc.) into params
    json_path = os.path.join(args.data_dir, 'dataset_params.json')
    assert os.path.isfile(json_path), "No json file found at {}, run build.py".format(json_path)
    params.update(json_path)

    # Set the logger
    set_logger(os.path.join(args.model_dir, 'test{}.log'.format(args.log)))

    # Get paths for tfrecords
    dataset = 'test'
    path_eval_tfrecords = os.path.join(args.data_dir, dataset + args.tfrecords_filename)

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    eval_dataset = load_dataset_from_tfrecords(path_eval_tfrecords)

    # Create the iterator over the test set
    eval_inputs = input_fn('test', eval_dataset, params)
    logging.info("- done.")

    # Define the model
    logging.info("Creating the model...")
    # weak_learner_id = load_learner_id(os.path.join(args.model_dir, args.restore_from, 'learner.json'))[0]
    eval_model_spec = model_fn('test', eval_inputs, params, reuse=False)
    # node_names = [n.name for n in tf.get_default_graph().as_graph_def().node]
    # print(node_names)
Example #11
import boto3
import pickle
import cv2
from model.opts import configure_args
from model.utils import set_logger, pre_process
import numpy as np
from model.network_architecture import create_model
import tensorflow as tf

if __name__ == "__main__":
    user_id = 4642
    args = configure_args()
    set_logger('output/train_{}.log'.format(args.name))
    s3 = boto3.client('s3')
    response = s3.get_object(Bucket='cureskin-dataset',
                             Key='new_data/image_{}.pkl'.format(user_id))
    body = response['Body'].read()
    img_frame = pickle.loads(body)

    x, mask = pre_process(img_frame, args)
    print(mask)

    checkpoint_path = 'ckpts/cp-0005.ckpt'
    model = create_model(args)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.SGD(),
                  metrics=['accuracy',
                           tf.keras.metrics.Precision()])
    model.load_weights(checkpoint_path)

    prob = model.predict_on_batch([x, mask])
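
Note that the loss above is built with from_logits=True, so predict_on_batch returns raw logits here rather than probabilities. Assuming the network's final layer has no activation, a sigmoid converts them (a sketch, not part of the original script):

    # Convert logits to probabilities (assumes a linear output layer)
    prob = tf.math.sigmoid(model.predict_on_batch([x, mask]))
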
Example #12
    args = parser.parse_args()
    tf.set_random_seed(233)

    json_path = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "couldn't find the json configuration file at {}".format(
            json_path)
    params = Params(json_path)

    model_dir_has_best_weights = os.path.isdir(
        os.path.join(args.model_dir, "best_weight"))
    overwriting = model_dir_has_best_weights and args.restore_from is None
    assert not overwriting, "weights found in model_dir, aborting to avoid overwrite"

    # Set the logger
    set_logger(os.path.join("../", 'train.log'))

    # Create the input data pipeline
    logging.info("Creating the datasets...")
    data_dir = args.data_dir
    ground_truth_dir = args.ground_truth_dir
    train_data_dir = os.path.join(data_dir, "train")
    dev_data_dir = os.path.join(data_dir, "dev")
    train_masks_dir = os.path.join(ground_truth_dir, "train")
    dev_masks_dir = os.path.join(ground_truth_dir, "dev")

    # Get the filenames from the train and dev sets
    train_filenames = [
        os.path.join(train_data_dir, f) for f in os.listdir(train_data_dir)
        if f.endswith('.jpg')
    ]
Example #13
    help="Whether to download MS or not")
parser.add_argument('--scrap_InChi', default=True,
                    help="Whether to download InChI or not")

args = parser.parse_args()

# Check if the file containing CAS ids exists
assert os.path.isfile(args.cas_list), "No file named {} exists".format(
    args.cas_list)

# Create data directory to store logs and spectra
data_dir = args.save_dir
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

set_logger(data_dir, 'scrap.log')

# Obtain CAS ids used for downloading the content from NIST
logging.info('Loading CAS file')
cas_df = pd.read_csv(args.cas_list,
                     sep='\t',
                     names=['name', 'formula', 'cas'],
                     header=0)
cas_df.dropna(subset=['cas'], inplace=True)
cas_df.cas = cas_df.cas.str.replace('-', '')

cas_ids = list(cas_df.cas)

logging.info('Scrape mass spectra')
if args.scrap_MS:
    params = {'JCAMP': '', 'Index': 0, 'Type': 'Mass'}
Example #14
if __name__ == '__main__':
    # Parse the data from jdx files and store it in csv

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='./data',
                        help="Directory path containing scraped data")
    parser.add_argument('--cas_list', default='species.txt',
                        help="File containing CAS number and smiles of molecules")

    args = parser.parse_args()

    data_dir = args.data_dir
    set_logger(data_dir, 'prepare_data.log')

    # Create bins for IR and mass spectra
    logging.info('Creating bins for standardizing the spectra')
    ir_bins = np.arange(min_ir - eps, max_ir + eps, step_ir)
    mass_bins = np.arange(min_mass - eps, max_mass + eps, step_mass)

    # Compute structures of different molecular groups
    logging.info('Computing the structures of functional groups')
    func_grp_structs = {func_name : Chem.MolFromSmarts(func_smarts)\
                        for func_name, func_smarts in func_grp_smarts.items()}

    # Create and save csv files of spectra
    for root, dirs, files in os.walk(data_dir):
        if root == os.path.join(data_dir, 'ir'):
            logging.info('Starting to parse IR jdx files')
Example #15
        return 1
    else:
        neg_count += 1
        return 0


if __name__ == "__main__":
    args = configure_args()

    if not os.path.exists('output'):
        os.makedirs('output')

    if not os.path.exists('data'):
        os.makedirs('data')

    set_logger('output/train.log')

    fs = s3fs.S3FileSystem()

    bucket_name = 'cureskin-dataset'
    data_key = 'dr_msg_stats.csv'
    data_location = 's3://{}/{}'.format(bucket_name, data_key)

    df_stats = pd.read_csv(data_location)

    s3 = boto3.resource('s3')
    bucket = s3.Bucket('cureskin-dataset')
    df = pd.DataFrame()
    # extract users with doctor's notes
    for user in df_stats['user_id'].unique()[:args.data_size]:
        data_location = 's3://cureskin-dataset/followup_data/user_{0:012}.json'.format(
Example #16
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = Params(json_path)

    # Set the random seed for the whole graph
    tf.set_random_seed(params.seed)

    # Load dataset parameters (sizes, etc.) into params
    json_path = os.path.join(args.data_dir, 'dataset_params.json')
    assert os.path.isfile(
        json_path), "No json file found at {}, run build.py".format(json_path)
    params.update(json_path)
    num_oov_buckets = params.num_oov_buckets  # number of buckets for unknown words

    # Set the logger
    set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Get paths for vocabularies and dataset
    path_vocab = os.path.join(args.data_dir, 'vocab{}'.format(params.min_freq))
    params.vocab_path = path_vocab
    path_test_queries = os.path.join(args.data_dir, 'dev/queries.txt')
    path_test_articles = os.path.join(args.data_dir, 'dev/articles.txt')
    # Load Vocabularies
    vocab = tf.contrib.lookup.index_table_from_file(
        path_vocab, num_oov_buckets=num_oov_buckets, key_column_index=0)

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    test_queries = load_dataset_from_text(path_test_queries, vocab, params)
    test_articles = load_dataset_from_text(path_test_articles, vocab, params)
Example #17
    help="Directory path containing IR and MS spectra data")
parser.add_argument('--restore_ae_from', default=None,
                    help="Restore AE weights before training the model")
parser.add_argument('--restore_mlp_from', default=None,
                    help="Restore MLP weights before training the model")

args = parser.parse_args()

# Model directory should contain a params.json file listing all hyperparameters
json_path = os.path.join(args.model_dir, 'params.json')
assert os.path.isfile(json_path), "No params.json found at {}".format(args.model_dir)

with open(json_path) as json_data:
    params = json.load(json_data)

set_logger(args.model_dir, 'train.log')

logging.info('Load the dataset from {}'.format(args.data_dir))
X, y, func_names = load_dataset(args.data_dir, True, **params['preprocess'])


# Train and test generator for every fold
data_generator = train_test_generator(X, y, params['n_splits'])

train_predictions = []
test_predictions = []

for cv, (train_data, test_data) in enumerate(data_generator):
    logging.info('Starting fold {}'.format(cv+1))
    train_size = train_data[0].shape[0]
    eval_size = test_data[0].shape[0]
Example #18
                                                               y_pred=y_)


def grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, training=True)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)


if __name__ == '__main__':
    # Set the random seed for the whole graph for reproducible experiments
    tf.random.set_seed(123)

    # Set the logger
    cwd = os.getcwd()
    set_logger(os.path.join(cwd, 'train.log'))

    # Create the input data pipeline
    logging.info("Creating the datasets...")

    # For shorter training time, we'll use Caltech101 instead of the ImageNet dataset used in the paper
    data_dir = pathlib.Path(r'C:\Users\K\tensorflow_datasets\caltech101')

    batch_size = 32
    img_height = 256
    img_width = 256

    train_ds = tf.keras.preprocessing.image_dataset_from_directory(
        data_dir,
        label_mode='categorical',
        validation_split=0.2,