Example #1
def predict(net, labels, files, params):
    print('starting inference')
    device = torch.device(params.device)
    predictions = []
    probs = []
    for i, file in enumerate(files):
        filename = os.path.splitext(os.path.basename(file))[0]
        processed = filename + '_proc.wav'
        pre.preprocess(file, processed)
        data = vggish_input.wavfile_to_examples(processed)
        data = torch.from_numpy(data).unsqueeze(1).float()
        data = data.to(device)
        net.to(device)
        out = net(data)
        # # for each spectrogram/row index of max probability
        # pred = np.argmax(out.detach().cpu().numpy(), axis=1)
        # # find most frequent index over all spectrograms
        # consensus = np.bincount(pred).argmax()
        # print('file {} sounds like a {} to me'.format(i, labels[consensus]))
        # mean probabilities for each col/class over all spectrograms
        mean_probs = np.mean(out.detach().cpu().numpy(), axis=0)
        # find index of max mean_probs
        idx = np.argmax(mean_probs)
        print('file {} sounds like a {} to me'.format(i, labels[idx]))
        print('my guesses are: ')
        for j, label in enumerate(labels):
            print('{0}: {1:.04f}'.format(label, mean_probs[j]))
        # predictions.append(labels[consensus])
        predictions.append(labels[idx])
        probs.append(mean_probs)
        os.remove(processed)
    return predictions, probs
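
The comments above sketch two ways to turn per-spectrogram outputs into a single file-level guess: majority vote over per-spectrogram argmaxes versus argmax of the mean class scores (the approach actually used). A minimal standalone sketch with made-up numbers (the 3x2 array below is hypothetical, not from this codebase) shows that the two strategies can disagree:

import numpy as np

# hypothetical scores: 3 spectrograms (rows) x 2 classes (columns)
out_np = np.array([[0.6, 0.4],
                   [0.6, 0.4],
                   [0.1, 0.9]])

# strategy 1: per-spectrogram argmax, then majority vote (the commented-out code)
votes = np.argmax(out_np, axis=1)          # [0, 0, 1]
consensus = np.bincount(votes).argmax()    # 0 -- class 0 wins the vote

# strategy 2: mean score per class, then argmax (the active code)
mean_probs = np.mean(out_np, axis=0)       # [0.433..., 0.566...]
idx = np.argmax(mean_probs)                # 1 -- class 1 wins on average

print(consensus, idx)                      # 0 1
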
Example #2
    def create_multiartifact_sample(self, depthmap):
        depthmaps = np.zeros((240, 180, 5))

        for i, depthmap_path in enumerate(depthmap[0]):
            data, width, height, depth_scale, _max_confidence = preprocessing.load_depth(
                depthmap_path)
            depthmap = preprocessing.prepare_depthmap(data, width, height,
                                                      depth_scale)
            depthmap = preprocessing.preprocess(depthmap)
            depthmaps[:, :, i] = tf.squeeze(depthmap, axis=2)

        depthmaps = tf.stack([depthmaps])
        return depthmaps
Example #3
    def process_depthmaps(self):
        depthmaps = []
        for artifact in self.artifacts:
            input_path = self.get_input_path(self.scan_directory,
                                             artifact['file'])

            data, width, height, depthScale, _max_confidence = preprocessing.load_depth(
                input_path)
            depthmap = preprocessing.prepare_depthmap(data, width, height,
                                                      depthScale)
            depthmap = preprocessing.preprocess(depthmap)
            depthmaps.append(depthmap)

        depthmaps = np.array(depthmaps)
        return depthmaps
Example #4
def main(data, k_folds):
    """Trains a new ML model and uploads it to S3."""
    df = pd.read_csv(data, sep=";")
    df = preprocess(df)

    results = cross_validate_performance(df=df, n=k_folds)
    avg_results = average_evaluation_metrics(results)

    print(f"== Model Evaluation Results (Folds: {k_folds}) ==")
    for metric, value in avg_results.items():
        print(f"{metric.upper()}: {value}")
    print(f"=================================================")

    X, y = Xy_split(df)
    model = WineQualityModel()
    model.fit(X, y)
    model.save(path=S3_PATH)

    return
Example #5
def main(
        df_path: str = '/project/cq-training-1/project1/data/catalog.helios.public.20100101-20160101.pkl',
        image_size: int = 32,
        model: str = 'dummy',
        epochs: int = 20,
        optimizer: str = 'adam',
        lr: float = 1e-4,
        batch_size: int = 100,
        subset_perc: float = 1,
        subset_dates: bool = False,
        saved_model_dir: str = None,
        seq_len: int = 6,
        seed: bool = True,
        scale_label: bool = True,
        use_csky: bool = False,
        cache: bool = True,
        timesteps_minutes: int = 15):

    # Warning if no GPU detected
    if len(tf.config.list_physical_devices('GPU')) == 0:
        logger.warning('No GPU detected, training will run on CPU.')
    elif len(tf.config.list_physical_devices('GPU')) > 1:
        logger.warning(
            'Multiple GPUs detected, training will run on only one GPU.')

    if subset_dates and subset_perc != 1:
        raise Exception(
            f'Invalid configuration. Argument --subset_dates=True and --subset_perc={subset_perc}.'
        )

    # Set random seed
    if seed:
        tf.random.set_seed(SEED)
        np.random.seed(SEED)

    # Load dataframe
    logger.info('Loading and preprocessing dataframe...')
    df = pd.read_pickle(df_path)
    df = preprocessing.preprocess(df, shuffle=False, scale_label=scale_label)
    metadata = data.Metadata(df, scale_label)

    # Pre-crop data
    logger.info('Getting crops...')
    images = data.Images(metadata, image_size)
    # images.crop(dest=SLURM_TMPDIR)
    images.crop(dest=images.shared_storage)

    # Split into train and valid
    if subset_dates:
        metadata_train, metadata_valid = metadata.split_with_dates()
    else:
        metadata, _ = metadata.split(1 - subset_perc)
        metadata_train, metadata_valid = metadata.split(VALID_PERC)
    nb_train_examples = metadata_train.get_number_of_examples()
    nb_valid_examples = metadata_valid.get_number_of_examples()
    logger.info(f'Number of training examples: {nb_train_examples}, '
                f'number of validation examples: {nb_valid_examples}')

    # Create model
    if model == 'dummy':
        model = baselines.DummyModel()
    elif model == 'sunset':
        model = baselines.SunsetModel()
    elif model == 'cnndem':
        model = baselines.ConvDemModel(image_size)
    elif model == 'sunset3d':
        model = baselines.Sunset3DModel()
    elif model == 'convlstm':
        model = baselines.ConvLSTM()
    elif model == 'cnngru':
        model = CnnGru(seq_len)
    elif model == 'cnngruatt':
        model = CnnGruAtt(seq_len)
    elif model == 'cnnlstm':
        model = LSTM_Resnet(seq_len)
    elif model == 'resnet':
        model = baselines.ResNetModel()
    else:
        raise Exception(f'Model "{model}" not recognized.')

    # Load model weights
    if saved_model_dir is not None:
        model.load_weights(os.path.join(saved_model_dir, "model"))

    # Loss and optimizer
    mse = tf.keras.losses.MeanSquaredError()
    if optimizer == 'adam':
        optimizer = tf.keras.optimizers.Adam(lr)
    elif optimizer == 'sgd':
        optimizer = tf.keras.optimizers.SGD(lr)
    else:
        raise Exception(f'Optimizer "{optimizer}" not recognized.')

    # Create data loader
    dataloader_train = SequenceDataset(
        metadata_train,
        images,
        seq_len,
        batch_size,
        timesteps=datetime.timedelta(minutes=timesteps_minutes),
        cache=cache)
    dataloader_valid = SequenceDataset(
        metadata_valid,
        images,
        seq_len,
        batch_size,
        timesteps=datetime.timedelta(minutes=timesteps_minutes),
        cache=cache)

    # Training loop
    logger.info('Training...')
    losses = {'train': [], 'valid': []}
    best_valid_loss = float('inf')
    for epoch in range(epochs):
        train_epoch(model, dataloader_train, batch_size, mse, optimizer,
                    nb_train_examples, scale_label, use_csky)
        test_epoch(model, dataloader_valid, batch_size, mse, nb_valid_examples,
                   scale_label, use_csky)
        train_loss = np.sqrt(train_mse_metric.result().numpy())
        valid_loss = np.sqrt(valid_mse_metric.result().numpy())
        csky_valid_loss = np.sqrt(valid_csky_mse_metric.result().numpy())

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            utils.save_model(model)

        # Logs
        logger.info(f'Epoch {epoch} - Train Loss: {train_loss:.4f}, '
                    f'Valid Loss: {valid_loss:.4f}, '
                    f'Csky Valid Loss: {csky_valid_loss:.4f}')
        losses['train'].append(train_loss)
        losses['valid'].append(valid_loss)
        with train_summary_writer.as_default():
            tf.summary.scalar('loss', train_loss, step=epoch)
        with test_summary_writer.as_default():
            tf.summary.scalar('loss', valid_loss, step=epoch)

    # Plot losses
    plots.plot_loss(losses['train'], losses['valid'], csky_valid_loss)
Example #6
def prepare_dataloader(
    dataframe: pd.DataFrame,
    target_datetimes: typing.List[datetime.datetime],
    stations: typing.Dict[typing.AnyStr, typing.Tuple[float, float, float]],
    target_time_offsets: typing.List[datetime.timedelta],
    config: typing.Dict[typing.AnyStr, typing.Any],
) -> tf.data.Dataset:
    """This function should be modified in order to prepare & return your own data loader.

    Note that you can use either the netCDF or HDF5 data. Each iteration over your data loader should return a
    2-element tuple containing the tensor that should be provided to the model as input, and the target values. In
    this specific case, you will not be able to provide the latter since the dataframe contains no GHI, and we are
    only interested in predictions, not training. Therefore, you must return a placeholder (or ``None``) as the
    second tuple element.

    Reminder: the dataframe contains imagery paths for every possible timestamp requested in ``target_datetimes``.
    However, we expect that you will use some of the "past" imagery (i.e. imagery at T<=0) for any T in
    ``target_datetimes``, but you should NEVER rely on "future" imagery to generate predictions (for T>0). We
    will be inspecting data loader implementations to ensure this is the case, and those who "cheat" will be
    dramatically penalized.

    See https://github.com/mila-iqia/ift6759/tree/master/projects/project1/evaluation.md for more information.

    Args:
        dataframe: a pandas dataframe that provides the netCDF file path (or HDF5 file path and offset) for all
            relevant timestamp values over the test period.
        target_datetimes: a list of timestamps that your data loader should use to provide imagery for your model.
            The ordering of this list is important, as each element corresponds to a sequence of GHI values
            to predict. By definition, the GHI values must be provided for the offsets given by
            ``target_time_offsets`` which are added to each timestamp (T=0) in this datetimes list.
        stations: a map of station names of interest paired with their coordinates (latitude, longitude, elevation).
        target_time_offsets: the list of timedeltas to predict GHIs for (by definition: [T=0, T+1h, T+3h, T+6h]).
        config: configuration dictionary holding any extra parameters that might be required by the user. These
            parameters are loaded automatically if the user provided a JSON file in their submission. Submitting
            such a JSON file is completely optional, and this argument can be ignored if not needed.

    Returns:
        A ``tf.data.Dataset`` object that can be used to produce input tensors for your model. One tensor
        must correspond to one sequence of past imagery data. The tensors must be generated in the order given
        by ``target_sequences``.
    """
    # Things to parse from the json config file
    image_size = config['image_size']
    seq_len = config['seq_len']
    timesteps = datetime.timedelta(minutes=config['timesteps_minutes'])
    scale_label = config['scale_label']

    # Load dataframe
    dataframe = preprocessing.preprocess(dataframe,
                                         shuffle=False,
                                         scale_label=scale_label)
    metadata = data.Metadata(dataframe, scale_label)

    # Build dataloader
    data_loader = EvaluatorDataset(metadata,
                                   image_size=image_size,
                                   seq_len=seq_len,
                                   timesteps=timesteps,
                                   target_datetimes=target_datetimes,
                                   stations=stations,
                                   target_time_offsets=target_time_offsets)

    return data_loader
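
The docstring above fixes the contract for the returned dataset (ordered 2-element tuples with placeholder targets). A minimal usage sketch, not part of the original evaluation code, assuming `model` is any callable Keras-style model that maps a batch of input sequences to predicted GHI sequences:

import numpy as np


def predict_ghi_sequences(model, dataframe, target_datetimes, stations,
                          target_time_offsets, config):
    """Collect one predicted GHI sequence per target datetime, in order."""
    data_loader = prepare_dataloader(dataframe, target_datetimes, stations,
                                     target_time_offsets, config)
    predictions = []
    # each iteration yields (inputs, placeholder target) per the docstring above
    for inputs, _ in data_loader:
        predictions.append(model(inputs).numpy())
    return np.concatenate(predictions, axis=0)
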
Example #7
def train():
    discount_factor = 0.99
    num_episodes = 100000000
    exploration_rate_begin = 1
    exploration_rate_end = 0.1
    exploration_rate = exploration_rate_begin
    exploration_decay = 100000
    render = False
    render_freq = 30
    steps_done = 0
    replay_size_start = 10000

    batch_size = 64
    update_model_freq = 16
    update_target_freq = 1000
    lr = 0.00025
    momentum = 0.95

    # init objects
    buffer = Experience_buffer()
    env = gym.make('FlappyBird-v0')
    model = Model(env.action_space.n)
    # model.apply(Model.weights_init)
    target = Model(env.action_space.n)
    optimizer = optim.RMSprop(params=model.parameters(), lr=lr, momentum=momentum)
    loss = nn.SmoothL1Loss()
    agent = DQNAgent(env, model, target, optimizer, loss, update_target_freq)

    # let's play
    for i in range(num_episodes):

        # start a new episode
        print('Episode #{}'.format(i))
        done = False
        episode_reward = 0
        current_loss = 0
        current_obs = env.reset()
        current_obs = preprocess(current_obs)
        # current_obs = Variable(torch.from_numpy(current_obs).unsqueezed(0), volatile=True)

        if render:
            env.render()

        while not done:
            action = agent.select_action(current_obs, exploration_rate)
            next_obs, reward, done, _ = env.step(action)
            next_obs = preprocess(next_obs)

            if render:
                env.render()

            buffer.add_experience(current_obs, action, reward, next_obs, done)

            # update data
            current_obs = next_obs
            episode_reward += reward
            steps_done += 1
            if steps_done > replay_size_start:
                exploration_rate = exploration_rate_end + (exploration_rate_begin - exploration_rate_end) * math.exp(-1. * steps_done / exploration_decay)

            # if the buffer is filled enough, periodically update the model
            if len(buffer) > batch_size and steps_done % update_model_freq == 0 and steps_done > replay_size_start:
                print('INFO: agent updating...')
                batch = buffer.sample(batch_size)
                current_loss = agent.update(batch, i, discount_factor)
                if i % update_target_freq == 0:
                    agent.update_target()

        # render every render_freq-th episode
        if (i + 1) % render_freq == 0:
            render = True
        else:
            render = False

        print('Episode #{} reward:'.format(i), episode_reward)
        print('Current loss:', current_loss)
        print()
Example #8
    # Model directory
    model_dir = args.result_dir + '/models/' + args.model + '/' + args.dataset + '/' + args.loss + '/BS_' + str(
        args.batch_size) + '/CI_' + str(args.critic_iter) + '/ND_' + str(
            args.noise_dim) + '/L_' + str(args.LAMBDA)

    # Load and prepare data
    if args.dataset.lower() == 'stress_strain':
        dataset = stress_strain.StressStrainDS()
        preproc = preprocessing.standardize
    elif args.dataset.lower() == 'mnist':
        dataset = mnist.MNISTDS()
        preproc = preprocessing.standardize_MNIST
    else:
        raise ValueError(f'Dataset "{args.dataset}" not recognized.')

    train_dataset = dataset.load_dataset()
    train_dataset, scaler = preprocessing.preprocess(train_dataset,
                                                     args.batch_size, preproc)

    INPUT_SHAPE = tuple(
        tf.compat.v1.data.get_output_shapes(train_dataset).as_list()[1:])

    # Instantiate Generator and Discriminator
    generator, discriminator = gans.get_models(args.model, args.loss,
                                               INPUT_SHAPE, args.noise_dim)

    print('\n\n######### GENERATOR #########\n')
    generator.summary()
    print('\n\n####### DISCRIMINATOR #######\n')
    discriminator.summary()

    # Optimizers
    if args.optimizer.lower() == 'adam':
Example #9
def train():

    ctx = [mx.cpu(0), mx.cpu(1), mx.cpu(2), mx.cpu(3)]

    discount_factor = 0.9
    num_episodes = 10000
    exploration_rate_begin = 0.9
    exploration_rate_end = 0.05
    exploration_rate = exploration_rate_begin
    exploration_decay = 200
    render = False
    steps_done = 0

    batch_size = 64
    update_freq = 5
    lr = 0.01

    # init objects
    buffer = Experience_buffer()

    env = gym.make('FlappyBird-v0')

    model = Model(env.action_space.n)
    model.initialize(init=mx.initializer.Xavier(), ctx=ctx)

    optimizer = mx.optimizer.Adam(learning_rate=lr)
    trainer = gluon.Trainer(params=model.collect_params(), optimizer=optimizer)
    loss = gluon.loss.HuberLoss()

    agent = DQNAgent(env, model, trainer, loss)

    # let's play !
    for i in range(num_episodes):

        # begin a new episode
        print('Episode #{}'.format(i))
        done = False
        episode_reward = 0
        current_loss = 0

        current_obs = env.reset()
        current_obs = nd.array(preprocess(current_obs))

        if render:
            env.render()

        while not done:
            action = agent.select_action(current_obs, exploration_rate)
            next_obs, reward, done, _ = env.step(action)
            next_obs = nd.array(preprocess(next_obs))

            if render:
                env.render()

            buffer.add_experience(current_obs, action, reward, next_obs, done)

            if buffer.is_full():
                print('INFO: buffer is full')

            # update information
            current_obs = next_obs
            episode_reward += reward
            steps_done += 1
            exploration_rate = exploration_rate_end + (
                exploration_rate_begin - exploration_rate_end) * math.exp(
                    -1. * steps_done / exploration_decay)

            # if the buffer is filled enough, periodically update the model
            if len(buffer) > batch_size and i % update_freq == 0:
                print('INFO: agent updating...')
                batch = buffer.sample(batch_size)
                current_loss = agent.update(batch, batch_size, i,
                                            discount_factor)
                render = True
            else:
                render = False

        print('Episode #{} reward:'.format(i), episode_reward)
        print('Current loss:', current_loss)
Example #10
  def __read_data__(self):
    scaler = StandardScaler()

    print('loading dataset...')
    #  print(self.root_path)
    #  print(self.data_name)
    data_path = Path(self.root_path)/self.data_name
    pickle_path = Path(self.root_path)/f"{self.data_name}.pandas.pickle"
    #  print(data_path)

    if not pickle_path.exists():
      with pickle_path.open('wb') as p_fd:
        df_raw = pd.read_csv(data_path)
        features = [c for c in df_raw.columns if 'feature' in c]
        print('preprocessing data...')
        df_raw = preprocess(df_raw, self.scale)
        pickle.dump(df_raw, p_fd)

    with pickle_path.open('rb') as p_fd:
      df_pickled = pickle.load(p_fd)

    #  df_pickled.info()

    df_pickled = df_pickled[df_pickled.weight != 0]

    #  df_pickled = df_pickled[df_pickled.date > 399]

    df_pickled = df_pickled[df_pickled.date > 85].reset_index(drop=True)

    print('generate target...')
    resp_cols = [c for c in df_pickled.columns if 'resp' in c]

    #  df_pickled['action'] = ((df_pickled['resp'] > 0) &
    #                          (df_pickled['resp_1'] > 0) &
    #                          (df_pickled['resp_2'] > 0) &
    #                          (df_pickled['resp_3'] > 0) &
    #                          (df_pickled['resp_4'] > 0)).astype('int')
    #  df_pickled['action'] = df_pickled['resp'].copy()
    df_pickled['action'] = df_pickled[resp_cols].sum(axis=1)/len(resp_cols)
    #  df_pickled['action'] = df_pickled.apply(lambda row: row.weight * row.resp, axis='columns')

    #  df_pickled['action_1'] = df_pickled['resp_1']
    #  df_pickled['action_2'] = df_pickled['resp_2']
    #  df_pickled['action_3'] = df_pickled['resp_3']
    #  df_pickled['action_4'] = df_pickled['resp_4']

    #  df_pickled.info()

    print("split train, valid...")
    split_date = 400
    train_df = df_pickled.loc[df_pickled.date <= split_date].reset_index(drop=True)
    valid_df = df_pickled.loc[df_pickled.date > split_date].reset_index(drop=True)

    #  print(train_df)
    #  valid_df['weighted_resp'] = valid_df.apply(lambda row: row.weight * row.resp, axis='columns')
    #  target_cols = ['action', 'action_1', 'action_2', 'action_3', 'action_4']
    #  target_cols = ['action', 'action_1', 'action_2', 'action_3']
    #  target_cols = ['weighted_resp']
    target_cols = ['action']

    if self.scale:
        train_df[target_cols] = scaler.fit_transform(train_df[target_cols].values)
        valid_df[target_cols] = scaler.transform(valid_df[target_cols].values)  # reuse the scaler fitted on the training split


    print('organize values...')


    features = [c for c in train_df.columns if 'feature' in c]
    if self.set_type == 0:
      self.weight = train_df.weight.values
      self.resp   = train_df.resp.values
      self.data_x = train_df[features+target_cols].values
      self.data_y = train_df[features+target_cols].values
      self.data_stamp = train_df.date.values
    elif self.set_type == 1:
      self.weight = valid_df.weight.values
      self.resp   = valid_df.resp.values
      self.data_x = valid_df[features+target_cols].values
      self.data_y = valid_df[features+target_cols].values
      self.data_stamp = valid_df.date.values
Example #11
import random

import numpy as np
from sklearn.model_selection import train_test_split

import utils.config as config
from utils.metrics import show_metrics, show_conf_matrix, dtree_viz
from utils.models import DecisionTree, RandomForest, KNeighbours, ArtificialNeuralNetwork, XGBoost, \
    Results, VotingClassifier
from utils.preprocessing import preprocess

# Eliminate randomness
np.random.seed(1337)
random.seed(1337)

# Take sequence file as a BoW of bytes and make a probability distribution from that
X1, X2, y = preprocess()
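
The comment above describes the idea behind preprocess(); the actual implementation lives in utils.preprocessing and is not shown here. A hedged sketch of the byte-level bag-of-words step it alludes to (the helper name and signature are hypothetical):

def byte_histogram(payload: bytes) -> np.ndarray:
    # count occurrences of each of the 256 possible byte values ...
    counts = np.bincount(np.frombuffer(payload, dtype=np.uint8), minlength=256)
    # ... and normalize the counts into a probability distribution
    return counts / max(counts.sum(), 1)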

# Cast lists to numpy array
X1 = np.array(X1)
X2 = np.array(X2)
y = np.array(y)

# Split train and test data
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, test_size=config.TEST_SIZE)

# Make models and train
models_probability = {
    'dt': DecisionTree,
    'rf': RandomForest,
    'knn': KNeighbours,