Example #1
def convert_test_data(filename, alarm, ticket):
    '''
    Convert the encoded hex alarm values from a prediction CSV into a numpy array.

            Parameters:
                filename: prediction CSV filename
                alarm: alarm file required by the DataProcessor object
                ticket: ticket file required by the DataProcessor object

            Returns:
                Converted data as a numpy array.
    '''
    dp = DataProcessor(alarm, ticket)
    test_alarm_values = []
    with open(filename, encoding='utf8') as csv_file:
        reader = csv.reader(csv_file)
        next(reader)
        for row in reader:
            test_alarm_values.append(row[3])

    result_list = []
    for item in test_alarm_values:
        encoded_alarm = dp.encode_hex_values(item)
        result_list.append(encoded_alarm)

    return dp.convert_array_to_np_array(result_list)
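A hedged usage sketch for the function above; the file names are placeholders and DataProcessor comes from the surrounding project:

test_arrays = convert_test_data('predictions.csv', 'alarms.csv', 'tickets.csv')
print(test_arrays.shape)  # numpy array built from the hex-encoded alarm column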
Example #2
def train(raw, flags):
    data_processor = DataProcessor(flags.forecast_length, flags.batch_size,
                                   flags.window)
    train_loader, val_loader = data_processor.get_train_test_data(
        raw, flags.validation_ratio)

    model = DeepAR(cov_dim=data_processor.num_features,
                   hidden_dim=flags.num_units,
                   num_layers=flags.num_layers,
                   num_class=len(raw['type'].unique()),
                   embedding_dim=flags.embedding_size,
                   batch_first=True,
                   dropout=flags.dropout)

    opt = torch.optim.Adam(model.parameters(), lr=flags.learning_rate)

    teacher_ratio = flags.teacher_ratio
    loss_history = []
    loss_fn = gaussian_likelihood_loss

    model, opt, start_epoch = load_checkpoint(flags.checkpoint_path, model,
                                              opt)
    if start_epoch >= flags.num_epochs:
        print('start_epoch is greater than or equal to num_epochs!')
    epoch = start_epoch
    # TODO: add early stop
    for epoch in range(start_epoch, flags.num_epochs):
        for step, data in enumerate(train_loader):
            avg_loss, _ = _forward(data, model, loss_fn, flags.window,
                                   flags.forecast_length, teacher_ratio)
            loss_history.append(avg_loss)
            opt.zero_grad()
            avg_loss.backward()
            opt.step()
            teacher_ratio *= flags.teacher_ratio_decay
        validation_loss = evaluate(val_loader, model, loss_fn, flags.window,
                                   flags.forecast_length)
        print('Epoch: %d' % epoch)
        print("Training Loss:%.3f" % avg_loss)
        print("Validation Loss:%.3f" % validation_loss)
        print('Teacher_ratio: %.3f' % teacher_ratio)
        print()

    print('Model training completed and saved at %s' % flags.checkpoint_path)
    state = {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': opt.state_dict()
    }
    if not os.path.exists(flags.checkpoint_path):
        os.mkdir(flags.checkpoint_path)
    torch.save(state, flags.checkpoint_path + '/model.pt')
    data_processor.save(flags.checkpoint_path)
    return model, loss_history
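A minimal sketch of the flags object the training loop above expects; the field names are taken from the code, while the values and the SimpleNamespace container are illustrative assumptions:

from types import SimpleNamespace

flags = SimpleNamespace(
    forecast_length=24, batch_size=32, window=168,   # DataProcessor arguments
    num_units=64, num_layers=2, embedding_size=16,   # DeepAR architecture
    dropout=0.1, learning_rate=1e-3,
    teacher_ratio=1.0, teacher_ratio_decay=0.999,
    validation_ratio=0.1, num_epochs=10,
    checkpoint_path='./checkpoints')
# model, loss_history = train(raw, flags)  # `raw` is a frame with a 'type' column, as used above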
Example #3
    def __init__(self, ip):
        self.log = Logger(MAIN_CLIENT_LOG_FILE, D_VERB)
        self.log.info('[MAIN THREAD] Instantiated client')
        self.receiving = False
        self.define_headers()
        self.targets = {}
        self.transmit = Queue.Queue()
        self.data_client = DataClient(self.transmit, ip)
        self.data_processor = DataProcessor(self.transmit, self.headers,
                                            self.targets)
        self.connect(ip)
Example #4
def train(raw, flags):
    data_processor = DataProcessor(flags.forecast_length, flags.batch_size, flags.window)
    train_loader, val_loader = data_processor.get_train_test_data(raw,
                                                                  flags.validation_ratio)

    encoder = EncoderRNN(data_processor.num_features + 1, flags.num_units)
    decoder = DecoderRNN(data_processor.num_features + 1, flags.num_units,
                         output_size=flags.output_dim, batch_first=True)

    def init_weights(m):
        for name, param in m.named_parameters():
            torch.nn.init.uniform_(param.data, -0.1, 0.1)

    encoder.apply(init_weights)
    decoder.apply(init_weights)

    loss_fn = torch.nn.MSELoss()
    # loss_fn = SMAPE()
    model_params = list(encoder.parameters()) + list(decoder.parameters())

    opt = torch.optim.Adam(model_params, lr=flags.learning_rate)

    teacher_ratio = flags.teacher_ratio
    loss_history = []

    # model, opt, start_epoch = load_checkpoint(flags.checkpoint_path, model, opt)
    # if start_epoch >= flags.num_epochs:
    #     print('start_epoch is larger than num_epochs!')
    start_epoch = 0
    epoch = start_epoch
    # TODO: add early stop
    for epoch in range(start_epoch, flags.num_epochs):
        for step, data in enumerate(train_loader):
            avg_loss, _, acc = _forward(data, [encoder, decoder], loss_fn, flags.window,
                                        flags.forecast_length, True, teacher_ratio=teacher_ratio)
            loss_history.append(avg_loss)
            opt.zero_grad()
            avg_loss.backward()
            torch.nn.utils.clip_grad_norm_(model_params, 5.0)
            opt.step()
            teacher_ratio *= flags.teacher_ratio_decay
        val_loss, val_acc = evaluate(val_loader, [encoder, decoder], loss_fn, flags.window,
                                     flags.forecast_length)
        print('Epoch: %d' % epoch)
        print("Training Loss:%.3f" % avg_loss)
        print('Training Avg Accuracy:%.3f' % acc)
        print("Validation Loss:%.3f" % val_loss)
        print("Validation Accuracy:%.3f" % val_acc)
        print('Teacher_ratio: %.3f' % teacher_ratio)
        # print('Model training completed and save at %s' % flags.checkpoint_path)
        # state = {'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': opt.state_dict()}
        # if not os.path.exists(flags.checkpoint_path):
        #     os.mkdir(flags.checkpoint_path)
        # torch.save(state, flags.checkpoint_path + '/model.pt')
        # data_processor.save(flags.checkpoint_path)
        # return model, loss_history
        print('Gradients:%.3f' % torch.mean(
            (torch.stack([torch.mean(torch.abs(p.grad)) for p in model_params], 0))))
        print()
Example #5
def main_execute():

    trading_advisor = TradingAdvisor()

    intervals = [20, 60, 120]
    processor = DataProcessor(intervals)
    subset_df = processor.get_dataframe_subset(500)

    trading_advisor.calc_buy_sell(intervals, subset_df)

    processor.plot(subset_df)
    processor.db.close()
Example #6
def main_execute():

    advisor = Advisor()

    intervals = [60, 120, 720]
    processor = DataProcessor(intervals)
    subset_df = processor.get_dataframe_subset(3500)

    advisor.calc_buy_sell(intervals, subset_df)

    processor.plot(subset_df)
    processor.db.close()
Example #7
    def __init__(self, split, args):
        """
        Initialise the MetaLearningDataset class

        Parameters
        ----------
        split : str
            'train' or 'test'
        args : parsed args
            args passed in to the model
        """
        self.data_dir = "../../../datascience-projects/internal/multitask_learning/processed_data/Streetbees_Mood/Streetbees_Mood_all.csv"
        data = pd.read_csv(self.data_dir, index_col=0)
        self.data_processor = DataProcessor(data_dir=None, data_name='Streetbees_Mood', labels=['0','1'])
        self.K = args.K
        self.num_classes = args.num_classes

        self.baseLM_name = args.model_name.split("-")[0]
        do_lower_case = args.model_name.split("-")[-1] != "cased"
        self.tokenizer = TOKENIZERS[self.baseLM_name].from_pretrained(args.model_name, do_lower_case=do_lower_case)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Pick categories for dev and test set
        ALL_CATEGORIES = np.unique(data['category'])
        TEST_CATEGORIES = ['Healthy', 'Positive', 'Unwell', 'Fine']

        self.tasks = {}
        for category in ALL_CATEGORIES:
            if (split == 'train' and category in TEST_CATEGORIES) or (split == 'test' and category not in TEST_CATEGORIES):
                continue
            # Each category will become a separate task. Explicitly redefine variable below for clarity
            task = category
            pos_examples, neg_examples = self.get_positive_and_negative_examples(data, category=task)
            if task not in self.tasks:
                self.tasks[task] = (pos_examples, neg_examples)

        task_list = []
        task_names = []
        for task in self.tasks:
            pos_examples = self.tasks[task][0]
            neg_examples = self.tasks[task][1]
            if len(pos_examples) < self.K or len(neg_examples) < self.K:
                print('not enough examples', task)
                continue # skip for now if not enough examples
            task_list.append((pos_examples, neg_examples))
            task_names.append(task)

        self.tasks = task_list
        self.task_names = task_names
        self.num_tasks = len(self.tasks)
Example #8
def infer(raw, flags):
    """

    :param raw:
    :param flags:
    :return:
    """
    data_processor = DataProcessor.load(flags.checkpoint_path)
    model = LSTM(data_processor.num_features,
                 flags.num_units,
                 output_dim=flags.output_dim,
                 num_layers=flags.num_layers,
                 batch_first=True,
                 dropout=flags.dropout)
    model, _, _ = load_checkpoint(flags.checkpoint_path, model, None)
    model.eval()
    # predict
    results = {}
    loader, ts_types = data_processor.get_forecast_data(raw)
    with torch.no_grad():
        for ts_type, data in zip(ts_types, loader):
            scale = data[4]
            _, outputs = _infer(data, model, flags.window,
                                flags.forecast_length)
            results[ts_type] = [(output * scale).detach().numpy()[0]
                                for output in outputs]
    print(results)
    return results
Example #9
    def run_k_folds(self, n_runs=5, n_folds=2):
        dp = DataProcessor()
        dp.load('data/SQuAD/squad-v7.file')
        model = LogRegModel()
        model.load_vectors(dp.articles, n_folds=n_folds)

        baseline_results = []
        sentiment_results = []

        for run in range(n_runs):
            print("k-fold run:", run)
            baseline_results.append(model.run_k_fold(with_sentiment=False))
            sentiment_results.append(model.run_k_fold())
            model.create_new_folds(n_folds=n_folds)

        self.save_results(baseline_results, "results/5x2_baseline")
        self.save_results(sentiment_results, "results/5x2_sentiment")
Example #10
def get_data(
        model_args,
        training_args,
        tokenizer,
        text_data_path="../data/test_dataset"):  # 경로 변경 ../data/test_dataset
    """
    get data

    Args:
        model_args: model arguments
        training_args: training arguments
        tokenizer: tokenizer
        text_data_path: Defaults to "../data/test_dataset"

    Returns:
        text_data, val_iter, val_dataset, scores
    """
    text_data = load_from_disk(text_data_path)

    # run elasticsearch
    if "elastic" in model_args.retrieval_type:
        is_sentence_trainformer = False
        if "sentence_trainformer" in model_args.retrieval_type:
            is_sentence_trainformer = True
        # number of text to concat
        concat_num = model_args.retrieval_elastic_num
        text_data, scores = run_elasticsearch(text_data, concat_num,
                                              model_args,
                                              is_sentence_trainformer)
    elif model_args.retrieval_type == "dense":
        concat_num = model_args.retrieval_elastic_num
        text_data, scores = run_concat_dense_retrival(text_data, concat_num)

    column_names = text_data["validation"].column_names

    data_collator = (DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None))
    # tokenize the data (so it can be fed into the MRC model)
    data_processor = DataProcessor(tokenizer)
    val_text = text_data["validation"]
    val_dataset = data_processor.val_tokenzier(val_text, column_names)
    val_iter = DataLoader(val_dataset, collate_fn=data_collator, batch_size=1)

    return text_data, val_iter, val_dataset, scores
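A hedged sketch of how the loader above might be consumed; model_args, training_args and tokenizer are assumed to come from the project's argument parsing and model setup:

text_data, val_iter, val_dataset, scores = get_data(model_args, training_args, tokenizer)
for batch in val_iter:
    input_ids = batch["input_ids"]  # one tokenized validation example per step (batch_size=1)
    break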
Example #11
    def __init__(self, ip):
        self.log = Logger(MAIN_CLIENT_LOG_FILE, D_VERB)
        self.log.info('[MAIN THREAD] Instantiated client')
        self.receiving = False
        self.define_headers()
        self.targets = {}
        self.transmit = Queue.Queue()
        self.data_client = DataClient(self.transmit, ip)
        self.data_processor = DataProcessor(self.transmit, self.headers, self.targets)
        self.connect(ip)
Example #12
def load_rf_data(cur_path):
    data_folder = "data\\titanic"
    processed_data_folder = os.path.join(cur_path, data_folder)
    # Note: Not using test.csv as it does not provide whether or not the passenger survived; therefore we cannot assess
    #       how well the model performed.
    data_file_path = os.path.join(processed_data_folder, "train.csv")
    data = DataProcessor(data_file_path, processed_data_folder)

    try:
        #Try to load data
        data.load_processed_data()
    except FileNotFoundError:
        #No data found, so process it
        # 10% test, 10% validation, 80% training samples from data
        splits = (0.1, 0.1, 0.8)
        # Only use certain columns
        use_cols = (  # 0, #PassengerID
            1,  # Survived
            2,  # Pclass
            # 3, #Name
            4,  # Sex
            5,  # Age
            6,  # SibSp
            7,  # Parch
            # 8, #Ticket
            9,  # Fare
            # 10, #Cabin
            11,  # Embarked
        )
        # Mark features as categorical (so we can one-hot-encode them later)
        # categorical_cols = ()
        categorical_cols = (2,  # Pclass
                            4,  # Sex
                            11  # Embarked
                            )
        # Convert certain columns to float values (so we can use numpy arrays)
        converters = {4: lambda sex: {'male': 0.0, 'female': 1.0}[sex],
                      11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked]}
        data.process_data(splits=splits, use_cols=use_cols, categorical_cols=categorical_cols, converters=converters,
                          filter_missing=True)
    return data
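A hedged usage sketch, assuming the script lives next to a data\titanic\train.csv file as implied by the paths above:

import os

cur_path = os.path.dirname(os.path.abspath(__file__))
titanic_data = load_rf_data(cur_path)  # loads cached processed data, or processes train.csv on the first run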
Example #13
def run_gru(s):

    x_dims = len(x_cols[s.dataSet]) if s.dataSet in x_cols else s.lookback
    random.seed(6)
    np.random.seed(6)
    rnn = Sequential()
    rnn.add(
        GRU(s.nodes,
            input_shape=(None, x_dims),
            kernel_initializer='he_uniform',
            stateful=False))

    #rnn.add(Dropout(0.15))
    rnn.add(Dense(1, kernel_initializer='he_uniform'))
    opt = adam(lr=s.lr, decay=0.0)  #1e-3)
    rnn.compile(loss='mae', optimizer=opt)

    # prepare dataset as pyBrain sequential dataset
    sequence = readDataSet(s.dataSet, s.dataSetDetailed, s)
    if s.limit_to:
        sequence = sequence[:s.limit_to]

    dp = DataProcessor()
    # standardize data by subtracting mean and dividing by std
    #(meanSeq, stdSeq) = dp.normalize('data', sequence)

    dp.windowed_normalize(sequence)

    for key in sequence.keys():
        if key != "data":
            dp.normalize(key, sequence)

    predictedInput = np.zeros((len(sequence), ))
    targetInput = np.zeros((len(sequence), ))
    trueData = np.zeros((len(sequence), ))

    if s.dataSet in differenceSets:
        predictedInputNodiff = np.zeros((len(sequence), ))
        targetInputNodiff = np.zeros((len(sequence), ))

    if s.dataSet in differenceSets:
        backup_sequence = sequence
        sequence = dp.difference(sequence, s.lookback)

    allX = getX(sequence, s)
    allY = np.array(sequence['data'])

    allX = allX[48:]
    allY = allY[48:]
    #if s.dataSet not in x_cols:
    #    allY = allY[s.lookback:]
    trainX = allX[0:s.nTrain]
    trainY = allY[s.predictionStep:s.nTrain + s.predictionStep]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    curBatch = 1.0
    callback = LossCallback()
    temp_set = np.array(sequence['data'])[:48 + s.nTrain + 5]
    configure_batches(48, s.batch_size,
                      np.reshape(temp_set, (temp_set.shape[0], 1, 1)))
    rnn.fit(trainX,
            trainY,
            epochs=s.epochs,
            batch_size=s.batch_size,
            verbose=min(s.max_verbosity, 2),
            callbacks=[callback])
    for i in xrange(0, s.nTrain):
        targetInput[i] = allY[i + s.predictionStep]

    for i in tqdm(xrange(s.nTrain + s.predictionStep, len(allX)),
                  disable=s.max_verbosity == 0):
        if i % s.retrain_interval == 0 and i > s.numLags + s.nTrain and s.online:
            trainX = allX[i - s.nTrain - s.predictionStep:i - s.predictionStep]
            trainY = allY[i - s.nTrain:i]
            trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
            temp_set = np.array(sequence['data'])[i - s.nTrain -
                                                  s.predictionStep - 48:i]
            configure_batches(48, s.batch_size,
                              np.reshape(temp_set, (temp_set.shape[0], 1, 1)))
            rnn.fit(trainX,
                    trainY,
                    epochs=s.epochs,
                    batch_size=s.batch_size,
                    verbose=2,
                    callbacks=[callback])

        targetInput[i] = allY[i + s.predictionStep]
        predictedInput[i] = rnn.predict(np.reshape(allX[i], (1, 1, x_dims)))
        if s.dataSet in differenceSets:
            predictedInputNodiff[i] = predictedInput[i]
            targetInputNodiff[i] = targetInput[i]
            predictedInput[i] = dp.inverse_difference(backup_sequence['data'],
                                                      predictedInput[i], i - 1)
            targetInput[i] = dp.inverse_difference(backup_sequence['data'],
                                                   targetInput[i], i - 1)
        predictedInput[0] = 0
        trueData[i] = sequence['data'][i]

    #predictedInput = dp.denormalize(predictedInput, meanSeq, stdSeq)
    #targetInput = dp.denormalize(targetInput, meanSeq, stdSeq)
    dp.windowed_denormalize(predictedInput, targetInput)
    if s.dataSet in differenceSets:

        # predictedInputNodiff = dp.denormalize(predictedInputNodiff)
        # targetInputNodiff = dp.denormalize(targetInputNodiff)
        pass
    #trueData = (trueData * stdSeq) + meanSeq

    dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru',
                        s.predictionStep, s.max_verbosity)
    skipTrain = error_ignore_first[s.dataSet]
    from plot import computeSquareDeviation
    squareDeviation = computeSquareDeviation(predictedInput, targetInput)
    squareDeviation[:skipTrain] = None
    nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput)
    if s.max_verbosity > 0:
        print "", s.nodes, "NRMSE {}".format(nrmse)
    mae = np.nanmean(np.abs(targetInput - predictedInput))
    if s.max_verbosity > 0:
        print "MAE {}".format(mae)

    if s.dataSet in differenceSets:
        dp.saveResultToFile(s.dataSet, predictedInputNodiff, targetInputNodiff,
                            'gru_nodiff', s.predictionStep, s.max_verbosity)
        squareDeviation = computeSquareDeviation(predictedInputNodiff,
                                                 targetInputNodiff)
        squareDeviation[:skipTrain] = None
        nrmse = np.sqrt(
            np.nanmean(squareDeviation)) / np.nanstd(targetInputNodiff)
        if s.max_verbosity > 0:
            print "", s.nodes, "NRMSE {}".format(nrmse)
        mae = np.nanmean(np.abs(targetInputNodiff - predictedInputNodiff))
        if s.max_verbosity > 0:
            print "MAE {}".format(mae)
        mase = errors.get_mase(predictedInput, targetInput,
                               np.roll(targetInput, 24))
        if s.max_verbosity > 0:
            print "MAE {}".format(mae)
    return nrmse
Example #14
def train(raw, flags):
    data_processor = DataProcessor(flags.forecast_length, flags.batch_size,
                                   flags.window)
    train_loader, val_loader = data_processor.get_train_test_data(
        raw, if_scale=True, val_ratio=flags.validation_ratio)

    model = LSTM(data_processor.num_features,
                 flags.num_units,
                 output_dim=flags.output_dim,
                 num_layers=flags.num_layers,
                 batch_first=True,
                 dropout=flags.dropout)

    if flags.loss == 'mse':
        loss_fn = torch.nn.MSELoss()
    else:
        loss_fn = SMAPE()

    opt = torch.optim.Adam(model.parameters(), lr=flags.learning_rate)

    teacher_ratio = flags.teacher_ratio
    loss_history = []

    model, opt, start_epoch = load_checkpoint(flags.checkpoint_path, model,
                                              opt)
    if start_epoch >= flags.num_epochs:
        print('start_epoch is greater than or equal to num_epochs!')
    epoch = start_epoch
    # TODO: add early stop
    for epoch in range(start_epoch, flags.num_epochs):
        for step, data in enumerate(train_loader):
            avg_loss, _, acc = _forward(data, model, loss_fn, flags.window,
                                        flags.forecast_length, True,
                                        teacher_ratio)
            loss_history.append(avg_loss)
            opt.zero_grad()
            avg_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            opt.step()
            teacher_ratio *= flags.teacher_ratio_decay
        val_loss, val_acc = evaluate(val_loader, model, loss_fn, flags.window,
                                     flags.forecast_length)
        print('Epoch: %d' % epoch)
        print("Training Loss:%.3f" % avg_loss)
        print('Training Avg Accuracy:%.3f' % acc)
        print("Validation Loss:%.3f" % val_loss)
        print("Validation Accuracy:%.3f" % val_acc)
        print('Teacher_ratio: %.3f' % teacher_ratio)
        print('Gradients:%.3f' % torch.mean((torch.stack(
            [torch.mean(torch.abs(p.grad)) for p in model.parameters()], 0))))
        print()

    print('Model training completed and saved at %s' % flags.checkpoint_path)
    state = {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': opt.state_dict()
    }
    if not os.path.exists(flags.checkpoint_path):
        os.mkdir(flags.checkpoint_path)
    torch.save(state, flags.checkpoint_path + '/model.pt')
    data_processor.save(flags.checkpoint_path)
    return model, loss_history
Example #15
def run_gru(s):
    s.print_settings()
    prob = tf.placeholder_with_default(
        1.0, shape=())  #Retain probability for TF dropout

    start_time = timeit.default_timer()

    if s.implementation == "keras":
        if s.use_binary:
            raise Exception("Binary Keras not implemented")

        input = Input(shape=(1, s.x_dims))
        dense1 = Dense(s.nodes, activation='sigmoid')(input)
        dense2 = Dense(s.nodes, activation='sigmoid')(input)
        dense3 = Dense(s.nodes, activation='tanh')(input)
        mult1 = Multiply()([dense2, dense3])
        act1 = Activation('tanh')(mult1)
        mult2 = Multiply()([dense1, act1])
        reshape = Reshape((s.nodes, ))(mult2)
        dropout = Dropout(0.5)(reshape)
        dense_out = Dense(1)(dropout)
        rnn = Model(inputs=[input], outputs=[dense_out])

        opt = adam(lr=s.lr, decay=0.0,
                   epsilon=s.adam_eps)  #, clipvalue=1.)#1e-3)
        #opt = rmsprop(lr=s.lr)
        rnn.compile(loss=s.loss, optimizer=opt)
        if s.max_verbosity > 0:
            print(rnn.summary())

    else:
        raise Exception("Unknown implementation " + s.implementation)

    sequence = readDataSet(s.dataSet, s.dataSetDetailed, s).values

    if s.limit_to:
        sequence = sequence[:s.limit_to]

    #Get rid of unneeded columns
    sequence = sequence[:, 0:s.feature_count]

    #sequence[-1000,0] = 666
    #print "Changed -1000 to 666"
    """
    We need to leave some values unpredicted in front so that
      - We can fill the lookback window for each prediction
      - We can get the value from 1 season earlier for MASE
    --> Don't use the first `front_buffer` values as prediction
    --> Independent from `prediction_step`, so the first actual value predicted is `front_buffer`\
        plus however many steps the `prediction_step` is higher than 1
        In other words, the most recent X-value for the first prediction will be the final value in the `front_buffer`
    """

    first_prediction_index = s.front_buffer + s.predictionStep - 1
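    # Worked example with illustrative numbers (not from the source): with
    # front_buffer = 48 and predictionStep = 5, the first predicted index is
    # 48 + 5 - 1 = 52, so the most recent X-value feeding that prediction is
    # the last value inside the 48-sample front buffer.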

    targetInput = sequence[
        first_prediction_index:,
        0].copy()  #grab this now to avoid having to denormalize

    dp = DataProcessor()
    if s.normalization_type == 'default':
        (meanSeq, stdSeq) = dp.normalize(
            sequence, s.nTrain if s.cutoff_normalize else len(sequence))

    elif s.normalization_type == 'windowed':
        dp.windowed_normalize(sequence, columns=[0])
        if s.feature_count > 1:
            dp.normalize(sequence, s.nTrain, columns=range(1, s.feature_count))
    elif s.normalization_type == 'AN':
        an = AdaptiveNormalizer(s.lookback, s.lookback + s.predictionStep)
        an.set_pruning(False)
        an.set_source_data(sequence, s.nTrain)
        an.do_ma('s')
        an.do_stationary()
        an.remove_outliers()
        seq_norm = an.do_adaptive_normalize()
        print seq_norm.shape
        if s.feature_count > 1:
            dp.normalize(sequence, s.nTrain, columns=range(1, s.feature_count))
            start = sequence.shape[0] - seq_norm.shape[
                0] - s.lookback - s.predictionStep + 1
            for i in range(seq_norm.shape[0]):
                seq_norm[i, :,
                         1:s.feature_count] = sequence[start + i:start + i +
                                                       seq_norm.shape[1],
                                                       1:s.feature_count]
        #an.do_ma('s')
        #an.do_stationary()
        #an.remove_outliers()
        #seq_norm = an.do_adaptive_normalize()
        #print seq_norm[15000,0,0]
        #exit(1)

    else:
        raise Exception("Unsupported normalization type: " +
                        s.normalization_type)

    #seq_actual = sequence[s.front_buffer:] #Leave enough headroom for MASE calculation and lookback
    #seq_full_norm = np.reshape(sequence[:,0], (sequence.shape[0],))
    #seq_actual_norm = seq_full_norm[s.front_buffer:]
    if s.normalization_type != "AN":
        #Default and windowed change the seq itself but still require creating lookback frames
        allX = getX(sequence, s)
        allY = sequence[first_prediction_index:, 0]
    else:
        #AN creates a new array but takes care of lookback internally
        allX = seq_norm[:, 0:-s.predictionStep]
        allY = np.reshape(seq_norm[:, -1, 0], (-1, ))
    predictedInput = np.full((len(allY), ),
                             np.nan)  #Initialize all predictions to NaN
    #print "TESTT", allX[15000,0,1:]
    print "FIRST", allX[875]
    trainX = allX[:s.nTrain]
    trainY = allY[:s.nTrain]
    #print "FIRST", trainX[0], trainY[0]
    trainX = np.reshape(trainX, s.actual_input_shape_train)
    trainY = np.reshape(trainY, s.actual_output_shape_train)

    #print "FIRST", trainX[0], trainY[0]
    if s.implementation == "keras":
        #for _ in tqdm(range(s.epochs)):
        for _ in range(1):
            rnn.fit(
                trainX,
                trainY,
                epochs=s.epochs,
                batch_size=s.batch_size,
                verbose=min(s.max_verbosity, 2),
                shuffle=not s.stateful
            )  #, validation_data=(trainX, trainY), callbacks=[TensorBoard(log_dir='./logs', histogram_freq=1, write_grads=True)])
            if s.stateful:
                rnn_layer.reset_states()

    # for layer in rnn.layers:
    #     print layer.get_weights()
    #for i in xrange(0, s.nTrain + s.predictionStep):
    #   rnn.predict(np.reshape(allX[i], (1, 1, x_dims)))
    #predictedInput[s.nTrain + s.predictionStep : len(allX)] =  rnn.predict(np.reshape(allX[s.nTrain + s.predictionStep : len(allX)], (1, 12510, x_dims)))
    latestStart = None
    do_non_lookback = True
    latest_onego = 0
    #buffer = s.retrain_interval / 2
    buffer = 0
    for i in tqdm(xrange(s.nTrain + s.predictionStep, len(allX)),
                  disable=s.max_verbosity == 0):
        if i % s.retrain_interval == 0 and s.online and i > s.nTrain + s.predictionStep + buffer:
            do_non_lookback = True
            if s.normalization_type == 'AN':
                #print "TEST", seq_norm[15000,0,1]
                predictedInput = np.array(
                    an.do_adaptive_denormalize(
                        predictedInput, therange=(i - s.retrain_interval, i)))
                latestStart = i
                an.set_ignore_first_n(i - s.nTrain - s.predictionStep)
                an.do_ma('s')
                an.do_stationary()
                an.remove_outliers()
                seq_norm = an.do_adaptive_normalize()
                print seq_norm[15000, 0, 0]
                print seq_norm.shape
                #exit(1)
                #print "FIRST", seq_norm[i-s.nTrain -s.predictionStep,0]#, trainY[0]
                #print sequence[start+i-s.nTrain-s.predictionStep:start+
                if s.feature_count > 1:
                    #dp.normalize(sequence, s.nTrain, columns=range(1,s.feature_count))
                    start = sequence.shape[0] - seq_norm.shape[
                        0] - s.lookback - s.predictionStep + 1
                    for j in range(seq_norm.shape[0]):
                        seq_norm[j, :, 1:s.feature_count] = sequence[
                            start + j:start + j + seq_norm.shape[1],
                            1:s.feature_count]
                #print "FIRST", seq_norm[i-s.nTrain -s.predictionStep,0]#, trainY[0]
                allX = seq_norm[:, 0:-s.predictionStep]
                allY = np.reshape(seq_norm[:, -1, 0], (-1, ))

            if s.lookback:
                trainX = allX[i - s.nTrain - s.predictionStep:i -
                              s.predictionStep]
                trainY = allY[i - s.nTrain - s.predictionStep:i -
                              s.predictionStep]
            else:
                trainX = allX[i - s.nTrain - s.predictionStep:i -
                              s.predictionStep]
                trainY = allY[i - s.nTrain - s.predictionStep:i -
                              s.predictionStep]
            #print "TESTT", allX[15000,0,:]
            print "at", i - s.nTrain - s.predictionStep
            print "FIRST", allX[875]  #, trainY[0]
            #exit(1)
            trainX = np.reshape(trainX, s.actual_input_shape_train)
            trainY = np.reshape(trainY, s.actual_output_shape_train)
            #print "FIRST", trainX[0], trainY[0]
            #exit(1)
            if s.implementation == "keras":
                if s.reset_on_retrain:
                    input = Input(shape=(1, s.x_dims))
                    dense1 = Dense(s.nodes, activation='sigmoid')(input)
                    dense2 = Dense(s.nodes, activation='sigmoid')(input)
                    dense3 = Dense(s.nodes, activation='tanh')(input)
                    mult1 = Multiply()([dense2, dense3])
                    act1 = Activation('tanh')(mult1)
                    mult2 = Multiply()([dense1, act1])
                    reshape = Reshape((s.nodes, ))(mult2)
                    dropout = Dropout(0.5)(reshape)
                    dense_out = Dense(1)(dropout)
                    rnn = Model(inputs=[input], outputs=[dense_out])
                    opt = adam(lr=s.lr, decay=0.0,
                               epsilon=s.adam_eps)  # , clipvalue=1.)#1e-3)
                    #opt = rmsprop(lr=s.lr)
                    rnn.compile(loss=s.loss, optimizer=opt)
                for _ in range(1):
                    rnn.fit(trainX,
                            trainY,
                            epochs=s.epochs_retrain
                            if s.epochs_retrain else s.epochs,
                            batch_size=s.batch_size,
                            verbose=2,
                            shuffle=not s.stateful)
                    if s.stateful:
                        rnn_layer.reset_states()

        if s.lookback:
            if s.implementation == "keras":
                predictedInput[i] = rnn.predict(
                    np.reshape(allX[i], s.predict_input_shape))

        elif do_non_lookback:
            do_non_lookback = False
            up_to = min(allX.shape[0], i - (i % s.retrain_interval) +
                        s.retrain_interval) if s.online else allX.shape[0]
            start_time = time.time()
            #print allX[0]
            start = 0 if s.refeed_on_retrain else latest_onego
            new_predicts = rnn.predict(
                np.reshape(allX[start:up_to], (1, -1, s.x_dims)))
            new_predicts = np.reshape(new_predicts, (new_predicts.shape[1], ))
            predictedInput[i:up_to] = new_predicts[-(up_to - i):]
            latest_onego = up_to

    for i in range(s.nTrain + s.predictionStep):
        predictedInput[i] = np.nan

    if s.normalization_type == 'default':
        predictedInput = dp.denormalize(predictedInput, meanSeq[0], stdSeq[0])
    elif s.normalization_type == 'windowed':
        dp.windowed_denormalize(predictedInput, pred_step=s.predictionStep)
    elif s.normalization_type == 'AN':
        if latestStart:
            predictedInput = np.array(
                an.do_adaptive_denormalize(predictedInput,
                                           therange=(latestStart,
                                                     len(predictedInput))))
        else:
            predictedInput = np.array(
                an.do_adaptive_denormalize(predictedInput))
        if an.pruning:
            targetInput = np.delete(targetInput, an.deletes)

    print "Final time", (timeit.default_timer() - start_time)

    #print "Last not to change:", predictedInput[-996], targetInput[-996]
    #print "First to change:", predictedInput[-995], targetInput[-995]
    dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru',
                        s.predictionStep, s.max_verbosity)
    for ignore in s.ignore_for_error:
        skipTrain = ignore
        from plot import computeSquareDeviation
        squareDeviation = computeSquareDeviation(predictedInput, targetInput)
        squareDeviation[:skipTrain] = None
        nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput)
        if s.max_verbosity > 0:
            print "", s.nodes, "NRMSE {}".format(nrmse)
        mae = np.nanmean(np.abs(targetInput - predictedInput))
        if s.max_verbosity > 0:
            print "MAE {}".format(mae)
        mape = errors.get_mape(predictedInput, targetInput, skipTrain)
        if s.max_verbosity > 0:
            print "MAPE {}".format(mape)
        mase = errors.get_mase(predictedInput, targetInput,
                               np.roll(targetInput, s.season), skipTrain)
        if s.max_verbosity > 0:
            print "MASE {}".format(mase)

    return mase
Example #16
def data_preprocess():
    with DataProcessor() as dp:
        line_file_path = os.path.join(DATA_PATH, LINE_FILE)
        conv_file_path = os.path.join(DATA_PATH, CONV_FILE)
        dp.prepare_text_data(line_file_path, conv_file_path, PROCESSED_PATH, TESTSET_SIZE)
        dp.process_data(PROCESSED_PATH, THRESHOLD, UNK_SYM, SOS, EOS)
Example #17
import json
from flask import Flask, abort, request

from db_engine import Engine
from data_processing import DataProcessor

app = Flask(__name__)
engine = Engine()
processor = DataProcessor()


@app.after_request
def build_response(resp):
    resp.headers['Access-Control-Allow-Origin'] = '*'
    resp.headers[
        'Access-Control-Allow-Headers'] = 'Origin, X-Requested-With, Content-Type, Accept'
    resp.headers['Access-Control-Allow-Methods'] = 'GET, POST, DELETE'
    return resp


# Here begins the routing and corresponding calls


@app.route('/scheduleforseason=<int:season>')
def get_schedule(season):
    """ send the league's schedule for a season to front end """
    # validation: converter <int:x> forces all values to ints, rejects otherwise
    # season must be a 4-character year, i.e. 2017
    schedule = []
    try:
        schedule = engine.get_league_schedule_for_season(season)
Example #18
class MetaLearningDataset(Dataset):
    def __init__(self, split, args):
        """
        Initialise the MetaLearningDataset class

        Parameters
        ----------
        split : str
            'train' or 'test'
        args : parsed args
            args passed in to the model
        """
        self.data_dir = "../../../datascience-projects/internal/multitask_learning/processed_data/Streetbees_Mood/Streetbees_Mood_all.csv"
        data = pd.read_csv(self.data_dir, index_col=0)
        self.data_processor = DataProcessor(data_dir=None, data_name='Streetbees_Mood', labels=['0','1'])
        self.K = args.K
        self.num_classes = args.num_classes

        self.baseLM_name = args.model_name.split("-")[0]
        do_lower_case = args.model_name.split("-")[-1] != "cased"
        self.tokenizer = TOKENIZERS[self.baseLM_name].from_pretrained(args.model_name, do_lower_case=do_lower_case)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Pick categories for dev and test set
        ALL_CATEGORIES = np.unique(data['category'])
        TEST_CATEGORIES = ['Healthy', 'Positive', 'Unwell', 'Fine']

        self.tasks = {}
        for category in ALL_CATEGORIES:
            if (split == 'train' and category in TEST_CATEGORIES) or (split == 'test' and category not in TEST_CATEGORIES):
                continue
            # Each category will become a separate task. Explicitly redefine variable below for clarity
            task = category
            pos_examples, neg_examples = self.get_positive_and_negative_examples(data, category=task)
            if task not in self.tasks:
                self.tasks[task] = (pos_examples, neg_examples)

        task_list = []
        task_names = []
        for task in self.tasks:
            pos_examples = self.tasks[task][0]
            neg_examples = self.tasks[task][1]
            if len(pos_examples) < self.K or len(neg_examples) < self.K:
                print('not enough examples', task)
                continue # skip for now if not enough examples
            task_list.append((pos_examples, neg_examples))
            task_names.append(task)

        self.tasks = task_list
        self.task_names = task_names
        self.num_tasks = len(self.tasks)

    @staticmethod
    def get_positive_and_negative_examples(data, category):
        positive_examples = data[(data['category'] == category) & (data['label'] == 1)]
        negative_examples = data[(data['category'] == category) & (data['label'] == 0)]
        return positive_examples.drop(columns='category'), negative_examples.drop(columns='category')

    def __getitem__(self, task_index):
        # choose the task indicated by index
        pos_examples, neg_examples = self.tasks[task_index]

        # for now just choose randomly among examples
        pos_indices = np.random.choice(range(len(pos_examples)), size=self.K)
        neg_indices = np.random.choice(range(len(neg_examples)), size=self.K)

        # # interleave randomly - DIFFERENT FROM RANDOMLY SHUFFLING
        # examples = np.empty((self.K*2, 2), dtype=pos.dtype)
        # if np.random.uniform() > .5:
        #     examples[0::2, :] = pos
        #     examples[1::2, :] = neg
        # else:
        #     examples[0::2, :] = neg
        #     examples[1::2, :] = pos

        # Randomly shuffle positive and negative examples for now
        all_examples = pd.concat([pos_examples.iloc[pos_indices, :],
                                  neg_examples.iloc[neg_indices, :]]).sample(frac=1)
        train_examples = self.data_processor.get_examples(input_df=all_examples.iloc[:self.K, :], set_type='train')
        test_examples = self.data_processor.get_examples(input_df=all_examples.iloc[self.K:, :], set_type='test')

        train_features = convert_examples_to_features(train_examples, label_list=['0','1'], max_seq_length=128,
                                                      tokenizer=self.tokenizer, task_name='Streetbees_Mood',
                                                      model_name=self.baseLM_name, do_logging=False)
        all_input_ids = torch.tensor([feature.input_ids for feature in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([feature.segment_ids for feature in train_features], dtype=torch.long)
        all_input_masks = torch.tensor([feature.input_mask for feature in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([feature.label_id for feature in train_features], dtype=torch.long)
        task_train_data = {'input_ids': all_input_ids, 'segment_ids': all_segment_ids,
                           'input_masks':  all_input_masks, 'label_ids': all_label_ids}

        test_features = convert_examples_to_features(test_examples, label_list=['0','1'], max_seq_length=128,
                                                      tokenizer=self.tokenizer, task_name='Streetbees_Mood',
                                                      model_name=self.baseLM_name, do_logging=False)
        all_input_ids = torch.tensor([feature.input_ids for feature in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([feature.segment_ids for feature in test_features], dtype=torch.long)
        all_input_masks = torch.tensor([feature.input_mask for feature in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([feature.label_id for feature in test_features], dtype=torch.long)
        task_test_data = {'input_ids': all_input_ids, 'segment_ids': all_segment_ids,
                          'input_masks':  all_input_masks, 'label_ids': all_label_ids}

        return task_train_data, task_test_data

    def __len__(self):
        return len(self.tasks)
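A minimal sketch of wrapping the dataset above for episodic training; `args` is assumed to be the parsed arguments the constructor expects, and everything else is illustrative:

from torch.utils.data import DataLoader

meta_train_set = MetaLearningDataset(split='train', args=args)
meta_loader = DataLoader(meta_train_set, batch_size=1, shuffle=True)  # one task (episode) per batch
for task_train_data, task_test_data in meta_loader:
    support_ids = task_train_data['input_ids']  # K support examples for the sampled task
    query_ids = task_test_data['input_ids']     # K query examples for the same task
    break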
Example #19
def get_data(data_args, training_args, tokenizer):
    '''Return the DataLoaders and Datasets for train and validation.'''
    if data_args.dataset_name == 'basic':
        if os.path.isdir("../data/train_dataset"):
            dataset = load_from_disk("../data/train_dataset")
        else:
            raise Exception("Set the data path to 'p3-mrc-team-ikyo/data/.'")
    elif data_args.dataset_name == 'preprocessed':
        if os.path.isfile("../data/preprocess_train.pkl"):
            dataset = get_pickle("../data/preprocess_train.pkl")
        else:
            dataset = make_custom_dataset("../data/preprocess_train.pkl")
    elif data_args.dataset_name == 'concat':
        if os.path.isfile("../data/concat_train.pkl"):
            dataset = get_pickle("../data/concat_train.pkl")
        else:
            dataset = make_custom_dataset("../data/concat_train.pkl")
    elif data_args.dataset_name == 'korquad':
        if os.path.isfile("../data/korquad_train.pkl"):
            dataset = get_pickle("../data/korquad_train.pkl")
        else:
            dataset = make_custom_dataset("../data/korquad_train.pkl")
    elif data_args.dataset_name == "question_type":
        if os.path.isfile("../data/question_type.pkl"):
            dataset = get_pickle("../data/question_type.pkl")
        else:
            dataset = make_custom_dataset("../data/question_type.pkl")
    elif data_args.dataset_name == "ai_hub":
        if os.path.isfile("../data/ai_hub_dataset.pkl"):
            dataset = get_pickle("../data/ai_hub_dataset.pkl")
        else:
            dataset = make_custom_dataset("../data/ai_hub_dataset.pkl")
    elif data_args.dataset_name == "only_korquad":
        dataset = load_dataset("squad_kor_v1")
    elif data_args.dataset_name == "random_masking":
        if os.path.isfile("../data/random_mask_train.pkl"):
            dataset = get_pickle("../data/random_mask_train.pkl")
        else:
            dataset = make_custom_dataset("../data/random_mask_train.pkl")
    elif data_args.dataset_name == "token_masking":
        if os.path.isfile("../data/concat_token_mask_top_3.pkl"):
            dataset = get_pickle("../data/concat_token_mask_top_3.pkl")
        else:
            dataset = make_mask_dataset("../data/concat_token_mask_top_3.pkl",
                                        tokenizer)
        train_dataset = dataset['train']
        val_dataset = dataset['validation']
    else:
        raise Exception(
            "dataset_name have to be one of ['basic', 'preprocessed', 'concat', 'korquad', 'only_korquad', 'question_type', 'ai_hub', 'random_masking', 'token_masking']"
        )

    if data_args.dataset_name != "token_masking":
        train_dataset = dataset['train']
        val_dataset = dataset['validation']
        train_column_names = train_dataset.column_names
        val_column_names = val_dataset.column_names

        data_processor = DataProcessor(tokenizer, data_args.max_seq_length,
                                       data_args.doc_stride)
        train_dataset = data_processor.train_tokenizer(train_dataset,
                                                       train_column_names)
        val_dataset = data_processor.val_tokenzier(val_dataset,
                                                   val_column_names)

    data_collator = (DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None))
    train_iter = DataLoader(
        train_dataset,
        collate_fn=data_collator,
        batch_size=training_args.per_device_train_batch_size)
    val_iter = DataLoader(val_dataset,
                          collate_fn=data_collator,
                          batch_size=training_args.per_device_eval_batch_size)

    return dataset, train_iter, val_iter, train_dataset, val_dataset
Example #20
def run_gru(s):
    global global_step
    global increment_global_step_op
    global reset_global_step_op
    global batches
    global images_placeholder
    global batches_op
    global_step = tf.Variable(0,
                              name='global_step',
                              trainable=False,
                              dtype=tf.int32)
    increment_global_step_op = tf.assign(global_step, global_step + 1)
    reset_global_step_op = tf.assign(global_step, 0)

    batches = tf.get_variable(
        "batches", [s.nTrain / int(s.batch_size), s.batch_size, 1, 1],
        dtype=tf.float32,
        initializer=tf.zeros_initializer)
    images_placeholder = tf.placeholder(tf.float32,
                                        shape=(s.nTrain / int(s.batch_size),
                                               s.batch_size, 1, 1))
    batches_op = tf.assign(batches, images_placeholder)

    x_dims = len(x_cols[s.dataSet]) if s.dataSet in x_cols else s.lookback
    random.seed(6)
    np.random.seed(6)
    rnn = Sequential()
    rnn.add(
        GRU(s.nodes,
            input_shape=(None, x_dims),
            kernel_initializer='he_uniform',
            stateful=False))

    #rnn.add(Dropout(0.15))
    rnn.add(Dense(1, kernel_initializer='he_uniform'))
    opt = adam(lr=s.lr, decay=0.0)  #1e-3)
    rnn.compile(loss='mae', optimizer=opt)

    # prepare dataset as pyBrain sequential dataset
    sequence = readDataSet(s.dataSet, s.dataSetDetailed, s)
    if s.limit_to:
        sequence = sequence[:s.limit_to]

    dp = DataProcessor()
    # standardize data by subtracting mean and dividing by std
    (meanSeq, stdSeq) = dp.normalize('data', sequence, s.nTrain)

    #dp.windowed_normalize(sequence)

    for key in sequence.keys():
        if key != "data":
            dp.normalize(key, sequence, s.nTrain)

    if s.dataSet in differenceSets:
        predictedInputNodiff = np.zeros((len(sequence), ))
        targetInputNodiff = np.zeros((len(sequence), ))

    if s.dataSet in differenceSets:
        backup_sequence = sequence
        sequence = dp.difference(sequence, s.lookback)

    seq_full = sequence['data'].values
    seq_actual = seq_full[s.front_buffer:]
    allX = getX(seq_full, s)
    allY = seq_actual[s.predictionStep - 1:]
    predictedInput = np.full((len(allY), ), np.nan)

    #if s.dataSet not in x_cols:
    #    allY = allY[s.lookback:]
    trainX = allX[:s.nTrain]
    trainY = allY[:s.nTrain]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    rnn.fit(trainX,
            trainY,
            epochs=s.epochs,
            batch_size=s.batch_size,
            verbose=min(s.max_verbosity, 2))
    #for i in xrange(0,s.nTrain):
    #    targetInput[i] = allY[i+s.predictionStep]
    targetInput = allY
    pred_diffs = []
    pred_closer_to_actual = []
    isFirst = True
    for i in tqdm(xrange(s.nTrain + s.predictionStep, len(allX)),
                  disable=s.max_verbosity == 0):
        #for i in tqdm(xrange(0, len(allX)), disable=s.max_verbosity == 0):
        if i % s.retrain_interval == 0 and i > s.numLags + s.nTrain and s.online:
            trainX = allX[i - s.nTrain - s.predictionStep:i - s.predictionStep]
            trainY = allY[i - s.nTrain - s.predictionStep:i - s.predictionStep]
            trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
            rnn.fit(trainX,
                    trainY,
                    epochs=s.epochs,
                    batch_size=s.batch_size,
                    verbose=0)

        #targetInput[i] = allY[i]
        predictedInput[i] = rnn.predict(np.reshape(allX[i], (1, 1, x_dims)))
        if isFirst:
            print predictedInput[i]
            isFirst = False
        #predictedInput[i] = targetInput[i-1440]
        pred_diffs.append(abs(predictedInput[i] - allX[i][-1]))
        pred_closer_to_actual.append(
            abs(predictedInput[i] - targetInput[i]) < abs(predictedInput[i] -
                                                          allX[i][-1]))

        if s.dataSet in differenceSets:
            predictedInputNodiff[i] = predictedInput[i]
            targetInputNodiff[i] = targetInput[i]
            predictedInput[i] = dp.inverse_difference(backup_sequence['data'],
                                                      predictedInput[i], i - 1)
            targetInput[i] = dp.inverse_difference(backup_sequence['data'],
                                                   targetInput[i], i - 1)
    for i in range(s.nTrain + s.predictionStep):
        predictedInput[i] = np.nan
    predictedInput = dp.denormalize(predictedInput, meanSeq, stdSeq)
    targetInput = dp.denormalize(targetInput, meanSeq, stdSeq)
    #dp.windowed_denormalize(predictedInput, targetInput)
    print "FINAL", predictedInput[-1], targetInput[-1], len(
        predictedInput), len(targetInput)
    if s.dataSet in differenceSets:

        # predictedInputNodiff = dp.denormalize(predictedInputNodiff)
        # targetInputNodiff = dp.denormalize(targetInputNodiff)
        pass
    dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru',
                        s.predictionStep, s.max_verbosity)
    skipTrain = error_ignore_first[s.dataSet]
    from plot import computeSquareDeviation
    squareDeviation = computeSquareDeviation(predictedInput, targetInput)
    squareDeviation[:skipTrain] = None
    nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput)
    if s.max_verbosity > 0:
        print "", s.nodes, "NRMSE {}".format(nrmse)
    mae = np.nanmean(np.abs(targetInput - predictedInput))
    if s.max_verbosity > 0:
        print "MAE {}".format(mae)
    mase = errors.get_mase(predictedInput, targetInput,
                           np.roll(targetInput, s.season))
    if s.max_verbosity > 0:
        print "MASE {}".format(mase)
    if s.dataSet in differenceSets:
        dp.saveResultToFile(s.dataSet, predictedInputNodiff, targetInputNodiff,
                            'gru_nodiff', s.predictionStep, s.max_verbosity)
        squareDeviation = computeSquareDeviation(predictedInputNodiff,
                                                 targetInputNodiff)
        squareDeviation[:skipTrain] = None
        nrmse = np.sqrt(
            np.nanmean(squareDeviation)) / np.nanstd(targetInputNodiff)
        if s.max_verbosity > 0:
            print "", s.nodes, "NRMSE {}".format(nrmse)
        mae = np.nanmean(np.abs(targetInputNodiff - predictedInputNodiff))
        if s.max_verbosity > 0:
            print "MAE {}".format(mae)
    closer_rate = pred_closer_to_actual.count(True) / float(
        len(pred_closer_to_actual))
    if s.max_verbosity > 0:
        pred_diffs.sort()
        print pred_diffs[0], pred_diffs[-1], pred_diffs[int(0.9 *
                                                            len(pred_diffs))]
        print "Good results:", closer_rate
    return mase, closer_rate
Example #21
    def __init__(self):
        self.engine = Engine()
        self.processor = DataProcessor()
Example #22
    sensor = model._getSensorRegion()
    encoderList = sensor.getSelf().encoder.getEncoderList()
    if sensor.getSelf().disabledEncoder is not None:
        classifier_encoder = sensor.getSelf().disabledEncoder.getEncoderList()
        classifier_encoder = classifier_encoder[0]
    else:
        classifier_encoder = None

    print "Load dataset: ", dataSet
    skips = data_skips[dataSet]
    df = pd.read_csv(inputData, header=0, skiprows=skips)
    df = preprocess(df)
    if limit_to:
        df = df[:limit_to]

    dp = DataProcessor()
    #dp.windowed_normalize(df, field_name=predictedField, is_data_field=True)

    print " run SP through the first %i samples %i passes " % (nMultiplePass,
                                                               nTrain)
    model = runMultiplePassSPonly(df, model, nMultiplePass, nTrain)
    model._spLearningEnabled = False

    maxBucket = classifier_encoder.n - classifier_encoder.w + 1
    likelihoodsVecAll = np.zeros((maxBucket, len(df)))

    prediction_nstep = None
    time_step = []
    actual_data = []
    patternNZ_track = []
    predict_data = np.zeros((_options.stepsAhead, 0))
Example #23
class LightClient(object):
    def __init__(self, ip):
        self.log = Logger(MAIN_CLIENT_LOG_FILE, D_VERB)
        self.log.info('[MAIN THREAD] Instantiated client')
        self.receiving = False
        self.define_headers()
        self.targets = {}
        self.transmit = Queue.Queue()
        self.data_client = DataClient(self.transmit, ip)
        self.data_processor = DataProcessor(self.transmit, self.headers, self.targets)
        self.connect(ip)

    def connect(self, ip):
        self.soc_ctrl = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.soc_ctrl.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        my_ip = socket.gethostbyname('')
        self.log.debug('[MAIN THREAD] connecting...')
        self.soc_ctrl.connect((ip,SOC_PORT_CTRL))
        self.log.info('[MAIN THREAD] Client connected to server')

    def disconnect(self):
        ### data processor should not be here
        self.data_processor.stop()
        self.soc_ctrl.close()

    def define_headers(self):
        head = {}
        head['process'] = PROC_CPU_DATA + PROC_MEM_DATA + TIMESTAMPS
        head['system']  = SYS_CPU_OTHER + LOAD_AVG + SYS_CPU_DATA + SYS_MEM_DATA + TIMESTAMPS
        self.headers = head

    def add_target(self, target, name):
        if target in self.targets:
            self.targets[target].append(name)
        else:
            self.targets[target]=[name]

    def remove_target(self, target, name):
        if target in self.targets:
            if name in self.targets[target]:
                self.targets[target].remove(name)
                self.log.info('[MAIN THREAD] Removed {} named {}'.format(target, name))
            else:
                self.log.error('[MAIN THREAD] Asked to remove {} named {} while not recorded'.format(target, name))
        else:
            self.log.error('[MAIN THREAD] Asked to remove {} named {} while not recorded'.format(target, name))

    def start_record(self, target, name):
        self.log.debug('[MAIN THREAD] Asking server to start recording')
        msg = MSG_SEP.join([START_RECORD, target, name])
        answer = send_data(self.soc_ctrl,msg)
        self.log.info('[MAIN THREAD] Server asked to start recording')
        if answer == SYNC:
            self.add_target(target, name)
            self.log.info('[MAIN THREAD] Added {} named {}'.format(target, name))
        else:
            self.log.warn('[MAIN THREAD] Could not add {} named {} because of server answer'.format(target, name))

    def stop_record(self, target, name):
        self.log.debug('[MAIN THREAD] Asking server to stop recording')
        msg = MSG_SEP.join([STOP_RECORD, target, name])
        answer = send_data(self.soc_ctrl,msg)
        self.log.info('[MAIN THREAD] Server asked to stop recording {}'.format(name))
        if answer == SYNC:
            self.remove_target(target, name)
        else:
            self.log.warn('[MAIN THREAD] Could not remove {} named {} because of server answer'.format(target, name))

    def start_receive(self):
        if not self.receiving:
            self.receiving = True
            self.log.debug('[MAIN THREAD] Asking server to start sending')
            status = send_data(self.soc_ctrl,START_SEND)
            self.log.info('[MAIN THREAD] Server asked to start sending')
            if status == FAIL:
                self.log.error('[MAIN THREAD] Client tried to receive but server denied it')
            else:
                print status
                self.data_client.start()
                self.log.info('[MAIN THREAD] Client is receiving')
            self.log.debug("[MAIN THREAD] DATA THREAD started")
        else:
            self.log.warn("[MAIN THREAD] Asked to start receiving while already receiving")

    def stop_receive(self):
        if self.receiving:
            self.log.debug('[MAIN THREAD] Closing data channel. Exiting data client thread')
            self.data_client.stop()
            self.log.info("[MAIN THREAD] Asked server to stop receiving")
            self.receiving = False
            send_data(self.soc_ctrl,STOP_SEND)
        else:
            self.log.warn("[MAIN THREAD] Asked to stop receiving while already receiving")

    def start_store(self, dirname='easy_client'):
        return self.data_processor.start_store(dirname)

    def stop_store(self):
        self.data_processor.stop_store()

    def start_print(self):
        self.data_processor.start_print()

    def stop_print(self):
        self.printing = self.data_processor.stop_print()

    def stop_process(self):
        self.stop_print()
        self.stop_store()
        self.data_processor.stop()
        self.stop_receive()
        self.soc_ctrl.close()


    def stop_all(self):
        self.stop_process()
        send_data(self.soc_ctrl, STOP_ALL)
Ejemplo n.º 24
0
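      # Note: sklearn's confusion_matrix expects (y_true, y_pred); passing the
      # model's predictions first transposes rows/columns relative to the usual
      # convention.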
      result["conf_matrices"].append(
        confusion_matrix(
          self.model.predict(test_vectors),
          test_targets
      ))

      result["roc_curves"].append(
        roc_curve(
          test_targets, 
          self.model.predict_proba(test_vectors)[:,1])
      )

      result["roc_auc_scores"].append(
        roc_auc_score(
          test_targets, 
          self.model.predict(test_vectors))
      )
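      # Note: roc_auc_score above is fed hard predict() labels; probability
      # scores from predict_proba (as used for roc_curve) are the more common
      # input for AUC.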

      result["coefficients"].append(
        self.model.coef_[0]
      )

    return result
        
if __name__ == "__main__":
  dp = DataProcessor()
  dp.load('data/SQuAD/squad-v7.file')
  model = LogRegModel()
  model.load_vectors(dp.articles)
  model.run_k_fold()
Ejemplo n.º 25
0
class LightClient(object):
    def __init__(self, ip):
        self.log = Logger(MAIN_CLIENT_LOG_FILE, D_VERB)
        self.log.info('[MAIN THREAD] Instantiated client')
        self.receiving = False
        self.define_headers()
        self.targets = {}
        self.transmit = Queue.Queue()
        self.data_client = DataClient(self.transmit, ip)
        self.data_processor = DataProcessor(self.transmit, self.headers,
                                            self.targets)
        self.connect(ip)

    def connect(self, ip):
        self.soc_ctrl = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.soc_ctrl.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        my_ip = socket.gethostbyname('')
        self.log.debug('[MAIN THREAD] connecting...')
        self.soc_ctrl.connect((ip, SOC_PORT_CTRL))
        self.log.info('[MAIN THREAD] Client connected to server')

    def disconnect(self):
        ### data processor should not be here
        self.data_processor.stop()
        self.soc_ctrl.close()

    def define_headers(self):
        head = {}
        head['process'] = PROC_CPU_DATA + PROC_MEM_DATA + TIMESTAMPS
        head['system'] = SYS_CPU_OTHER + LOAD_AVG + SYS_CPU_DATA + SYS_MEM_DATA + TIMESTAMPS
        self.headers = head

    def add_target(self, target, name):
        if target in self.targets:
            self.targets[target].append(name)
        else:
            self.targets[target] = [name]

    def remove_target(self, target, name):
        if target in self.targets:
            if name in self.targets[target]:
                self.targets[target].remove(name)
                self.log.info('[MAIN THREAD] Removed {} named {}'.format(
                    target, name))
            else:
                self.log.error(
                    '[MAIN THREAD] Asked to remove {} named {} while not recorded'
                    .format(target, name))
        else:
            self.log.error(
                '[MAIN THREAD] Asked to remove {} named {} while not recorded'.
                format(target, name))

    def start_record(self, target, name):
        self.log.debug('[MAIN THREAD] Asking server to start recording')
        msg = MSG_SEP.join([START_RECORD, target, name])
        answer = send_data(self.soc_ctrl, msg)
        self.log.info('[MAIN THREAD] Server asked to start recording')
        if answer == SYNC:
            self.add_target(target, name)
            self.log.info('[MAIN THREAD] Added {} named {}'.format(
                target, name))
        else:
            self.log.warn(
                '[MAIN THREAD] Could not add {} named {} because of server answer'
                .format(target, name))

    def stop_record(self, target, name):
        self.log.debug('[MAIN THREAD] Asking server to stop recording')
        msg = MSG_SEP.join([STOP_RECORD, target, name])
        answer = send_data(self.soc_ctrl, msg)
        self.log.info(
            '[MAIN THREAD] Server asked to stop recording {}'.format(name))
        if answer == SYNC:
            self.remove_target(target, name)
        else:
            self.log.warn(
                '[MAIN THREAD] Could not remove {} named {} because of server answer'
                .format(target, name))

    def start_receive(self):
        if not self.receiving:
            self.receiving = True
            self.log.debug('[MAIN THREAD] Asking server to start sending')
            status = send_data(self.soc_ctrl, START_SEND)
            self.log.info('[MAIN THREAD] Server asked to start sending')
            if status == FAIL:
                self.log.error(
                    '[MAIN THREAD] Client tried to receive but server denied it'
                )
            else:
                print status
                self.data_client.start()
                self.log.info('[MAIN THREAD] Client is receiving')
            self.log.debug("[MAIN THREAD] DATA THREAD started")
        else:
            self.log.warn(
                "[MAIN THREAD] Asked to start receiving while already receiving"
            )

    def stop_receive(self):
        if self.receiving:
            self.log.debug(
                '[MAIN THREAD] Closing data channel. Exiting data client thread'
            )
            self.data_client.stop()
            self.log.info("[MAIN THREAD] Asked server to stop receiving")
            self.receiving = False
            send_data(self.soc_ctrl, STOP_SEND)
        else:
            self.log.warn(
                "[MAIN THREAD] Asked to stop receiving while not receiving")

    def start_store(self, dirname='easy_client'):
        return self.data_processor.start_store(dirname)

    def stop_store(self):
        self.data_processor.stop_store()

    def start_print(self):
        self.data_processor.start_print()

    def stop_print(self):
        self.printing = self.data_processor.stop_print()

    def stop_process(self):
        self.stop_print()
        self.stop_store()
        self.data_processor.stop()
        self.stop_receive()
        self.soc_ctrl.close()

    def stop_all(self):
        self.stop_process()
        send_data(self.soc_ctrl, STOP_ALL)
Ejemplo n.º 26
0
def train_and_validate(alarm, ticket):
    '''
    Function will perform the following actions:

    1. Data will be converted to numpy arrays to be accepted as
       input to the learning model
    2. Construct learning model and train on the data provided
    3. Generate data predictions using a subset of the ticket data
    4. Validate the predictions generated with the test data against the
       manually predicted values in the ticket file

            Parameters:
                alarm: alarm file needed to initialize DataProcessor
                ticket: ticket file needed to initialize DataProcessor
    '''
    # Initialize DataProcessor object
    dp = DataProcessor(alarm, ticket)

    # Create thread to run progress bar
    thread = threading.Thread(target=run_progress_bar, args=(670, ))

    # Start thread to run progress bar
    thread.start()
    '''
    Convert the lists to numpy arrays; the number passed to
    get_encoded_label_value is the index of the associated label values in the
    array.
    '''
    encoded_hex_codes = dp.convert_array_to_np_array(
        dp.encode_ticket_hex_codes())
    event_cause_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.event_cause_vals, 0))
    detection_method_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.detection_method, 1))
    restore_method_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.restore_method, 2))
    fix_classification_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.fix_classification, 3))
    subsystem_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.subsystem, 4))
    relevance_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.relevance, 5))

    # Train on Data. Training & saving the model for each of the labels
    classify_data(encoded_hex_codes, event_cause_options, 'event_cause.hdf5',
                  101, 110, 0.80, len(dp.event_cause_vals))

    classify_data(encoded_hex_codes, detection_method_options,
                  'detection_method.hdf5', 101, 110, 0.80,
                  len(dp.detection_method))

    classify_data(encoded_hex_codes, restore_method_options,
                  'restore_method.hdf5', 101, 110, 0.80,
                  len(dp.restore_method))

    classify_data(encoded_hex_codes, fix_classification_options,
                  'fix_classification.hdf5', 101, 110, 0.80,
                  len(dp.fix_classification))

    classify_data(encoded_hex_codes, subsystem_options, 'subsystem.hdf5', 101,
                  110, 0.80, len(dp.subsystem))

    classify_data(encoded_hex_codes, relevance_options, 'relevance.hdf5', 101,
                  110, 0.80, len(dp.relevance))
    '''
    Generate Training Data Predictions

    The final 20 percent of the alarm hex codes and of each label set is held
    out for testing. The starting index is found by multiplying the length of
    encoded_hex_codes, the model input, by 0.8.
    '''

    start_index = int(len(encoded_hex_codes) * 0.8)
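    # For example, with 1000 encoded hex codes, start_index is 800 and rows
    # 800-999 form the held-out prediction set.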

    predict_input_hex = encoded_hex_codes[start_index:]
    predict_event_cause = event_cause_options[start_index:]
    predict_detection_method = detection_method_options[start_index:]
    predict_restore_method = restore_method_options[start_index:]
    predict_fix_classification = fix_classification_options[start_index:]
    predict_subsystem = subsystem_options[start_index:]
    predict_relevance = relevance_options[start_index:]

    # Call prediction() from predict.py, which returns an array of confidence values
    event_cause_prediction = prediction(predict_input_hex, 'event_cause.hdf5')

    detection_method_prediction = prediction(predict_input_hex,
                                             'detection_method.hdf5')

    restore_method_prediction = prediction(predict_input_hex,
                                           'restore_method.hdf5')

    fix_classification_prediction = prediction(predict_input_hex,
                                               'fix_classification.hdf5')

    subsystem_prediction = prediction(predict_input_hex, 'subsystem.hdf5')

    relevance_prediction = prediction(predict_input_hex, 'relevance.hdf5')
    '''
    Validate Training Data Predictions

    Run validation for every label, comparing the arrays returned by the
    prediction calls above with the held-out ground truth.
    '''

    validation(predict_event_cause, event_cause_prediction, predict_input_hex,
               'event_cause_predictions.txt')

    validation(predict_detection_method, detection_method_prediction,
               predict_input_hex, 'detection_method_predictions.txt')

    validation(predict_restore_method, restore_method_prediction,
               predict_input_hex, 'restore_method_predictions.txt')

    validation(predict_fix_classification, fix_classification_prediction,
               predict_input_hex, 'fix_classication_predictions.txt')

    validation(predict_subsystem, subsystem_prediction, predict_input_hex,
               'subsystem_predictions.txt')

    validation(predict_relevance, relevance_prediction, predict_input_hex,
               'relevance_predictions.txt')

    # Wait for the progress bar thread to finish
    thread.join()
Ejemplo n.º 27
0
def run_gru(s):
    prob = tf.placeholder_with_default(1.0, shape=())

    global global_step
    global increment_global_step_op
    global reset_global_step_op
    global batches
    global images_placeholder
    global batches_op
    global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int32)
    increment_global_step_op = tf.assign(global_step, global_step + 1)
    reset_global_step_op = tf.assign(global_step, 0)

    batches = tf.get_variable("batches", [s.nTrain / int(s.batch_size), s.batch_size, 1, 1], dtype=tf.float32,
                              initializer=tf.zeros_initializer)
    images_placeholder = tf.placeholder(tf.float32, shape=(s.nTrain / int(s.batch_size), s.batch_size, 1, 1))
    batches_op = tf.assign(batches, images_placeholder)


    x_dims = s.lookback
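    # Fix the Python, NumPy and TensorFlow RNG seeds so runs are reproducible.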
    random.seed(6)
    np.random.seed(6)
    tf.set_random_seed(6)
    if s.implementation == "keras":
        if s.use_binary:
            raise Exception("Binary Keras not implemented")
        rnn = Sequential()
        if s.rnn_type == "lstm":
            rnn.add(LSTM(s.nodes, input_shape=(None,x_dims), kernel_initializer='he_uniform'))
        elif s.rnn_type == "gru":
            rnn.add(GRU(s.nodes, input_shape=(None, x_dims), kernel_initializer='he_uniform'))

        rnn.add(Dropout(0.5))
        rnn.add(Dense(1, kernel_initializer='he_uniform'))
        opt = rmsprop(lr=s.lr)#1e-3)
        rnn.compile(loss='mae', optimizer=opt)
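        # NOTE: the functional-API model built below reassigns rnn, so the
        # Sequential LSTM/GRU compiled above is discarded and only the
        # hand-built, GRU-like gated dense network is trained in this branch.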

        input = Input(shape=(1, x_dims))
        dense1 = Dense(s.nodes, activation='sigmoid')(input)
        dense2 = Dense(s.nodes, activation='sigmoid')(input)
        dense3 = Dense(s.nodes, activation='tanh')(input)
        mult1 = Multiply()([dense2, dense3])
        act1 = Activation('tanh')(mult1)
        mult2 = Multiply()([dense1, act1])
        reshape = Reshape((s.nodes,))(mult2)
        dropout = Dropout(0.5)(reshape)
        dense_out = Dense(1)(dropout)
        rnn = Model(inputs=[input], outputs=[dense_out])
        opt = adam(lr=s.lr)  # 1e-3)
        rnn.compile(loss='mae', optimizer=opt)
        print rnn.summary()


    elif s.implementation == "tf":
        data = tf.placeholder(tf.float32, [None, s.lookback,  1])  # Number of examples, number of input, dimension of each input
        target = tf.placeholder(tf.float32, [None, 1])
        if s.rnn_type == "lstm" and s.use_binary:
            cell = rnn_tf.LSTMCell(s.nodes)

        elif s.rnn_type == "lstm" and not s.use_binary:
            cell = tf.nn.rnn_cell.LSTMCell(s.nodes)
        elif s.rnn_type == "gru" and s.use_binary:
            cell = rnn_tf.GRUCell(s.nodes)
        elif s.rnn_type == "gru" and not s.use_binary:
            cell = tf.nn.rnn_cell.GRUCell(s.nodes)


        val, _ = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
        with tf.name_scope('rnn_summaries'):
            var = val
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)
        val = tf.nn.dropout(val, prob)
        if not s.use_binary:
            dense = tf.layers.dense(val, 1)
        else:
            dense = core_discretize.dense(val, 1)
        with tf.name_scope('dense_summaries'):
            var = dense
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)
        pred = tf.reshape(dense, (tf.shape(dense)[0], 1))
        summary = tf.summary.merge_all()
        optimizer = tf.train.AdamOptimizer(learning_rate=s.lr)
        #cost = tf.losses.mean_squared_error(target, pred)
        cost = tf.reduce_mean(tf.abs(target - pred))
        minimize = optimizer.minimize(cost)

    else:
        raise Exception("Unknown implementation " + s.implementation)


    sequence = readDataSet(s.dataSet, s.dataSetDetailed, s)
    if s.limit_to:
        sequence = sequence[:s.limit_to]

    #TEMP SANITY CHECK
    # sequence['data'][7001] = 0
    # sequence['data'][7002] = 0
    # sequence['data'][7003] = 0
    # sequence['data'][7004] = 0
    # sequence['data'][7005] = 0
    seq_full = sequence['data'].values #use .values to copy

    targetInput = seq_full[s.front_buffer + s.predictionStep - 1:].copy() #grab this now to avoid having to denormalize


    dp = DataProcessor()
    if s.normalization_type == 'default':
        (meanSeq, stdSeq) = dp.normalize('data', sequence, s.nTrain)
    elif s.normalization_type == 'windowed':
        dp.windowed_normalize(sequence)
    elif s.normalization_type == 'AN':
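        # Adaptive-normalization pipeline: moving average, stationarity
        # transform, outlier removal, then sliding-window renormalization.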
        an = AdaptiveNormalizer(s.lookback, s.lookback + s.predictionStep)
        an.set_pruning(False)
        an.set_source_data(seq_full, s.nTrain)
        an.do_ma('s')
        an.do_stationary()
        an.remove_outliers()
        seq_norm = an.do_adaptive_normalize()
    else:
        raise Exception("Unsupported normalization type: " + s.normalization_type)

    seq_actual = seq_full[s.front_buffer:] #Leave enough headroom for MASE calculation and lookback

    seq_full_norm = sequence['data'].values
    seq_actual_norm = seq_full_norm[s.front_buffer:]

    if s.normalization_type != "AN":
        #Default and windowed change the seq itself but still require creating lookback frames
        allX = getX(seq_full_norm, s)
        allY = seq_actual_norm[s.predictionStep-1:]
    else:
        #AN creates a new array but takes care of lookback internally
        allX= seq_norm[:,0:-s.predictionStep]
        allY = np.reshape(seq_norm[:,-1], (-1,))
        # TODO FIX PROPERLY (now rolled too far)
        too_long = len(allX) - (len(seq_full) - s.front_buffer - s.predictionStep + 1)
        if too_long > 0:
            allX = allX[too_long:]
            allY = allY[too_long:]

    print len(allX), len(allY), s.front_buffer
    predictedInput = np.full((len(allY),), np.nan) #Initialize all predictions to NaN

    trainX = allX[:s.nTrain]
    trainY = allY[:s.nTrain]
    trainX = np.reshape(trainX, (trainX.shape[0],1,  trainX.shape[1]))
    trainY = np.reshape(trainY, ( trainY.shape[0],))
    if s.implementation == "keras":
        rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size, verbose=min(s.max_verbosity, 2))
    elif s.implementation == "tf":
        sess = tf.Session()
        writer = tf.summary.FileWriter("results/", graph=sess.graph)
        init = tf.global_variables_initializer()
        sess.run(init)

        for v in tf.trainable_variables():
            print v.name
        for epoch in tqdm(range(s.epochs)):
            the_cost, _, summ = sess.run([cost, minimize, summary], feed_dict={data: trainX, target: trainY, prob: 0.5})
            writer.add_summary(summ, epoch)
            if epoch % 10 == 0:
                print the_cost
            #print(psutil.Process(os.getpid()).memory_percent())
            var = [v for v in tf.trainable_variables() if v.name == "rnn/gru_cell/gates/kernel:0"][0]
            print sess.run(tf.reduce_min(var))
            print sess.run(tf.reduce_max(var))
            # var = [v for v in tf.trainable_variables() if v.name == "rnn/gru_cell/gates/bias:0"][0]
            # print sess.run(tf.reduce_min(var))
            # print sess.run(tf.reduce_max(var))
            # var = [v for v in tf.trainable_variables() if v.name == "rnn/gru_cell/candidate/kernel:0"][0]
            # print sess.run(tf.reduce_min(var))
            # print sess.run(tf.reduce_max(var))
            # var = [v for v in tf.trainable_variables() if v.name == "rnn/gru_cell/candidate/bias:0"][0]
            # print sess.run(tf.reduce_min(var))
            # print sess.run(tf.reduce_max(var))
            # print "loop"
        var = [v for v in tf.trainable_variables() if v.name == "dense/bias:0"]
        print sess.run(var)

    minval = 10
    latestStart = None
    for i in tqdm(xrange(s.nTrain + s.predictionStep, len(allX)), disable=s.max_verbosity == 0):
    #for i in tqdm(xrange(0, len(allX)), disable=s.max_verbosity == 0):
    #for i in tqdm(xrange(10475, len(allX)), disable=s.max_verbosity == 0):
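        # Online mode: every retrain_interval steps, re-fit on the most recent
        # nTrain points (re-deriving the adaptive normalization if enabled)
        # before continuing to predict.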
        if i % s.retrain_interval == 0 and i > s.numLags+s.nTrain and s.online:
            if s.normalization_type == 'AN':

                predictedInput = np.array(an.do_adaptive_denormalize(predictedInput, therange=(i-s.retrain_interval, i)))
                latestStart = i
                an.set_ignore_first_n(i-s.nTrain-s.predictionStep)
                an.do_ma('s')
                an.do_stationary()
                an.remove_outliers()
                seq_norm = an.do_adaptive_normalize()

                allX = seq_norm[:, 0:-s.predictionStep]
                allY = np.reshape(seq_norm[:, -1], (-1,))

            trainX = allX[i-s.nTrain-s.predictionStep:i-s.predictionStep]
            trainY = allY[i-s.nTrain-s.predictionStep:i-s.predictionStep]
            trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
            trainY = np.reshape(trainY, (trainY.shape[0], 1))
            if s.implementation == "keras":
                rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size, verbose=0)
            elif s.implementation == "tf":
                for epoch in range(s.epochs):
                    sess.run(minimize, feed_dict={data: trainX, target: trainY, prob: 0.5})


        if s.implementation == "keras":
            predictedInput[i] = rnn.predict(np.reshape(allX[i], (1,1,x_dims)))

        elif s.implementation == "tf":
            predictedInput[i] = sess.run(dense, feed_dict={data: np.reshape(allX[i], (1, x_dims, 1))})
            #if len(allX) > i+5:
            #    predictedInput[i] = allY[i-3000]

        # if i == 10000:
        #     print allX[i]
        #     print "should be ", (targetInput[i] - meanSeq) / stdSeq
        #     print "predicted as ", predictedInput[i]

    # for i in range(s.nTrain + s.predictionStep):
    #     predictedInput[i] = np.nan
    print "SMALLEST", minval
    # np.set_printoptions(threshold=np.nan, suppress=True)
    # print "ALLY START"
    # for val in allY:
    #     print val
    # print "ALLY STOP"

    if s.normalization_type == 'default':
        predictedInput = dp.denormalize(predictedInput, meanSeq, stdSeq)
        #targetInput = dp.denormalize(targetInput, meanSeq, stdSeq)
    elif s.normalization_type == 'windowed':
        dp.windowed_denormalize(predictedInput, targetInput,  pred_step=s.predictionStep)
    elif s.normalization_type == 'AN':
        if latestStart:
            predictedInput = np.array(an.do_adaptive_denormalize(predictedInput, therange=(latestStart, len(predictedInput))))
        else:
            predictedInput = np.array(an.do_adaptive_denormalize(predictedInput))
        if an.pruning:
            targetInput = np.delete(targetInput, an.deletes)
    print len(predictedInput), len(targetInput), "LENS"
    #TEMP SANITY CHECK
    #print predictedInput[7005 - s.front_buffer - s.predictionStep +1]
    #print predictedInput[7006 - s.front_buffer - s.predictionStep + 1]
    dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru', s.predictionStep, s.max_verbosity)
    skipTrain = s.ignore_for_error
    from plot import computeSquareDeviation
    squareDeviation = computeSquareDeviation(predictedInput, targetInput)
    squareDeviation[:skipTrain] = None
    nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput)
    if s.max_verbosity > 0:
        print "", s.nodes, "NRMSE {}".format(nrmse)
    mae = np.nanmean(np.abs(targetInput-predictedInput))
    if s.max_verbosity > 0:
        print "MAE {}".format(mae)
    mape = errors.get_mape(predictedInput,targetInput, s.ignore_for_error)
    if s.max_verbosity > 0:
            print "MAPE {}".format(mape)
    mase = errors.get_mase(predictedInput, targetInput, np.roll(targetInput, s.season), s.ignore_for_error)
    if s.max_verbosity > 0:
        print "MASE {}".format(mase)

    if s.implementation == "tf":
        sess.close()
    return mase
Ejemplo n.º 28
0
from tensorflow.keras.models import Sequential
# from keras.models import Sequential # Note: included for debug source access
from tensorflow.keras.layers import Dense
# from keras.layers import Dense # Note: included for debug source access
import os
import numpy as np
import sys
from data_processing import DataProcessor
from matplotlib import pyplot

cur_path = os.path.dirname(__file__)
data_folder = "data\\titanic"
processed_data_folder = os.path.join(cur_path, data_folder)
# Note: Not using test.csv as it does not provide whether or not the passenger survived; therefore we cannot assess
#       how well the model performed.
data_file_path = os.path.join(processed_data_folder, "train.csv")
data_processor = DataProcessor(data_file_path, processed_data_folder, "ffnn_processed.npz")

# Load data
try:
    # Try to load data
    data_processor.load_processed_data()

except FileNotFoundError:
    # No data found, so process it
    # 20% test, 20% validation, 60% training samples from data
    splits = (0.2, 0.2, 0.6)
    # Only use certain columns
    use_cols = (  # 0, #PassengerID
                    1,  # Survived
                    2,  # Pclass
                    # 3, #Name
Ejemplo n.º 29
0
class DataCollector:
    """ For all data collection. """
    def __init__(self):
        self.engine = Engine()
        self.processor = DataProcessor()

    def get_single_box_score(self, game_id):
        """
        Gathers a box score from basketball-reference.com and stores to database.

        Keyword arguments:
        game_id: 12 character long string in form YYYYMMDD0XXX, where
            YYYY is the year
            MM is a 2 digit numeric representation of the month, with zero padding if necessary
            DD is a 2 digit numeric representation of the day, with zero padding if necessary
            XXX is the 3-character abbreviation of the home team,
                i.e. 'BOS' for Boston Celtics or 'NYK' for New York Knicks
        """

        url = BK_REF_URL + game_id + HTML_SUFFIX

        page_response = requests.get(url, headers=REQUEST_HEADERS)
        page_tree = html.fromstring(page_response.content)

        home_stats, away_stats = [], []

        for stat in [
                'pts', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb',
                'ast', 'stl', 'blk', 'tov'
        ]:
            away, home = page_tree.xpath(BK_REF_XPATH % stat)
            away_stats.append(int(away.text.strip()))
            home_stats.append(int(home.text.strip()))

        minutes = int(page_tree.xpath(BK_REF_XPATH % 'mp')[0].text.strip()) / 5
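        # Total player-minutes divided by the 5 players on court gives the game
        # length in minutes (48 in regulation, more with overtime).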

        data_values = tuple([None, None, game_id] + away_stats + home_stats +
                            [minutes])

        self.engine.insert_box_score(data_values)

    def get_season_schedule(self, year):
        """
        Gathers a full season's game schedule by traversing basketball-reference.com
        These will eventually be used by the get_single_box_score method to gather box scores

        Keyword arguments:
        year: int representing the year that a season concludes in i.e. 1986-1987 season is represented by 1987
        """
        schedule = []
        for month in [
                'october', 'november', 'december', 'january', 'february',
                'march', 'april'
        ]:
            url = BK_REF_SCHEDULE_URL % (str(year), month)

            page_response = requests.get(url, headers=REQUEST_HEADERS)

            if page_response.status_code == 404:
                continue

            page_tree = html.fromstring(page_response.content)

            game_headers = page_tree.xpath(BK_REF_SCHEDULE_XPATH + 'th')
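            # Season pages through 2000 list the visiting team in the first
            # data cell; later pages shift it to the second, hence the
            # td[1]/td[2] switch below.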
            away_xpath = 'td[1]/a' if int(year) <= 2000 else 'td[2]/a'
            away_teams = page_tree.xpath(BK_REF_SCHEDULE_XPATH + away_xpath)

            # handle special case for april, where playoff games are displayed on the page
            if month == 'april':
                header_list = page_tree.xpath(BK_REF_SCHEDULE_XPATH + 'th')
                try:
                    end_index = next(index
                                     for index, val in enumerate(header_list)
                                     if not val.get('class', False))
                except StopIteration:
                    end_index = len(game_headers)
            else:
                end_index = len(game_headers)

            for index, game in enumerate(game_headers):
                if index == end_index:
                    break
                game_code = game.attrib['csk']
                away_url = away_teams[index].attrib['href']
                away_team = away_url.split('/')[2]
                home_team = game_code[-3:]
                game_date = '{}-{}-{}'.format(game_code[:4], game_code[4:6],
                                              game_code[6:8])
                schedule.append(
                    (game_code, game_date, year, away_team, home_team))

        self.engine.insert_scheduled_games(schedule)
        self.engine.commit_changes()

    def gather_all_scheduled_games(self):
        """
        Simple loop to gather every season's schedule from 1986 to the present
        """
        print "Loading each season schedule and saving to database:"
        for season in SEASON_LIST:
            print season
            self.get_season_schedule(season)

    def gather_all_box_scores(self):
        """
        Gather box scores for every scheduled game, committing after each one because the run will likely be interrupted at some point
        """
        games = self.engine.get_game_ids_to_gather()
        for game_id in games:
            print game_id
            self.get_single_box_score(game_id)
            self.engine.commit_changes()

    def fill_database_from_scratch(self):
        """ Starting with model but no records, fill in the database """
        # start by loading teams table in
        self.engine.insert_all_team_data()
        # gather and save all scheduled games into db
        self.gather_all_scheduled_games()
        # gather and save all box scores into db
        self.gather_all_box_scores()
        # calculate all team stats from box scores
        self.processor.complete_database_setup()

    def in_season_update(self):
        self.gather_all_box_scores()
        self.processor.process_all_stats_for_year(CURRENT_SEASON)