Example #1
0
    def initial_train(self):
        """Train the model on whatever training data already exists in Mongo.

        Returns:
            bool: True if training data was found and training ran,
            False if the collection was empty and training was skipped.
        """
        log.info('Performing initial training of the model.')
        client = mongoUtil.get_client()
        db = client.otc
        train_data = db.train_data
        if_trained = False
        log.info('Established connection to Mongo Database.')
        old_data_x_frame = pd.DataFrame(list(train_data.find()))
        if not old_data_x_frame.empty:
            if_trained = True
            log.info('Initial training sequence started.')
            old_data_y_frame = old_data_x_frame['Break Owner'].astype(str)
            # Strip Mongo bookkeeping fields and the label columns in one
            # call instead of four separate in-place drops.
            old_data_x_frame.drop(
                columns=['_id', 'timestamp', 'Break Owner', 'Break Reason'],
                inplace=True)
            self.training_data = old_data_x_frame
            train_x = self.one_hot(old_data_x_frame)
            # Remember the one-hot column layout so later inputs can be
            # aligned to the same frame.
            self.training_frame = train_x.columns.values
            train_y = list(old_data_y_frame)
            self.current_report = self.execute_train_report(train_x, train_y)
        else:
            log.info(
                'Cannot find training data in Mongo Database. Abort initial training sequence.'
            )

        return if_trained
Example #2
0
def load_purging():
    """Flask view that purges training data in the session's date range and
    retrains the model on whatever data remains.

    POST: perform the purge and retrain, then redirect to the result page.
    GET: render the loading page.
    """
    if request.method == 'POST':
        mongoUtil.purge_model(session['start'], session['end'])
        # Retrain on the remaining (post-purge) dataset.  Log *before* the
        # call so the message reflects reality; previously "begun" was
        # emitted only after training had already finished.
        log.info('Model training on purged model begun.')
        model.initial_train()
        client = mongoUtil.get_client()
        db = client.otc
        train_data = db.train_data
        # If the purge removed everything there is nothing to train on:
        # clear the report and reset the model to its untrained state.
        # NOTE(review): collection.count() is deprecated (removed in
        # pymongo 4); prefer count_documents({}) when upgrading the driver.
        if train_data.count() == 0:
            model.current_report = None
            model.reset_model()
        log.info('Model training on purged model completed.')
        return redirect(url_for('purge_result'))
    return render_template('loading_purge.html')
Example #3
0
def purge_model():
    """Flask view that collects a purge date range and routes the purge.

    GET: render the purge form.
    POST: validate the submitted date range; on bad input redirect to the
    result page with an error, otherwise store the parsed range in the
    session and route either straight to the result page (nothing in range
    to purge) or to the loading page that performs the purge.
    """
    try:
        log.info('Model data is being purged.')
        # Connect to the training-data collection.
        client = mongoUtil.get_client()
        db = client.otc
        train_data = db.train_data
        if request.method == 'POST':
            log.info('POST: waiting for purge time frame.')
            start_date = request.values["Start Date"]
            end_date = request.values["End Date"]
            # Reject empty fields and reversed ranges.  The strings are
            # ISO-formatted (YYYY-MM-DD) so lexical comparison is safe.
            if end_date < start_date or not start_date or not end_date:
                errorString = 'Invalid dates detected. Please select both start and end date.'
                log.error(errorString)
                session['errorOc'] = True
                session['errorInfo'] = errorString
                return redirect(url_for('purge_result'))
            # Parse the validated strings into datetimes for Mongo queries.
            start_date = datetime.strptime(start_date, '%Y-%m-%d')
            end_date = datetime.strptime(end_date, '%Y-%m-%d')
            session['start'] = start_date
            session['end'] = end_date
            log.info('Purge start date: ' + str(start_date) +
                     '. Purge end date: ' + str(end_date))
            # Nothing inside the range: the purge would be a no-op, so skip
            # straight to the result page without retraining.
            # NOTE(review): collection.count() is deprecated (removed in
            # pymongo 4); prefer count_documents(...) on driver upgrade.
            if train_data.count(
                {"timestamp": {
                    '$gte': start_date,
                    '$lte': end_date
                }}) == 0:
                return redirect(url_for('purge_result'))
            if train_data.count() != 0:
                return redirect(url_for('load_purging'))
        return render_template('modify_model.html')
    except Exception:
        # Previously a bare ``except:`` that swallowed everything (including
        # KeyboardInterrupt) and fell off the end, returning None from a
        # Flask view.  Log the traceback and surface the error the same way
        # the date-validation branch does.
        errorString = 'Unknown error occurred.'
        log.exception(errorString)
        session['errorOc'] = True
        session['errorInfo'] = errorString
        return redirect(url_for('purge_result'))
Example #4
0
    def flag(self, df):
        """Mark rows of *df* whose categorical values never appeared in the
        stored training data.

        A row gets ``'x'`` in a new ``'FLAG'`` column if any non-numeric,
        non-bookkeeping column holds a value absent from the training set.
        Note: this mutates *df* in place (``df_new`` aliases it) and also
        returns it.
        """
        df_new = df
        df_new['FLAG'] = None

        client = mongoUtil.get_client()
        db = client.otc

        train_data = pd.DataFrame(list(db.train_data.find({})))

        # Columns that are continuous or Mongo bookkeeping and should never
        # trigger a flag.
        skip_cols = {'NOTIONAL', 'NOTIONAL_2', 'STRIKE_PRICE', 'DIFF_AGE',
                     '_id', 'timestamp'}
        # Precompute the set of known values per checkable column ONCE.
        # Previously a fresh set was rebuilt for every (row, column) pair
        # inside the nested loop: O(rows * cols * uniques).
        known = {
            col: set(train_data[col].unique().flatten())
            for col in train_data.columns if col not in skip_cols
        }

        for index, row in df_new.iterrows():
            for col, values in known.items():
                if row[col] not in values:
                    df_new.loc[index, 'FLAG'] = 'x'
                    break

        return df_new
Example #5
0
    def re_train(self, filename):
        """Retrain the model on a new CSV merged with any training data
        already stored in Mongo.

        Args:
            filename: Name of a CSV file inside the ``train_data/`` folder.

        Returns:
            The report produced by ``execute_train_report`` (also stored on
            ``self.current_report``).
        """
        log.info('Training started.')
        start = time.time()
        # Grab new data from the data folder.
        csv_path = 'train_data/' + str(filename)
        df = pd.read_csv(csv_path)
        new_data_x_frame = df.drop('Break Owner', axis=1).drop('Break Reason',
                                                               axis=1)
        new_data_y_frame = df['Break Owner'].astype(str)
        # Only used on the no-previous-data path below.
        new_data_y = df[['Break Owner']].values

        # Grab the original training data from the database.
        log.info('Establishing connection to Mongo Database.')
        client = mongoUtil.get_client()
        db = client.otc
        train_data = db.train_data

        log.info('Connection established.')
        old_data_x_frame = pd.DataFrame(list(train_data.find()))

        if not old_data_x_frame.empty:
            log.info('Previous training data was found.')
            old_data_y_frame = old_data_x_frame['Break Owner'].astype(str)
            # Strip Mongo bookkeeping fields and the label columns in one
            # call instead of four separate in-place drops.
            old_data_x_frame.drop(
                columns=['_id', 'timestamp', 'Break Owner', 'Break Reason'],
                inplace=True)

            log.info('Performing data clean up.')
            new_cols = set(new_data_x_frame.columns)
            old_cols = set(old_data_x_frame.columns)

            # Remove columns the stored training data doesn't have ...
            for col in new_cols - old_cols:
                new_data_x_frame = new_data_x_frame.drop(col, axis=1)
            # ... and zero-fill columns it has but the new data lacks, so
            # the two frames line up before concatenation.
            for col in old_cols - new_cols:
                new_data_x_frame[col] = 0

            log.info('Merging previous data with new data.')
            train_x = pd.concat([old_data_x_frame, new_data_x_frame],
                                ignore_index=True)
            train_y = list(pd.concat([old_data_y_frame, new_data_y_frame],
                                     ignore_index=True))

            self.training_data = train_x
            train_x = self.one_hot(train_x)
            # Remember the one-hot column layout for aligning later inputs.
            self.training_frame = train_x.columns.values
            self.current_report = self.execute_train_report(train_x, train_y)
        else:
            log.info('No previous training data found.')
            self.training_data = new_data_x_frame
            new_data_x_frame = self.one_hot(new_data_x_frame)
            self.training_frame = new_data_x_frame.columns.values

            new_data_x = new_data_x_frame.values
            self.current_report = self.execute_train_report(
                new_data_x,
                new_data_y,
                test_size=config.getfloat('Misc', 'TEST_SIZE_2'),
                random_state=config.getint('Misc', 'RSTATE_1'))
        end = time.time()
        log.info('Total training time: ' + str(end - start))
        return self.current_report