Example #1
def main():
    # Evaluate a saved character-level RNN checkpoint on a text file and
    # report the average loss over the dataset.
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--model_file', type=str)
    argparser.add_argument('--cpu', action='store_true')
    argparser.add_argument('--cuda', action='store_true')
    argparser.add_argument('--chunk_len', type=int, default=200)
    argparser.add_argument('--batch_size', type=int, default=300)
    argparser.add_argument('--num_workers', type=int, default=8)
    argparser.add_argument('filename', type=str)
    args = argparser.parse_args()

    if args.cpu:
        decoder = torch.load(args.model_file,
                             map_location=lambda storage, loc: storage)
    else:
        decoder = torch.load(args.model_file)

    dataset = WordDataset(args.filename, args.chunk_len)
    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            num_workers=args.num_workers,
                            drop_last=True)

    criterion = nn.CrossEntropyLoss()

    loss, num_samples = 0, 0
    for sample in dataloader:
        input_, target = prep_data(sample['input'], sample['target'],
                                   args.cuda)
        loss += evaluate(decoder, criterion, input_, target, args.batch_size,
                         args.chunk_len, args.cuda)
        num_samples += 1
    loss /= num_samples

    print('Loss (BPC): {:.2f}'.format(loss))
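Examples #1 and #3 pass each batch through a project-specific prep_data helper before evaluation or training. The helper itself is not shown on this page; judging only from the call sites, a minimal sketch (an assumption, not the project's actual implementation) could look like this:

# Hypothetical sketch of the prep_data used in Examples #1 and #3, inferred
# from its call sites: it moves the batch tensors to the GPU when --cuda is set.
def prep_data(input_, target, cuda):
    if cuda:
        input_, target = input_.cuda(), target.cuda()
    return input_, target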
Example #2
def predict(player_id):
    # Predict goals for a player and append the result, keyed by the date of
    # the player's most recent game, to the 'predictions' table.
    X = fetch_player_data(player_id)
    date = X['date'].max()
    rolling = prep_data(X)
    goals = round(pipe.predict(rolling)[0], 2)
    df = pd.DataFrame({
        'date_created': pd.Timestamp('now'),
        'player_id': [player_id],
        'last_game': [date],
        'goals': [goals]
    })
    df.to_sql('predictions', con, if_exists='append', index=False)
    print('Success!')
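Example #2 only writes to the predictions table. A read-back query along the following lines (illustrative; it assumes the same con connection and reuses the 'ovechal01' player_id from Example #4) shows how the appended rows can be retrieved:

import pandas as pd

# Fetch the most recent stored prediction for a player (columns as written by
# Example #2: date_created, player_id, last_game, goals).
latest = pd.read_sql(
    'select * from predictions where player_id = ? '
    'order by date_created desc limit 1',
    con, params=('ovechal01',))
print(latest)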
Example #3
def main():
    # Train a character-level RNN (CharRNN), resuming from a checkpoint when
    # --model_file is given, and stop early once validation loss increases.

    # Parse command line arguments
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--train_set', type=str, required=True)
    argparser.add_argument('--valid_set', type=str, required=True)
    argparser.add_argument('--model', type=str, default="gru")
    argparser.add_argument('--model_file', type=str, default='None')
    argparser.add_argument('--n_epochs', type=int, default=30)
    argparser.add_argument('--hidden_size', type=int, default=200)
    argparser.add_argument('--n_layers', type=int, default=3)
    argparser.add_argument('--learning_rate', type=float, default=0.01)
    argparser.add_argument('--chunk_len', type=int, default=200)
    argparser.add_argument('--batch_size', type=int, default=300)
    argparser.add_argument('--num_workers', type=int, default=8)
    argparser.add_argument('--cuda', action='store_true')
    argparser.add_argument('--cpu', action='store_true')
    args = argparser.parse_args()

    # Initialize models and start training

    if args.model_file == 'None':  # no checkpoint given: train from scratch
        decoder = CharRNN(
            n_characters,
            args.hidden_size,
            n_characters,
            model=args.model,
            n_layers=args.n_layers,
        )
        epoch_from = 1
        prev_valid_loss = sys.maxsize
        old_filename = None
    else:
        if args.cpu:
            decoder = torch.load(args.model_file,
                                 map_location=lambda storage, loc: storage)
        else:
            decoder = torch.load(args.model_file)
        # Recover the architecture, epoch, and previous validation loss that
        # are encoded in the checkpoint filename.
        info = args.model_file.split('_')
        args.model = info[0]
        epoch_from = int(info[1][5:]) + 1
        args.n_layers = int(info[2][7:])
        args.hidden_size = int(info[5][2:])
        prev_valid_loss = float(info[7][4:-3])
        old_filename = args.model_file

        print(
            "Successfully loaded model! Continuing from epoch {0} with valid loss {1}"
            .format(epoch_from, prev_valid_loss))

    optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    if args.cuda:
        decoder.cuda()

    start = time.time()

    train_dataset = WordDataset(args.train_set, args.chunk_len)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  drop_last=True)

    valid_dataset = WordDataset(args.valid_set, args.chunk_len)
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  drop_last=True)

    try:

        print('Training for a maximum of {} epochs...'.format(args.n_epochs))
        for epoch in range(epoch_from, args.n_epochs + 1):

            train_loss, num_samples = 0, 0
            for s in tqdm(train_dataloader):
                input_, target = prep_data(s['input'], s['target'], args.cuda)
                train_loss += train(decoder, optimizer, criterion, input_,
                                    target, args.batch_size, args.chunk_len,
                                    args.cuda)
                num_samples += 1
            train_loss /= num_samples

            valid_loss, num_samples = 0, 0
            for s in valid_dataloader:
                input_, target = prep_data(s['input'], s['target'], args.cuda)
                valid_loss += evaluate(decoder, criterion, input_, target,
                                       args.batch_size, args.chunk_len,
                                       args.cuda)
                num_samples += 1
            valid_loss /= num_samples

            elapsed = time_since(start)
            pcnt = epoch / args.n_epochs * 100
            log = (
                '{} elapsed - epoch #{} ({:.1f}%) - training loss (BPC) {:.2f} '
                '- validation loss (BPC) {:.2f}')
            print(log.format(elapsed, epoch, pcnt, train_loss, valid_loss))

            if valid_loss > prev_valid_loss:
                print('No longer learning, just overfitting, stopping here.')
                break
            else:
                filename = model_file_name(decoder, epoch, train_loss,
                                           valid_loss)
                torch.save(decoder, filename)
                print('Saved as {}'.format(filename))
                if old_filename:
                    os.remove(old_filename)
                old_filename = filename

            prev_valid_loss = valid_loss

    except KeyboardInterrupt:
        print("Saving before quit...")
        try:
            valid_loss
        except NameError:  # interrupted before the first validation pass
            valid_loss = 'no_val'
        filename = model_file_name(decoder, epoch, train_loss, valid_loss)
        torch.save(decoder, filename)
        print('Saved as {}'.format(filename))
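Example #3 rebuilds its training state by slicing fields out of the checkpoint filename (info[1][5:] for the epoch, info[2][7:] for the layer count, info[5][2:] for the hidden size, info[7][4:-3] for the validation loss). The model_file_name helper is not shown on this page; a hypothetical sketch consistent with that slicing, assuming CharRNN exposes model, n_layers, and hidden_size attributes, might be:

# Hypothetical sketch only, inferred from the filename parsing in Example #3.
# Fields 3, 4, and 6 (chunk, batch, train) are guesses; only the positions the
# code actually parses (model, epoch, n_layers, hidden size, valid loss) are
# constrained by the slicing above.
def model_file_name(decoder, epoch, train_loss, valid_loss):
    return '{}_epoch{}_nlayers{}_chunk{}_batch{}_hs{}_train{}_loss{}.pt'.format(
        decoder.model, epoch, decoder.n_layers, 200, 300,
        decoder.hidden_size, train_loss, valid_loss)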
Example #4
import pickle
import sqlite3
import pandas as pd

from helpers import prep_data

con = sqlite3.connect('data/hockey.db')

player_id = 'ovechal01'

# Pull the player's first five games, ordered by date, as a DataFrame
# (parameterized query rather than string interpolation).
new = pd.read_sql(
    'select * from players where player_id = ? order by date asc limit 5',
    con, params=(player_id,))

X = prep_data(new)

with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

# Print the predicted goals for the queried player.
print(pipe.predict(X)[0])
Example #5
def predict(player_id):
    # Fetch and prepare the player's data, then return the predicted goals
    # rounded to two decimals.
    X = fetch_player_data(player_id)
    X = prep_data(X)
    return round(pipe.predict(X)[0], 2)
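Examples #2 and #5 rely on module-level pipe and con objects that are created elsewhere. Assuming these snippets belong to the same project as Example #4 (same database and pickle paths), a minimal setup and call might look like:

import pickle
import sqlite3

# Same database and pickled pipeline as in Example #4 (paths are assumptions).
con = sqlite3.connect('data/hockey.db')
with open('pickles/pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

print(predict('ovechal01'))  # player_id taken from Example #4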
Example #6
    def predict(self, payload):  # receives user_id, outputs recommendation_id
        """Called once per request. Runs preprocessing of the request payload,
        inference, and postprocessing of the inference output. Required.

        Args:
            payload: The parsed JSON request payload.

        Returns:
            Prediction or a batch of predictions.
        """

        self.model.connect_db()
        user_id = payload

        # (For local testing, ratings were instead read from exported
        # Letterboxd/IMDb CSV files.)

        query = "SELECT EXISTS(SELECT 1 FROM user_letterboxd_ratings where user_id=%s);" 
        self.model.cursor_dog.execute(query, (user_id,))
        boolean = self.model.cursor_dog.fetchall()
        
        if boolean[0][0]==False: # True
            self.model.cursor_dog.close()
            self.model.connection.close()
            return "user_id not found"
    
    
        self.model.cursor_dog.execute("SELECT date, name, year, letterboxd_uri, rating FROM user_letterboxd_ratings WHERE user_id=%s;", (user_id,))
        ratings_sql= self.model.cursor_dog.fetchall()
        ratings = pd.DataFrame(ratings_sql, columns = ['Date', 'Name', 'Year', 'Letterboxd URI', 'Rating'])
        ratings= ratings.dropna()
        

        # Watchlist, watched-history, and title-basics queries are disabled
        # here; prep_data below receives None for those frames.

        """ Prepare data  """
        good_list, bad_list, hist_list, val_list, ratings_dict = prep_data(
                                    ratings, watched_df=None, watchlist_df=None, good_threshold=3, bad_threshold=2) 
        
        """ Load JSON into a list (if applicable) """ 
        # payload_jsonified = json.dumps(payload)
        # movie_dict = json.loads(payload_jsonified)
        # movie_list = list(movie_dict.values())
        
        """ Run prediction with parameters """
        
        predictions = self.model.predict(
            good_list, bad_list, hist_list, val_list, ratings_dict,
            n=20, harshness=4, rec_movies=True, scoring=True)
        
        """ Turn predictions into JSON """
        
        names = ['Title', 'Year', 'IMDB URL', 'Average Rating', 'Number of Votes', 'Similarity Score', 'IMDB ID']
        names_lists = {key:[] for key in names}
        
        for x in range(0, len(predictions[0])):
            for y in range(0, len(predictions)):
                names_lists[names[x]].append(predictions[y][x])
                
        results_dict = [dict(zip(names_lists,t)) for t in zip(*names_lists.values())]
        recommendation_json = json.dumps(results_dict)
        
        
        """ Commit to the database """
        
        string_json = str(recommendation_json)
        hash_object = hashlib.md5(string_json.encode('ascii'))
        recommendation_id = hash_object.hexdigest()
        
        query = "SELECT EXISTS(SELECT 1 FROM recommendations where recommendation_id=%s);" 
        self.model.cursor_dog.execute(query, (recommendation_id,))
        boolean = self.model.cursor_dog.fetchall()
        date = datetime.now()
        
        if boolean[0][0]: # True
            self.model.cursor_dog.close()
            self.model.connection.close()
            return "Already recommended", recommendation_json
        else:
            query = "INSERT INTO recommendations(user_id, recommendation_id, recommendation_json, date) VALUES (%s, %s, %s, %s);"
            self.model.cursor_dog.execute(query, (user_id, recommendation_id, recommendation_json, date))
            self.model.connection.commit()
            self.model.cursor_dog.close()
            self.model.connection.close()
            return "Recommendation committed to DB with id:", recommendation_id