コード例 #1
0
ファイル: predict_rate.py プロジェクト: hy950921/cs584-hw4
def main():
  """Run the rating-prediction pipeline and write the blended results.

  Steps, in order:
    1. Load ./data/train.csv and ./data/test.csv through readFile, wrap the
       shuffled training rows in a Surprise trainset and pass it, the test
       data and an SVD model to cf_predict.
    2. Fill the module-level user_movie_rating dictionary from the training
       file and hand the movie_genres, movie_directors, movie_actors and
       movie_tags dictionaries to fill_content_dict.
    3. Call contentBased_predict once for every data row of ./data/test.csv.
    4. Write the arithmetic mean of the two predictions for each row, one
       value per line, to ./data/res.data.

  Progress and timing messages are printed to stdout. Returns None.
  """
  timeStart = time()
  print('Pre-processing cf data...')
  train_data = readFile('./data/train.csv', separator=' ', columns=['userID', 'movieID', 'rating'],
                        types={'userID': np.int32, 'movieID': np.int32, 'rating': np.float32})
  # Sampling as many rows as there are rows reorders the training data.
  # NOTE(review): assumes readFile returns a pandas DataFrame - confirm.
  train_data = train_data.sample(n=len(train_data))
  test_data = readFile('./data/test.csv', separator=' ', columns=['userID', 'movieID'],
                       types={'userID': np.int32, 'movieID': np.int32})

  reader = Reader(rating_scale=(0, 5))
  train_data = Dataset.load_from_df(train_data, reader)
  model = SVD(n_factors=5, n_epochs=50)
  train_data = train_data.build_full_trainset()
  # NOTE(review): cf_predict presumably fits the model and returns one
  # prediction per test row, in test-file order - confirm against its definition.
  cf_predictions = cf_predict(train_data, test_data, model)
  print('\ncf train-predict costs (%d seconds)' % (time() - timeStart))

  # Group the training ratings as {userID: [(movieID, rating), ...]}.
  with open("./data/train.csv", "r") as train_file:
    next(train_file, None)  # skip the header row
    for row in train_file:
      tokens = word_tokenize(row)
      user_id = int(tokens[0])
      if user_movie_rating.get(user_id) is None:
        user_movie_rating[user_id] = []
      user_movie_rating[user_id].append((int(tokens[1]), float(tokens[2])))

  fill_content_dict("./data/additional_files/movie_genres.data", movie_genres)
  fill_content_dict("./data/additional_files/movie_directors.data", movie_directors)
  fill_content_dict("./data/additional_files/movie_actors.data", movie_actors)
  fill_content_dict("./data/additional_files/movie_tags.data", movie_tags)
  # This figure is measured from the very start, so it includes the CF stage.
  print("Done! (%d seconds)\n" % (time() - timeStart))

  print("Predicting...")
  timePred = time()
  content_predictions = []
  with open("./data/test.csv", "r") as test_file:
    next(test_file, None)  # skip the header row
    for row in test_file:
      tokens = word_tokenize(row)
      # Predict exactly once per row; the result goes straight into the list.
      content_predictions.append(contentBased_predict(int(tokens[0]), int(tokens[1])))
  print("Done! (%d seconds)\n" % (time() - timePred))

  print("Writing to file...")
  with open("./data/res.data", 'w') as predFile:
    # zip stops at the shorter sequence, so a length mismatch between the two
    # prediction lists would silently drop trailing rows.
    for content_score, cf_score in zip(content_predictions, cf_predictions):
      predFile.write("%f\n" % ((float(content_score) + float(cf_score)) / 2))
コード例 #2
0
def main():
    """Produce blended rating predictions and save them to ./data/res.data.

    The function runs four stages in sequence:

    * Collaborative filtering: the train and test files are loaded with
      readFile, the shuffled training rows become a Surprise trainset, and
      cf_predict is called with that trainset, the test data and an SVD model.
    * Content data: user_movie_rating is filled from the training file and
      the movie_genres, movie_directors, movie_actors and movie_tags
      dictionaries are passed to fill_content_dict.
    * Content-based prediction: contentBased_predict is called for each data
      row of ./data/test.csv.
    * Output: for each pair of predictions the mean is written on its own
      line.

    Timing and progress messages go to stdout. Returns None.
    """
    timeStart = time()

    # --- Collaborative-filtering stage ---
    print('Pre-processing cf data...')
    # Read train (shuffled) and test data as DataFrames
    train_data = readFile('./data/train.csv',
                          separator=' ',
                          columns=['userID', 'movieID', 'rating'],
                          types={
                              'userID': np.int32,
                              'movieID': np.int32,
                              'rating': np.float32
                          })
    train_data = train_data.sample(n=len(train_data))
    test_data = readFile('./data/test.csv',
                         separator=' ',
                         columns=['userID', 'movieID'],
                         types={
                             'userID': np.int32,
                             'movieID': np.int32
                         })

    # Build the train data as a Surprise Dataset object
    reader = Reader(rating_scale=(0, 5))  # Standardized rating scale
    train_data = Dataset.load_from_df(train_data, reader)

    model = SVD(n_factors=5, n_epochs=50)

    # Build a Trainset object to feed into the prediction algorithm.
    train_data = train_data.build_full_trainset()

    # Predict ratings for each user and associated movie
    cf_predictions = cf_predict(train_data, test_data, model)

    print('\ncf train-predict costs (%d seconds)' % (time() - timeStart))

    # --- Content data stage ---
    # Create dict {userID: [(movieID1, rating1), (movieID2, rating2), ...]};
    # it only includes ratings for movies that the user has seen.
    with open("./data/train.csv", "r") as train_file:
        next(train_file, None)  # the first line is a header, not a rating
        for row in train_file:
            tokens = word_tokenize(row)
            user_id = int(tokens[0])
            if user_movie_rating.get(user_id) is None:
                user_movie_rating[user_id] = []
            user_movie_rating[user_id].append(
                (int(tokens[1]), float(tokens[2])))

    # Create dict {movieID: [genre1, genre2, ...]}
    fill_content_dict("./data/additional_files/movie_genres.data",
                      movie_genres)

    # Create dict {movieID: director}
    fill_content_dict("./data/additional_files/movie_directors.data",
                      movie_directors)

    # Create dict {movieID: [actor1, actor2, ...]}
    fill_content_dict("./data/additional_files/movie_actors.data",
                      movie_actors)

    # Create dict {movieID: [tag1, tag2, ...]}
    fill_content_dict("./data/additional_files/movie_tags.data", movie_tags)

    # Elapsed time here is counted from the start of main, not of this stage.
    print("Done! (%d seconds)\n" % (time() - timeStart))

    # --- Content-based prediction stage ---
    print("Predicting...")
    timePred = time()
    content_predictions = []
    with open("./data/test.csv", "r") as testFile:
        next(testFile, None)  # the first line is a header, not a query
        for row in testFile:
            tokens = word_tokenize(row)
            content_predictions.append(
                contentBased_predict(int(tokens[0]), int(tokens[1])))

    print("Done! (%d seconds)\n" % (time() - timePred))

    # --- Output stage ---
    print("Writing to file...")
    with open("./data/res.data", 'w') as predFile:
        # zip ends with the shorter input; unequal prediction lists would
        # therefore lose trailing rows without any error.
        for p1, p2 in zip(content_predictions, cf_predictions):
            predFile.write("%f\n" % ((float(p1) + float(p2)) / 2))