import json
import math
import os
import random

import cv2
import numpy as np
# Assumed import path for the IGDB API v2 python wrapper used below;
# adjust to whichever wrapper the repo actually depends on.
from igdb_api_python.igdb import igdb
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

import cfg  # project-local helpers: configPath(), databasePath(), readConfig()


def computeHistograms():
    """Calculate color histograms for all covers."""
    imgPath = cfg.databasePath() + '/Covers'
    config = cfg.readConfig()
    hBins = config.getint('Images', 'h_bins')
    sBins = config.getint('Images', 's_bins')
    with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
        sets = json.load(setsFile)
    coverIds = {int(x.split('.')[0]) for x in os.listdir(imgPath)}
    for curSet in sets:
        # Only keep ids for which a cover image actually exists.
        idList = [x for x in sets[curSet] if x in coverIds]
        data = np.zeros((len(idList), hBins * sBins + 1))
        data[:, 0] = idList  # first column holds the game id
        for i in range(len(idList)):
            coverFilename = imgPath + '/{}.jpg'.format(idList[i])
            img = cv2.imread(coverFilename, cv2.IMREAD_COLOR)
            if img is None or img.shape[2] != 3:
                print('Skipping ' + str(idList[i]))
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            for j in range(sBins):
                # Saturation band j covers [minS, maxS); >= keeps pixels
                # that fall exactly on a bin edge (e.g. saturation 0).
                minS = j * 256 / sBins
                maxS = (j + 1) * 256 / sBins
                relPix = img[np.logical_and(img[:, :, 1] >= minS,
                                            img[:, :, 1] < maxS)]
                # Fix the hue range explicitly (OpenCV 8-bit hue is 0..179)
                # so every image shares identical bin edges.
                data[i, 1+hBins*j:1+hBins*(j+1)], _ = \
                    np.histogram(relPix[:, 0], bins=hBins, range=(0, 180))
        # Normalize each histogram to sum to one and drop NaN rows
        # (covers that were skipped or had no pixels counted).
        data[:, 1:] = data[:, 1:] / np.sum(data[:, 1:], axis=1)[:, np.newaxis]
        data = data[~np.isnan(data).any(axis=1)]
        np.savetxt(cfg.databasePath() + '/{}_hist.csv'.format(curSet), data)
def buildBagOfWords():
    """Transform image features to a BoW vector."""
    featPath = cfg.databasePath() + '/Features'
    config = cfg.readConfig()
    k = config.getint('Images', 'n_clusters')
    centers = np.loadtxt(cfg.databasePath() + '/Centers.csv')
    # Assign every local feature to its nearest cluster center.
    searcher = NearestNeighbors(n_neighbors=1, n_jobs=-1)
    searcher.fit(centers)
    with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
        sets = json.load(setsFile)
    for curSet in sets:
        if os.path.exists(cfg.databasePath() + '/{}_img.csv'.format(curSet)):
            print('Skipping {} set.'.format(curSet))
            continue
        # Only keep games whose features were extracted.
        idList = [
            x for x in sets[curSet]
            if os.path.exists(featPath + '/{}.npy'.format(x))
        ]
        data = np.zeros((len(idList), k + 1))
        data[:, 0] = idList
        for i in range(len(idList)):
            features = np.load(featPath + '/{}.npy'.format(idList[i]))
            assignments = searcher.kneighbors(features, return_distance=False)
            # Count how many features fall into each visual word.
            for item in assignments:
                data[i, item + 1] += 1
        # Normalize counts to relative frequencies.
        data[:, 1:] = data[:, 1:] / np.sum(data[:, 1:], axis=1)[:, np.newaxis]
        np.savetxt(cfg.databasePath() + '/{}_img.csv'.format(curSet), data)
def downloadSwitch():
    """Download games for the Nintendo Switch platform."""
    if not os.path.exists(cfg.databasePath() + '/Games'):
        os.makedirs(cfg.databasePath() + '/Games')
    config = cfg.readConfig()
    fields = config['Database']['fields'].split(',')
    api = igdb(config['Database']['api_key'])

    def saveGame(game):
        filename = cfg.databasePath() + '/Games/{}.json'.format(game['id'])
        with open(filename, 'w') as outFile:
            json.dump(game, outFile, indent='\t')

    # 130 is the IGDB platform id for the Nintendo Switch.
    res = api.games({
        'fields': fields,
        'filters': {'[release_dates.platform][any]': 130},
        'scroll': 1,
        'limit': 50
    })
    for game in res.body:
        saveGame(game)
    # X-Count is the total number of results. One page of 50 is already
    # saved above, so ceil(count / 50) scrolls is at most one too many;
    # a scroll past the end returns a non-list body and is skipped below.
    nPages = math.ceil(int(res.headers['X-Count']) / 50)
    for _ in range(nPages):
        scrolled = api.scroll(res)
        if type(scrolled.body) is list:
            for game in scrolled.body:
                saveGame(game)
def splitDatabase():
    """Split database into train, validation and test sets."""
    config = cfg.readConfig()
    dataPath = cfg.databasePath() + '/Games'
    idList = []
    for file in os.listdir(dataPath):
        if os.path.isfile(dataPath + '/' + file):
            idList.append(int(file.split('.')[0]))
    trainSize = int(config.getfloat('Database', 'train_size') * len(idList))
    validSize = int(config.getfloat('Database', 'valid_size') * len(idList))
    sets = {}
    sets['train'] = random.sample(idList, trainSize)
    # Use sets for the membership tests so the split stays O(n).
    trainIds = set(sets['train'])
    idList = [x for x in idList if x not in trainIds]
    sets['valid'] = random.sample(idList, validSize)
    validIds = set(sets['valid'])
    sets['test'] = [x for x in idList if x not in validIds]
    with open(cfg.databasePath() + '/Sets.json', 'w') as outFile:
        json.dump(sets, outFile)
def clusterFeatures():
    """Perform clustering on the extracted features."""
    featPath = cfg.databasePath() + '/Features'
    with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
        idList = json.load(setsFile)['train']
    arrays = (np.load(featPath + '/' + str(gameId) + '.npy', allow_pickle=False)
              for gameId in idList
              if os.path.exists(featPath + '/' + str(gameId) + '.npy'))
    features = np.vstack(arrays)
    print(features.shape)
    print('Loading complete')
    config = cfg.readConfig()
    k = config.getint('Images', 'n_clusters')
    model = MiniBatchKMeans(n_clusters=k, batch_size=50000, verbose=True,
                            compute_labels=False)
    model.fit(features)
    np.savetxt(cfg.databasePath() + '/Centers.csv', model.cluster_centers_)
def encodeData():
    """Encode game metadata."""
    config = cfg.readConfig()
    esrbCode = config.get('Preprocessing', 'esrb').split(',')
    modesCode = [int(x) for x in config.get('Preprocessing', 'game_modes').split(',')]
    genresCode = [int(x) for x in config.get('Preprocessing', 'genres').split(',')]
    themesCode = [int(x) for x in config.get('Preprocessing', 'themes').split(',')]
    dim2 = sum([len(esrbCode), len(modesCode), len(genresCode), len(themesCode)])
    with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
        sets = json.load(setsFile)
    for curSet in sets:  # renamed from `set`, which shadowed the builtin
        idList = sets[curSet]
        data = np.zeros((len(idList), dim2 + 1), dtype=int)
        data[:, 0] = idList
        y = np.zeros((len(idList), 2))
        y[:, 0] = idList
        for i in range(len(idList)):
            p = 1  # column cursor; column 0 holds the game id
            gamePath = cfg.databasePath() + '/Games/{}.json'.format(idList[i])
            with open(gamePath, 'r') as gameFile:
                gameData = json.load(gameFile)
            # One-hot encode the ESRB rating (defaults to rating code 1 when
            # absent). `j` avoids shadowing the outer loop index.
            esrb = gameData['esrb']['rating'] if 'esrb' in gameData else 1
            data[i, p:p+len(esrbCode)] = \
                [1 if esrb - 1 == j else 0 for j in range(len(esrbCode))]
            p += len(esrbCode)
            # Multi-hot encode game modes, genres and themes.
            modes = gameData['game_modes'] if 'game_modes' in gameData else []
            data[i, p:p+len(modesCode)] = encodeMultiLabel(modes, modesCode)
            p += len(modesCode)
            genres = gameData['genres'] if 'genres' in gameData else []
            data[i, p:p+len(genresCode)] = encodeMultiLabel(genres, genresCode)
            p += len(genresCode)
            themes = gameData['themes'] if 'themes' in gameData else []
            data[i, p:p+len(themesCode)] = encodeMultiLabel(themes, themesCode)
            y[i, 1] = gameData['aggregated_rating']
        np.savetxt(cfg.databasePath() + '/{}_data.csv'.format(curSet), data, fmt='%d')
        np.savetxt(cfg.databasePath() + '/{}_y.csv'.format(curSet), y, fmt='%.2f')
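# encodeData() calls encodeMultiLabel, which is not defined in this section.
# The definition below is a hypothetical reconstruction based solely on how
# it is used above (it must return a 0/1 vector of length len(codes)); if
# the repo already defines it elsewhere, keep that version instead.
def encodeMultiLabel(labels, codes):
    """Multi-hot encode `labels` against the ordered code list `codes`."""
    return [1 if code in labels else 0 for code in codes]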
def vectorizeSummaries():
    """Create word count matrix from game summaries."""
    with open(cfg.configPath() + '/Vocabulary.txt', 'r') as inFile:
        vocab = [x.rstrip() for x in inFile]
    config = cfg.readConfig()
    params = config['Text']
    vectorizer = CountVectorizer(**params)
    vectorizer.set_params(vocabulary=vocab)
    with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
        sets = json.load(setsFile)

    def loadSummaries(setName):
        """Collect ids and summaries of all games in a set that have one."""
        ids = []
        summaries = []
        for gameId in sets[setName]:
            gamePath = cfg.databasePath() + '/Games/{}.json'.format(gameId)
            with open(gamePath, 'r') as inFile:
                gameData = json.load(inFile)
            if 'summary' in gameData:
                ids.append(gameId)
                summaries.append(gameData['summary'])
        return ids, summaries

    # Fit on the training summaries only (the vocabulary is fixed anyway),
    # then transform every set with the same vectorizer.
    vectorizer.fit(loadSummaries('train')[1])
    for setName in ['train', 'valid', 'test', 'rank_train', 'rank_test']:
        ids, summaries = loadSummaries(setName)
        counts = np.zeros((len(ids), 1 + len(vectorizer.get_feature_names())))
        counts[:, 0] = ids
        counts[:, 1:] = vectorizer.transform(summaries).todense()
        np.savetxt(cfg.databasePath() + '/{}_text.csv'.format(setName),
                   counts, fmt='%d')
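# Every function above leans on the project-local `cfg` module. A minimal
# sketch of the assumed interface is given below for reference: readConfig()
# must return a standard ConfigParser (the code calls .getint/.getfloat and
# indexes sections on it), and the two path helpers return plain strings.
# The file names and locations here are illustrative assumptions, not the
# repo's actual layout, so the sketch is left commented out.
#
#   import configparser
#   import os
#
#   def configPath():
#       return os.path.dirname(os.path.abspath(__file__)) + '/config'
#
#   def databasePath():
#       return readConfig()['Database']['path']
#
#   def readConfig():
#       config = configparser.ConfigParser()
#       config.read(configPath() + '/config.ini')
#       return config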