def remove_top_percentile(data):
    counts = np.bincount(data['user_id'])
    q = [50.0, 90.0, 99.0, 99.9, 99.99, 99.999]
    percentiles = np.percentile(counts, q, interpolation='nearest')
    print_flush('Percentiles for number of ratings per user:')
    for a, b in zip(q, percentiles):
        print_flush('  {}: {}'.format(a, b))
    # plt.hist(counts, bins=np.logspace(0., np.log10(np.max(counts)), 20), normed=1, cumulative=True)
    # plt.gca().set_xscale("log")
    # plt.show()
    max_items_per_user = percentiles[3] + 1  # 99.9%
    return remove_extreme_users(data, 2, max_items_per_user)
def constant_user_length(data, n):
    print_flush('Creating constant user length = {}'.format(n))
    data = remove_extreme_users(data, n, 1000)
    # sort by user_id, but within each user have a random order
    data = np.random.permutation(data)
    data = data[data['user_id'].argsort()]
    user_ids = data['user_id']
    # keep n last ratings from each user;
    # due to the random permutation, it's n random ratings from each user
    take_last = n
    shifted_user_ids = np.append(user_ids[take_last:], [-1] * take_last)
    is_last = (user_ids != shifted_user_ids)
    data = data[is_last]
    print_flush('Left with {} ratings'.format(len(data)))
    return data
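
# Illustrative sketch (not part of the original pipeline): the "shifted user ids"
# trick used above builds a boolean mask selecting the last `n` rows of each user
# in a user-sorted array. The helper name and toy ids below are hypothetical.
def _example_last_n_mask():
    user_ids = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2])
    n = 2
    # shift the ids forward by n; wherever the shifted id differs, the row is one
    # of the last n rows of its user
    shifted = np.append(user_ids[n:], [-1] * n)
    mask = (user_ids != shifted)
    # mask == [False, True, True, True, True, False, False, True, True]
    return mask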
def from_data(cls, data, p_val=0.1, p_test=0.1, give_first=None, take_last=None):
    correct_type = np.dtype([('user_id', '<i4'), ('item_id', '<i4'),
                             ('rating', '<f4'), ('timestamp', '<i8')])
    assert data.dtype == correct_type, data.dtype
    assert p_val >= 0
    assert p_test >= 0
    assert p_val + p_test <= 1
    assert not (give_first and take_last)
    if not give_first and not take_last:
        take_last = 1
    num_users = np.max(data['user_id']) + 1
    num_items = np.max(data['item_id']) + 1
    # we allow users/items without any appearances, so the following asserts are incorrect
    # assert len(set(data['user_id'])) == num_users, (len(set(data['user_id'])), num_users)
    # assert len(set(data['item_id'])) == num_items, (len(set(data['item_id'])), num_items)
    sorted_data = np.sort(data, order=['user_id', 'timestamp'])
    sorted_data = sorted_data[['user_id', 'item_id', 'rating']]
    # print some stats
    counts = np.bincount(sorted_data['user_id'])
    nonzero_counts = counts[counts > 0]
    print_flush('Items per user min/mean/max: {}/{:.2f}/{}'.format(
        np.min(nonzero_counts), np.mean(nonzero_counts), np.max(nonzero_counts)))
    # split training/validation/test:
    # take either a constant number of first ratings for the training set (give_first),
    # or a constant number of last ratings for the val/test sets (take_last),
    # depending on the given options.
    # calculate a mask for the last items of each user
    user_ids = sorted_data['user_id']
    if take_last:
        shifted_user_ids = np.append(user_ids[take_last:], [-1] * take_last)
        is_last = (user_ids != shifted_user_ids)
        assert np.sum(is_last) == len(set(data['user_id'])) * take_last, \
            'take_last ({}) is bigger than the smallest user'.format(take_last)
    else:
        shifted_user_ids = np.append([-1] * give_first, user_ids[:-give_first])
        is_last = (user_ids == shifted_user_ids)
        assert np.sum(~is_last) == len(set(data['user_id'])) * give_first, \
            'give_first ({}) is bigger than the smallest user'.format(give_first)
    # a fraction p_val of the candidate ratings goes to val, p_test to test
    r = np.random.rand(*is_last.shape)
    is_val = is_last & ((0.0 <= r) & (r < p_val))
    is_test = is_last & ((p_val <= r) & (r < p_val + p_test))
    is_train = ~is_last | ((p_val + p_test <= r) & (r < 1.0))
    train = sorted_data[is_train]
    val = sorted_data[is_val]
    test = sorted_data[is_test]
    assert len(train) + len(val) + len(test) == len(sorted_data)
    return cls(num_users, num_items, train, val, test)
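
# Illustrative sketch (not part of the original code): how the split masks in
# from_data interact. Rows outside a user's "last" set always stay in train; among
# the "last" rows, roughly a fraction p_val goes to validation and p_test to test,
# with the remainder returned to train. The helper name is hypothetical.
def _example_split_masks(is_last, p_val=0.1, p_test=0.1):
    r = np.random.rand(*is_last.shape)
    is_val = is_last & (r < p_val)
    is_test = is_last & ((p_val <= r) & (r < p_val + p_test))
    is_train = ~is_last | (p_val + p_test <= r)
    # every row lands in exactly one of the three sets
    assert np.all(is_train.astype(int) + is_val.astype(int) + is_test.astype(int) == 1)
    return is_train, is_val, is_test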
def __init__(self, data_file, word2id, num_scenes, words_per_scene, max_movies=None, verbose=True):
    if verbose:
        print_flush('Loading subtitles...')
    self.subs = dict()
    with open(data_file) as f:
        for i, line in enumerate(f):
            if verbose and i % 500 == 0:
                print_flush('{}...'.format(i))
            if i == max_movies:
                break
            line = line.replace("'", " ' ")  # TODO
            words = line.strip().lower().split()
            movie_id = int(words[0])
            text = ' '.join(words[1:])
            encoded_scenes = np.zeros([num_scenes, words_per_scene], dtype=np.uint32)
            # padding has id 0, so we don't need to do anything for it
            scenes = text.split('</scene>')
            unk_id = word2id['<UNK>']

            def id_getter(word):
                return word2id[word] if word in word2id else unk_id

            for j, scene in enumerate(scenes):
                if j == num_scenes:
                    break
                words = scene.split()[:words_per_scene]
                encoded_scenes[j, :len(words)] = list(map(id_getter, words))
            self.subs[movie_id] = encoded_scenes
    if verbose:
        print_flush('Done')
def train_batch_iter(self, min_batch_size, num_epochs):
    """
    Generates batches for the training data.
    min_batch_size (and not batch_size), since we want all of each user's
    datapoints in a single batch.
    """
    assert min_batch_size >= 2
    data = self.train
    n = len(data)
    user_ids = data['user_id']
    print_flush('About {} steps per epoch'.format(len(data) // min_batch_size))
    for epoch in range(num_epochs):
        print_flush('Starting epoch {} out of {}'.format(epoch + 1, num_epochs))
        next_start = 0
        while next_start < n:
            start = next_start
            end = min(start + min_batch_size, n)
            while end < n:
                if user_ids[end - 1] != user_ids[end]:
                    # seeing a new user
                    break
                end += 1
            batch = data[start:end]
            next_start = end
            yield self.get_batch(batch)
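
# Illustrative sketch (not part of the original code): train_batch_iter grows each
# batch past min_batch_size until it reaches a user boundary, so a user's ratings
# are never split across batches. The helper name and toy ids are hypothetical.
def _example_batch_boundaries(user_ids, min_batch_size):
    n = len(user_ids)
    boundaries = []
    next_start = 0
    while next_start < n:
        start = next_start
        end = min(start + min_batch_size, n)
        # keep extending while the next row still belongs to the same user
        while end < n and user_ids[end - 1] == user_ids[end]:
            end += 1
        boundaries.append((start, end))
        next_start = end
    return boundaries

# e.g. _example_batch_boundaries(np.array([0, 0, 0, 1, 1, 2]), min_batch_size=2)
# would return [(0, 3), (3, 5), (5, 6)]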
def remove_extreme_users(data, min_ratings, max_ratings):
    print_flush('Removing all users with less than {} or more than {} ratings'.format(min_ratings, max_ratings))
    counts = np.bincount(data['user_id'])
    too_much = set(np.flatnonzero(counts > max_ratings))
    too_little = set(np.flatnonzero(counts < min_ratings))
    zero = set(np.flatnonzero(counts == 0))
    bad_user = (too_much | too_little) - zero
    print_flush('Removing {} users'.format(len(bad_user)))
    data = np.array([x for x in data if x['user_id'] not in bad_user])
    print_flush('Left with {} ratings'.format(len(data)))
    return data
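
# Illustrative sketch (not part of the original code): the per-row Python loop in
# remove_extreme_users can also be written with np.isin as a single vectorized
# filter. Same result, different formulation; the helper name is hypothetical.
def _example_filter_users_vectorized(data, min_ratings, max_ratings):
    counts = np.bincount(data['user_id'])
    # user ids whose rating count is out of range (users with zero ratings have no
    # rows to remove, so they are excluded, matching the original behaviour)
    bad = np.flatnonzero(((counts < min_ratings) | (counts > max_ratings)) & (counts > 0))
    return data[~np.isin(data['user_id'], bad)]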
def __init__(self, path, max_vocab_size=None, verbose=True):
    if verbose:
        print_flush('Loading word embedding...')
    self.vocab = IdAssigner()
    with gzip.open(path, 'rb') as f:
        word_count, dim = map(int, f.readline().split())
        # +2 for the "padding" and "unknown" words
        word_count += 2
        if max_vocab_size:
            word_count = min(word_count, max_vocab_size)
        self.dimension = dim
        self.embedding = np.zeros([word_count, dim], dtype=np.float32)
        # First word is "padding"
        pad_id = self.vocab.get_id('<PAD>')
        assert pad_id == 0
        # default in self.embedding is already zeros
        # self.embedding[pad_id] = np.zeros([dim], dtype=np.float32)
        # Second word is "out of vocabulary"
        unk_id = self.vocab.get_id('<UNK>')
        assert unk_id == 1
        # default in self.embedding is already zeros
        # self.embedding[unk_id] = np.zeros([dim], dtype=np.float32)
        if verbose:
            print_flush('Loading {} words with embedding dimension {}'.format(word_count, dim))
        format_string = 'f' * dim
        sz = struct.calcsize(format_string)
        i = self.vocab.get_next_id()
        while True:
            # read the word byte by byte, up to the separating space
            encoded = b''
            c = f.read(1)
            if not c:
                # end of file
                break
            while c != ' '.encode():
                encoded += c
                c = f.read(1)
            try:
                word = encoded.decode('utf-8')
            except UnicodeDecodeError:
                if verbose:
                    print_flush('Error decoding this sequence: {} (skipping).'.format(encoded))
                # skip this entry's vector and move on to the next word
                f.seek(sz, 1)
                continue
            if '_' in word:
                # only load single words, not phrases
                f.seek(sz, 1)
                continue
            word_lower = word.lower()
            if word_lower in self.vocab.forward:
                # already saw this word
                f.seek(sz, 1)
                continue
            word_id = self.vocab.get_id(word_lower)
            val = np.array(struct.unpack(format_string, f.read(sz)), dtype=np.float32)
            self.embedding[word_id] = val
            if verbose and i % 10000 == 0:
                print_flush('{}... ({})'.format(i, word_lower))
            i += 1
            if i == max_vocab_size:
                break
        # assert f.read() == ''
    if verbose:
        print_flush('Done. Loaded {} words in total'.format(i))
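
# Illustrative sketch (not part of the original loader): the embedding file read
# above appears to follow a word2vec-style binary layout, gzipped -- a text header
# line "word_count dim", then for each entry the utf-8 word, a single space, and
# dim packed float32 values. The round trip below is hypothetical and only meant
# to show that layout.
def _example_embedding_record_roundtrip():
    dim = 3
    vec = np.array([0.1, 0.2, 0.3], dtype=np.float32)
    record = 'hello'.encode('utf-8') + b' ' + struct.pack('f' * dim, *vec)
    # the loader scans byte by byte up to the separating space, then reads exactly
    # struct.calcsize('f' * dim) bytes for the vector
    sep = record.index(b' ')
    word = record[:sep].decode('utf-8')
    decoded = np.array(struct.unpack('f' * dim, record[sep + 1:]), dtype=np.float32)
    assert word == 'hello'
    assert np.allclose(decoded, vec)
    return word, decoded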