Example #1
def remove_top_percentile(data):
    counts = np.bincount(data['user_id'])
    q = [50.0, 90.0, 99.0, 99.9, 99.99, 99.999]
    percentiles = np.percentile(counts, q, interpolation='nearest')
    print_flush('Percentiles for number of ratings per user:')
    for a, b in zip(q, percentiles):
        print_flush('  {}: {}'.format(a, b))
    # plt.hist(counts, bins=np.logspace(0., np.log10(np.max(counts)) , 20), normed=1, cumulative=True)
    # plt.gca().set_xscale("log")
    # plt.show()
    max_items_per_user = percentiles[3] + 1  # 99.9%
    return remove_extreme_users(data, 2, max_items_per_user)
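These snippets call a print_flush helper that isn't shown anywhere in the listing; a minimal sketch of what it presumably does (print and flush stdout immediately so progress is visible) is below. remove_extreme_users itself is shown in Example #6.
import sys

def print_flush(*args, **kwargs):
    # print, then flush stdout right away so long-running loops report progress
    print(*args, **kwargs)
    sys.stdout.flush()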
Example #2
def constant_user_length(data, n):
    print_flush('Creating constant user length = {}'.format(n))
    data = remove_extreme_users(data, n, 1000)
    # sort by user_id, but within each user have a random order
    data = np.random.permutation(data)
    data = data[data['user_id'].argsort()]
    user_ids = data['user_id']
    # keep n last ratings from each user
    # due to the random permutation, it's n random ratings from each user
    take_last = n
    shifted_user_ids = np.append(user_ids[take_last:], [-1]*take_last)
    is_last = (user_ids != shifted_user_ids)
    data = data[is_last]
    print_flush('Left with {} ratings'.format(len(data)))
    return data
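A toy illustration, with made-up user_ids, of the shift-and-compare trick used above: comparing user_ids against a copy shifted left by n marks the last n rows of each user. It assumes every user has at least n rows, which the remove_extreme_users call guarantees here.
import numpy as np

user_ids = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2])
n = 2
shifted = np.append(user_ids[n:], [-1] * n)
is_last = user_ids != shifted
# [False  True  True  True  True False False  True  True]
print(is_last)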
Example #3
 @classmethod
 def from_data(cls, data, p_val=0.1, p_test=0.1, give_first=None, take_last=None):
     correct_type = np.dtype([('user_id', '<i4'), ('item_id', '<i4'), ('rating', '<f4'), ('timestamp', '<i8')])
     assert data.dtype == correct_type, data.dtype
     assert p_val >= 0
     assert p_test >= 0
     assert p_val+p_test <= 1
     assert not (give_first and take_last)
     if not give_first and not take_last:
         take_last = 1
     num_users = np.max(data['user_id']) + 1
     num_items = np.max(data['item_id']) + 1
     # we allow users/items without any appearances, so the following assert is incorrect
     # assert len(set(data['user_id'])) == num_users, (len(set(data['user_id'])), num_users)
     # assert len(set(data['item_id'])) == num_items, (len(set(data['item_id'])), num_items)
     sorted_data = np.sort(data, order=['user_id', 'timestamp'])
     sorted_data = sorted_data[['user_id', 'item_id', 'rating']]
     # print some stats
     counts = np.bincount(sorted_data['user_id'])
     nonzero_counts = counts[counts > 0]
     print_flush('Items per user min/mean/max: {}/{:.2f}/{}'.format(
         np.min(nonzero_counts), np.mean(nonzero_counts), np.max(nonzero_counts)
     ))
     # split training/validation:
     # take either a constant number for the training set, or a constant number for the val/test sets,
     # depending on the given options
     # calculate mask for last items of each user
     user_ids = sorted_data['user_id']
     if take_last:
         shifted_user_ids = np.append(user_ids[take_last:], [-1]*take_last)
         is_last = (user_ids != shifted_user_ids)
         assert np.sum(is_last) == len(set(data['user_id'])) * take_last, 'take_last ({}) is bigger than the smallest user'.format(take_last)
     else:
         shifted_user_ids = np.append([-1]*give_first, user_ids[:-give_first])
         is_last = (user_ids == shifted_user_ids)
         assert np.sum(~is_last) == len(set(data['user_id'])) * give_first, 'give_first ({}) is bigger than the smallest user'.format(give_first)
     # p_val go to val, p_test to test
     r = np.random.rand(*is_last.shape)
     is_val   =  is_last & ((0.0 <= r) & (r < p_val))
     is_test  =  is_last & ((p_val <= r) & (r < p_val+p_test))
     is_train = ~is_last | ((p_val+p_test <= r) & (r < 1.0))
     train = sorted_data[is_train]
     val   = sorted_data[is_val]
     test  = sorted_data[is_test]
     assert len(train) + len(val) + len(test) == len(sorted_data)
     return cls(num_users, num_items, train, val, test)
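A small usage sketch with made-up ratings, showing data built with the structured dtype that from_data asserts on; SomeDataset is a hypothetical stand-in for whatever class this classmethod belongs to.
import numpy as np

dtype = np.dtype([('user_id', '<i4'), ('item_id', '<i4'),
                  ('rating', '<f4'), ('timestamp', '<i8')])
data = np.array([(0, 10, 4.0, 100), (0, 11, 3.5, 200),
                 (1, 10, 5.0, 150), (1, 12, 2.0, 300)],
                dtype=dtype)
# dataset = SomeDataset.from_data(data, p_val=0.1, p_test=0.1)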
Example #4
 def __init__(self, data_file, word2id, num_scenes, words_per_scene, max_movies=None, verbose=True):
     if verbose:
         print_flush('Loading subtitles...')
     self.subs = dict()
     with open(data_file) as f:
         for i,line in enumerate(f):
             if verbose and i % 500 == 0:
                 print_flush('{}...'.format(i))
             if i == max_movies:
                 break
             line = line.replace("'", " ' ") # TODO
             words = line.strip().lower().split()
             movie_id = int(words[0])
             text = ' '.join(words[1:])
             encoded_scenes = np.zeros([num_scenes, words_per_scene], dtype=np.uint32)
             # padding has id 0, so we don't need to do anything for it
             scenes = text.split('</scene>')
             unk_id = word2id['<UNK>']
             def id_getter(word):
                 return (word2id[word] if word in word2id else unk_id)
             for j,scene in enumerate(scenes):
                 if j == num_scenes:
                     break
                 words = scene.split()[:words_per_scene]
                 encoded_scenes[j,:len(words)] = list(map(id_getter, words))
             self.subs[movie_id] = encoded_scenes
     if verbose:
         print_flush('Done')
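The subtitle file format isn't shown, but from the parser above each line presumably starts with an integer movie id followed by scene texts separated by the '</scene>' token, and word2id must reserve id 0 for padding and contain '<UNK>' (consistent with Example #7). A hypothetical example of both:
# one hypothetical line of data_file
line = "42 a man walks into a bar </scene> he orders a drink </scene>"
# a hypothetical word2id consistent with the loader above and with Example #7
word2id = {'<PAD>': 0, '<UNK>': 1, 'a': 2, 'man': 3, 'walks': 4}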
Example #5
 def train_batch_iter(self, min_batch_size, num_epochs):
     """
     Generates batches for the training data.
     min_batch_size (and not batch_size), since we want all of each user's datapoints in a single batch
     """
     assert min_batch_size >= 2
     data = self.train
     n = len(data)
     user_ids = data['user_id']
     print_flush('About {} steps per epoch'.format(len(data) // min_batch_size))
     for epoch in range(num_epochs):
         print_flush('Starting epoch {} out of {}'.format(epoch+1, num_epochs))
         next_start = 0
         while next_start < n:
             start = next_start
             end = min(start + min_batch_size, n)
             while end < n:
                 if user_ids[end-1] != user_ids[end]:
                     # seeing a new user
                     break
                 end += 1
             batch = data[start:end]
             next_start = end
             yield self.get_batch(batch)
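A toy run of the boundary logic above, with made-up, already-sorted user_ids: end keeps advancing past min_batch_size until a new user id starts, so one user's rows never straddle two batches.
import numpy as np

user_ids = np.array([0, 0, 1, 1, 1, 2])
min_batch_size, n = 2, len(user_ids)
next_start = 0
while next_start < n:
    start = next_start
    end = min(start + min_batch_size, n)
    while end < n and user_ids[end - 1] == user_ids[end]:
        end += 1
    print(user_ids[start:end])  # [0 0], then [1 1 1], then [2]
    next_start = end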
Example #6
def remove_extreme_users(data, min_ratings, max_ratings):
    print_flush('Removing all users with fewer than {} or more than {} ratings'.format(min_ratings, max_ratings))
    counts = np.bincount(data['user_id'])
    too_much   = set(np.flatnonzero(counts > max_ratings))
    too_little = set(np.flatnonzero(counts < min_ratings))
    zero       = set(np.flatnonzero(counts == 0))
    bad_user = (too_much | too_little) - zero
    print_flush('Removing {} users'.format(len(bad_user)))
    data = np.array([x for x in data if x['user_id'] not in bad_user])
    print_flush('Left with {} ratings'.format(len(data)))
    return data
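The per-row list comprehension above does Python-level work for every rating and can be slow on large arrays; a sketch of an equivalent vectorized filter (same inputs and semantics assumed) is:
import numpy as np

def remove_extreme_users_vectorized(data, min_ratings, max_ratings):
    counts = np.bincount(data['user_id'])
    ratings_per_row = counts[data['user_id']]  # each row's user rating count
    keep = (ratings_per_row >= min_ratings) & (ratings_per_row <= max_ratings)
    return data[keep]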
Example #7
 def __init__(self, path, max_vocab_size=None, verbose=True):
     if verbose:
         print_flush('Loading word embedding...')
     self.vocab = IdAssigner()
     with gzip.open(path, 'rb') as f:
         word_count, dim = map(int, f.readline().split())
         # +2 for the "padding" and "unknown" words
         word_count += 2
         if max_vocab_size:
             word_count = min(word_count, max_vocab_size)
         self.dimension = dim
         self.embedding = np.zeros([word_count, dim], dtype=np.float32)
         # First word is "padding"
         pad_id = self.vocab.get_id('<PAD>')
         assert pad_id == 0
         # default in self.embedding is already zeros
         # self.embedding[pad] = np.zeros([dim], dtype=np.float32)
         # Second word is "out of vocabulary"
         unk_id = self.vocab.get_id('<UNK>')
         assert unk_id == 1
         # default in self.embedding is already zeros
         # self.embedding[unk_id] = np.zeros([dim], dtype=np.float32)
         if verbose:
             print_flush(
                 'Loading {} words with embedding dimension {}'.format(
                     word_count, dim))
         format_string = 'f' * dim
         sz = struct.calcsize(format_string)
         i = self.vocab.get_next_id()
         while True:
             # read the next space-terminated word; an empty read means EOF
             encoded = b''
             c = f.read(1)
             while c and c != b' ':
                 encoded += c
                 c = f.read(1)
             if not c:
                 # reached end of file
                 break
             try:
                 word = encoded.decode('utf-8')
             except UnicodeDecodeError:
                 if verbose:
                     print_flush(
                         'Error decoding this sequence: {} (skipping).'.
                         format(encoded))
                 # skip this entry's vector as well; otherwise `word` below would
                 # be stale from the previous iteration (or undefined on the first)
                 f.seek(sz, 1)
                 continue
             if '_' in word:
                 # only load single words, not phrases
                 f.seek(sz, 1)
                 continue
             word_lower = word.lower()
             if word_lower in self.vocab.forward:
                 # already saw this word
                 f.seek(sz, 1)
                 continue
             word_id = self.vocab.get_id(word_lower)
             val = np.array(struct.unpack(format_string, f.read(sz)),
                            dtype=np.float32)
             self.embedding[word_id] = val
             if verbose and i % 10000 == 0:
                 print_flush('{}... ({})'.format(i, word_lower))
             i += 1
             if i == max_vocab_size:
                 break
         #assert f.read() == ''
     if verbose:
         print_flush('Done. Loaded {} words in total'.format(i))
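For reference, a hedged sketch of the gzipped binary layout this loader appears to expect: a text header line 'word_count dim', then for each word its UTF-8 bytes terminated by a single space, immediately followed by dim packed float32 values with no separator.
import gzip
import struct

def write_tiny_embedding(path, vectors):
    # vectors: dict mapping word -> list of floats, all of the same length
    dim = len(next(iter(vectors.values())))
    with gzip.open(path, 'wb') as f:
        f.write('{} {}\n'.format(len(vectors), dim).encode())
        for word, vec in vectors.items():
            f.write(word.encode('utf-8') + b' ')
            f.write(struct.pack('f' * dim, *vec))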