def __init__(self, dataFormat, user_num, item_num, ratingfile, itemfile=None, userfile=None, home_dir="./"):
    """Set up the parser, the working directory and empty data holders.

    dataFormat selects the parser: "movielens" or "amazon"; anything else
    raises ValueError.  user_num and item_num are stored as USER_NUM and
    ITEM_NUM.  home_dir is created when it does not exist yet.
    """
    # Choose the parser first, so an unknown format fails before any
    # directory is created on disk.
    if dataFormat == "amazon":
        self.parser = AmazonParser(ratingfile, itemfile, userfile)
    elif dataFormat == "movielens":
        self.parser = MLParser(ratingfile, itemfile, userfile)
    else:
        raise ValueError("Valid dataFormat: movielens, amazon")

    # The counts have to be supplied by the caller: they are not obtained
    # from recovered data directly.
    self.USER_NUM, self.ITEM_NUM = user_num, item_num

    self.home_dir = home_dir
    home_dir_missing = not os.path.exists(self.home_dir)
    if home_dir_missing:
        os.makedirs(self.home_dir)

    # Remember where the raw inputs came from.
    self.ratingfile = ratingfile
    self.itemfile = itemfile
    self.userfile = userfile

    # All underlying data.
    self.rating_list = []
    self.attribute_list = []

    # Substitutable training and testing data.
    self.train_matrix = None
    self.train_item_id_of_col = None
    self.test_matrix = None
    self.test_item_id_of_col = None
class Database(object):
    """Rating data holder that builds item-split train/test matrices.

    The raw ratings and item attributes come from a format-specific parser
    (``MLParser`` or ``AmazonParser``).  Ratings are split into a training
    and a testing matrix by item, and either matrix can be written out in
    libFM text format.

    All file names passed to the public methods are resolved inside
    ``home_dir``.
    """

    def __init__(self, dataFormat, user_num, item_num, ratingfile, itemfile=None, userfile=None, home_dir="./"):
        """Create the parser, the working directory and empty data holders.

        Args:
            dataFormat: "movielens" or "amazon"; selects the parser class.
            user_num: number of rows of the rating matrices (USER_NUM).
            item_num: upper bound on the number of items (ITEM_NUM).
            ratingfile, itemfile, userfile: passed through to the parser.
            home_dir: working directory; created if it does not exist.

        Raises:
            ValueError: if dataFormat is not one of the supported names.
        """
        if dataFormat == "movielens":
            self.parser = MLParser(ratingfile, itemfile, userfile)
        elif dataFormat == "amazon":
            self.parser = AmazonParser(ratingfile, itemfile, userfile)
        else:
            raise ValueError("Valid dataFormat: movielens, amazon")

        # nums must be explicitly assigned, since we won't get it from
        # recovered data directly
        self.USER_NUM = user_num
        self.ITEM_NUM = item_num

        self.home_dir = home_dir
        if not os.path.exists(self.home_dir):
            os.makedirs(self.home_dir)

        self.itemfile = itemfile
        self.ratingfile = ratingfile
        self.userfile = userfile

        # holding all underlying data
        self.rating_list = []
        self.attribute_list = []

        # holding substitutable training and testing data
        self.train_matrix = None
        self.train_item_id_of_col = None
        self.test_matrix = None
        self.test_item_id_of_col = None

    def _path_in_home(self, name):
        """Return *name* resolved inside ``home_dir``.

        ``os.path.join`` gives the same result as plain concatenation when
        ``home_dir`` ends with a separator (the default "./"), and still
        lands inside the directory when it does not.
        """
        return os.path.join(self.home_dir, name)

    def load_data(self, sort_data=True, dump_file='dumpable.data'):
        """Fill ``rating_list`` from a dump, or from the parser as a fallback.

        The dump is tried first; when recovering it fails for any ordinary
        reason the raw data is parsed instead and both ``attribute_list``
        and ``rating_list`` are extracted from the parser.

        NOTE(review): a successful recovery restores ``rating_list`` only;
        ``attribute_list`` then stays as it was -- confirm this is intended.
        """
        try:
            self._recover_dumpable_data(self._path_in_home(dump_file))
        except Exception:
            # Deliberate best-effort fallback.  ``Exception`` (rather than a
            # bare except) lets KeyboardInterrupt / SystemExit propagate.
            self.parser.load_raw_data()
            self.attribute_list = self.parser.extract_attr_list()
            self.rating_list = self.parser.extract_rating_list(sort_data)
            # XXX: seems not necessary to dump the rating list here
            # (see _dump_rating_list).

    def make_train_test_matrix(self, train_test_ratio=0.8):
        """Build the train and test matrices by splitting on items.

        Items are numbered in order of first appearance in ``rating_list``;
        the first ``int(ITEM_NUM * train_test_ratio)`` of them go to the
        training matrix, the remaining ones to the testing matrix.
        """
        # XXX: merge into load_data()?
        split = int(self.ITEM_NUM * train_test_ratio)
        self.train_matrix, self.train_item_id_of_col, _ = \
            self._extract_rating_matrix(self._filtered_rating_list(0, split))
        self.test_matrix, self.test_item_id_of_col, _ = \
            self._extract_rating_matrix(self._filtered_rating_list(split, self.ITEM_NUM))

    def dump_libfm_data(self, train_file=None, test_file=None, add_negative=False, binary=False, shuffle=True, omit_item=True):
        """Write the train and/or test matrix in libFM format.

        Each output line is ``rating user [item] attributes...``; the item
        feature is left out when ``omit_item`` is true.

        Args:
            train_file, test_file: file names inside ``home_dir``; a file is
                written only when its name is not None.
            add_negative: add negative entries to copies of the matrices
                before dumping (sampled for train, every empty cell for
                test).
            binary: write every positive rating as 1.
            shuffle: write the lines in random order.
            omit_item: leave out the item feature.
        """
        if add_negative:
            # XXX: do not actually addAllUsers and just output it to save time?
            train_matrix = self._add_neg_to_matrix(self.train_matrix.copy())
            test_matrix = self._add_neg_to_matrix(self.test_matrix.copy(), addAllUsers=True)
        else:
            train_matrix = self.train_matrix
            test_matrix = self.test_matrix

        if train_file is not None:
            self._dump_matrix_to_libfm_file(
                self._path_in_home(train_file), train_matrix,
                self.train_item_id_of_col, binary, shuffle, omit_item)
        if test_file is not None:
            self._dump_matrix_to_libfm_file(
                self._path_in_home(test_file), test_matrix,
                self.test_item_id_of_col, binary, shuffle, omit_item)

    def load_pred_list(self, pred_file, task, threshold=0.5):
        """Pair the lines of a prediction file with the test-matrix entries.

        One line is read per non-zero entry of ``test_matrix``, in the order
        given by ``test_matrix.nonzero()``.

        NOTE(review): this pairing only lines up with a test file that was
        dumped in that same order (no shuffling, no added negatives) --
        confirm against the caller.

        Args:
            pred_file: file name inside ``home_dir``.
            task: 'c' (classification) or 'r' (regression).
            threshold: for 'c', a prediction is True when its numeric value
                is greater than this.

        Returns:
            A list of ``[row, col, pred]``.  ``pred`` is a bool for 'c' and
            the stripped text of the line for 'r'.

        Raises:
            ValueError: if task is invalid, or (for 'c') a line is not a
                number, e.g. because the file has too few lines.
        """
        # TODO: merge accuracy and regression error calculation inside.
        if task != 'c' and task != 'r':
            raise ValueError("task should be r(regression) or c(classification)")

        row, col = self.test_matrix.nonzero()
        result = []
        with open(self._path_in_home(pred_file), 'r') as pred_lines:
            for user_row, item_col in zip(row, col):
                pred = pred_lines.readline().strip()
                if task == 'c':
                    # Compare numerically: comparing the raw string with a
                    # float is always True on Python 2.
                    pred = float(pred) > threshold
                result.append([user_row, item_col, pred])
        return result

    def _recover_dumpable_data(self, filename):
        """Load ``rating_list`` from a pickle file."""
        # NOTE: unpickling can execute arbitrary code; only load dump files
        # that this program wrote itself.
        with open(filename, mode='rb') as dump:
            self.rating_list = cPickle.load(dump)

    def _dump_rating_list(self, filename):
        """Pickle ``rating_list`` to *filename* (protocol 2)."""
        # XXX: seems not necessary
        with open(filename, 'wb') as dump:
            cPickle.dump(self.rating_list, dump, protocol=2)

    def _add_neg_to_matrix(self, matrix, addAllUsers=False):
        """Mark empty cells of *matrix* as negative ratings, in place.

        Negative ratings are stored as -1 just for storing; they are written
        as 0 when the matrix is dumped.

        Args:
            matrix: sparse rating matrix supporting item assignment.
            addAllUsers: when true every empty cell becomes negative;
                otherwise as many randomly chosen empty cells as there are
                stored ratings (or all empty cells, if there are fewer).

        Returns:
            The same matrix object.
        """
        height, width = matrix.shape
        count = 0
        if addAllUsers:
            for i in range(height):
                for j in range(width):
                    if matrix[i, j] == 0:
                        count += 1
                        matrix[i, j] = -1
                        if count % 20000 == 0:
                            print("{0} negative rating added...".format(count))
        else:
            # Never ask for more negatives than there are empty cells,
            # otherwise the rejection sampling below cannot terminate.
            empty_cells = height * width - matrix.nnz
            target = min(matrix.nnz, empty_cells)
            while count < target:
                i = random.randint(0, height - 1)
                j = random.randint(0, width - 1)
                if matrix[i, j] == 0:
                    count += 1
                    matrix[i, j] = -1
                    if count % 20000 == 0:
                        print("{0} negative rating added...".format(count))
        print("{0} negative rating added in total.".format(count))
        return matrix

    def _dump_matrix_to_libfm_file(self, filename, matrix, item_id_of_col, binary, shuffle=True, omit_item=False):
        """Write every non-zero entry of *matrix* as one libFM line.

        Feature indices are laid out as: the user row index, then (unless
        ``omit_item``) the item column offset by the number of rows, then
        the item's attribute positions offset by everything before them.

        NOTE(review): with ``omit_item`` false the attribute offset includes
        the width of *this* matrix, so two matrices of different width get
        different attribute indices -- confirm this is acceptable.

        Args:
            filename: full path of the output file.
            matrix: sparse rating matrix; negative values are written as 0.
            item_id_of_col: item ID for each matrix column, used to look up
                ``attribute_list``.
            binary: write every positive rating as 1.
            shuffle: write the lines in random order.
            omit_item: leave out the item feature.
        """
        row, col = matrix.nonzero()
        height, width = matrix.shape
        # A real list, so random.shuffle also works where range() is lazy.
        index = list(range(len(row)))
        if shuffle:
            random.shuffle(index)

        count = 0
        with open(filename, 'w') as libfm_file:
            for i in index:
                x = row[i]
                y = col[i]
                rating = matrix[x, y]
                # assign neg rating to -1 just for storing, output in file
                # will be 0
                if rating < 0:
                    rating = 0
                if binary and rating > 0:
                    rating = 1

                fields = [str(rating), str(x) + ':1']
                filled_num = height
                if not omit_item:
                    fields.append(str(y + filled_num) + ':1')
                    filled_num += width
                for attribute_pos in self.attribute_list[item_id_of_col[y]]:
                    fields.append(str(filled_num + attribute_pos) + ':1')
                libfm_file.write(' '.join(fields) + '\n')

                count += 1
                if count % 20000 == 0:
                    print("{0} {1} lines written...".format(filename, count))
        print("{0} {1} lines written in total.".format(filename, count))

    def _filtered_rating_list(self, head_item_count, tail_item_count):
        """Yield the ratings whose item falls in a first-appearance range.

        Items are numbered 0, 1, 2, ... in the order they first appear in
        ``rating_list``; a rating is yielded when its item number is in
        ``[head_item_count, tail_item_count)``.
        """
        item_num_of_id = dict()
        for values in self.rating_list:
            itemID = values[1]
            if itemID not in item_num_of_id:
                item_num_of_id[itemID] = len(item_num_of_id)
            item_num = item_num_of_id[itemID]
            if head_item_count <= item_num < tail_item_count:
                yield values

    def _extract_rating_matrix(self, rating_list):
        """Build a compact sparse rating matrix from ``(user, item, rating)`` rows.

        Columns are assigned to items in order of first appearance, so the
        matrix only has columns for items that actually occur.

        Args:
            rating_list: iterable of sequences whose first three elements
                are user ID, item ID and rating.  IDs are used directly as
                indices, so they are assumed to be integers below USER_NUM
                and ITEM_NUM respectively.

        Returns:
            ``(rating_matrix, item_id_of_col, col_num_of_item)``: the LIL
            matrix of shape USER_NUM x (number of distinct items), the item
            ID of each column, and the column of each item ID (-1 when the
            item does not occur).
        """
        # XXX: make it to utils?
        # we may not have all items, so compression is needed
        col_num_of_item = [-1] * self.ITEM_NUM
        item_id_of_col = []
        rating_matrix = np.zeros([self.USER_NUM, self.ITEM_NUM])
        item_count = 0
        for values in rating_list:
            userID = values[0]
            itemID = values[1]
            if col_num_of_item[itemID] < 0:
                item_count += 1
                col_num_of_item[itemID] = item_count - 1
                item_id_of_col.append(itemID)
            rating = values[2]
            rating_matrix[userID, col_num_of_item[itemID]] = rating
        # item_count may not reach max_item_num
        rating_matrix = sp.coo_matrix(rating_matrix[:, :item_count]).tolil()
        print('rating matrix is {0}*{1}'.format(self.USER_NUM, item_count))
        return rating_matrix, item_id_of_col, col_num_of_item