def __init__(self, filename=None):
     """
         Initialise instance state; rmat, users and items start as None.

         :param filename: kept on the instance unchanged; if provided,
             cache parameters trained
     """
     self.filename = filename
     # Nothing has been populated yet, so all three placeholders are None.
     self.rmat = self.users = self.items = None
     # The logger is keyed by the concrete class name.
     logger_name = self.__class__.__name__
     self.log = LogUtil.getLogger(logger_name)
     super(Predictor, self).__init__()
Example #2
0
def get_data(filename, columns, delimiter='::'):
    """
    Load a delimited text file into a DataFrame of string fields.

    Every non-blank line is stripped of trailing whitespace and split on
    ``delimiter``. No type conversion is done, so all values are strings.
    Blank (or whitespace-only) lines are ignored instead of producing
    rows that contain a single empty field.

    :param filename: path of data source
    :param columns: column name for each column
    :param delimiter: delimiter to split a line
    :return: dataframe with one row per non-blank line
    """
    log = LogUtil.getLogger('get_data')
    clock = Timer()
    with open(filename, 'r') as infile:
        # Iterate the file object directly so the raw lines are never held
        # in memory alongside the parsed rows (readlines() kept both).
        rows = [line.rstrip().split(delimiter)
                for line in infile
                if line.strip()]
    df = pd.DataFrame(rows, columns=columns)

    e0 = clock.restart()
    log.info("loading data from %s with columns %s takes %.3f secs  ",
             filename, columns, e0)
    return df
Example #3
0
def train_test_split(ratings, frac=0.1, group='user', seed=1):
    """
        Split ``ratings`` into a train part and a test part.

        The test part is a sample of fraction ``frac``. If ``group`` is
        truthy, the sample is drawn separately inside every group of
        ``ratings.groupby(group)``; otherwise it is drawn from the whole
        frame. The train part is what remains of ``ratings`` after an
        outer merge with the test part, keeping only rows found on the
        left side alone.

        :param ratings: dataframe to split
        :param frac: fraction of rows sampled for the test part
        :param group: groupby key for per-group sampling, or a falsy value
        :param seed: value passed to ``sample`` as ``random_state``
        :return: tuple ``(ratings_train, ratings_test)``
    """
    log = LogUtil.getLogger('train_test_split')
    log.info("start splitting test and train data ...")
    clock = Timer()

    def _take_sample(frame):
        # Same sampling parameters for the grouped and ungrouped paths.
        return frame.sample(frac=frac, random_state=seed)

    if not group:
        ratings_test = _take_sample(ratings)
    else:
        ratings_test = ratings.groupby(group).apply(_take_sample)
        # apply() prepends the group key as an index level; remove it again.
        ratings_test.index = ratings_test.index.droplevel(group)

    # Anti-join: rows flagged "left_only" exist in ratings but not in the
    # test sample. NOTE(review): the merge matches on the shared columns,
    # not on the index - confirm duplicate rows are not a concern.
    merged = pd.merge(ratings, ratings_test, indicator=True, how='outer')
    ratings_train = merged.query('_merge=="left_only"').drop('_merge', axis=1)

    elapsed = clock.restart()
    log.info("splitting test and train data takes %.3f secs", elapsed)
    return ratings_train, ratings_test
Example #4
0
from recsys.cf.funksvd import FunkSVD
from recsys.utils.data import train_test_split, load_movielen_data
from recsys.utils.debug import LogUtil

# Configure logging through the project's LogUtil before anything else runs.
LogUtil.configLog()
# FunkSVD model with the hyper-parameters fixed for this run.
model = FunkSVD(learning_rate=0.001, reg=0.005, n_epochs=100, n_factors=30)
# The loader returns three objects; only `ratings` is used below.
ratings, users, movies = load_movielen_data()

# Split using train_test_split's default arguments, fit on the training
# part, then call model.eval on the held-out part.
training, testing = train_test_split(ratings)
model.fit(training)
model.eval(testing)
 def __init__(self, **kwargs):
     """
         Initialise instance state; rmat, users and items start as None.

         :param kwargs: forwarded unchanged to the next ``__init__`` in
             the MRO
     """
     # Nothing has been populated yet, so all three placeholders are None.
     self.rmat = self.users = self.items = None
     # The logger is keyed by the concrete class name.
     logger_name = self.__class__.__name__
     self.log = LogUtil.getLogger(logger_name)
     super(BaseSelector, self).__init__(**kwargs)