def fit(self, table):
    """Fit the text and per-user components of the model on ``table``.

    Reads ``table.text``, ``table.user_id`` and ``table.votes_useful`` and
    stores the fitted pieces on the instance:

    * ``self.tfidf``    -- ``TfidfVectorizer`` fitted on the output of
      ``stem(table.text)``, with English stop words removed and at most
      ``self.max_features`` terms.
    * ``self.pca``      -- ``PCA(150)`` fitted on the dense TF-IDF matrix.
    * ``self.avg_user`` -- whatever ``CategoryAverage().fit`` returns when
      given the user ids and the log-transformed votes (presumably a
      per-user average -- confirm against ``CategoryAverage``).
    * ``self.scale``    -- ``1 / max(log(votes_useful + 1))`` when
      ``self.use_scale`` is truthy, otherwise ``1``.

    Returns:
        ``self``, so calls can be chained.
    """
    # Target transform: log(votes + 1). Computed once and shared by the
    # per-user average and the scale factor below.
    votes = log(table.votes_useful + 1)

    stemmed_text = stem(table.text)
    self.tfidf = TfidfVectorizer(stop_words='english',
                                 max_features=self.max_features)
    # PCA is fitted on a dense array here, so the sparse TF-IDF output is
    # materialised first.
    text_features = self.tfidf.fit_transform(stemmed_text).toarray()
    # NOTE(review): the component count is hard-coded to 150; presumably
    # the data must have at least that many rows and TF-IDF columns --
    # confirm against the PCA documentation.
    self.pca = PCA(150).fit(text_features)

    self.avg_user = CategoryAverage().fit(table.user_id, votes)

    # Scale for votes.
    if self.use_scale:
        peak = max(votes)
        # When no row has any useful votes the peak is log(1) == 0 and
        # 1 / peak would divide by zero; fall back to the neutral scale.
        self.scale = 1 / peak if peak > 0 else 1
    else:
        self.scale = 1
    return self