def gen_feature_vec(review, useful_bools, useful_ratios, biz_review_counts, top100, threshold):
    """Build the feature row for a single review as a list of strings.

    The row starts with seven values, in this order:

    1. the review's entry in ``useful_bools`` (looked up by ``review['id']``)
    2. the character length of the review text
    3. and 4. the two values returned by ``flesch_kincaid.results`` for the
       text, unpacked as reading ease and reading level
    5. the review's ``stars`` value
    6. the author's entry in ``useful_ratios``, or 0 when the author's
       ``user_id`` is not a key there
    7. the entry in ``biz_review_counts`` for ``review['business_id']``

    They are followed by one 0/1 flag per word in ``top100``, set to 1 when
    that word is among the words ``flesch_kincaid.words`` extracts from the
    lower-cased text.

    ``threshold`` is accepted but not used by this function.

    Every value is passed through ``str`` before being returned.
    """
    body = review['text']
    vocabulary = set(flesch_kincaid.words(body.lower()))

    label = useful_bools[review['id']]
    char_count = len(body)
    ease, level = flesch_kincaid.results(body)
    stars = review['stars']

    # Authors without a recorded ratio fall back to 0.
    if review['user_id'] in useful_ratios:
        author_ratio = useful_ratios[review['user_id']]
    else:
        author_ratio = 0

    biz_total = biz_review_counts[review['business_id']]

    row = [label, char_count, ease, level, stars, author_ratio, biz_total]

    # One presence flag per tracked word, in top100 order.
    row.extend(1 if token in vocabulary else 0 for token in top100)

    # NOTE: pairwise-interaction and polynomial (powers 2-4) expansions of
    # the row were tried here at some point and are currently disabled.

    return list(map(str, row))
def gen_feature_vec(review, useful_bools, useful_ratios, biz_review_counts, top100, threshold):
    """Build the feature row for a single review as a list of strings.

    The row starts with seven values, in this order:

    1. the review's entry in ``useful_bools`` (looked up by ``review['id']``)
    2. the character length of the review text
    3. and 4. the two values returned by ``flesch_kincaid.results`` for the
       text, unpacked as reading ease and reading level
    5. the review's ``stars`` value
    6. the author's entry in ``useful_ratios``, or 0 when the author's
       ``user_id`` is not a key there
    7. the entry in ``biz_review_counts`` for ``review['business_id']``

    They are followed by one 0/1 flag per word in ``top100``, set to 1 when
    that word is among the words ``flesch_kincaid.words`` extracts from the
    lower-cased text.

    ``threshold`` is accepted but not used by this function.

    Every value is passed through ``str`` before being returned.
    """
    body = review['text']
    vocabulary = set(flesch_kincaid.words(body.lower()))

    label = useful_bools[review['id']]
    char_count = len(body)
    ease, level = flesch_kincaid.results(body)
    stars = review['stars']

    # Authors without a recorded ratio fall back to 0.
    if review['user_id'] in useful_ratios:
        author_ratio = useful_ratios[review['user_id']]
    else:
        author_ratio = 0

    biz_total = biz_review_counts[review['business_id']]

    row = [label, char_count, ease, level, stars, author_ratio, biz_total]

    # One presence flag per tracked word, in top100 order.
    row.extend(1 if token in vocabulary else 0 for token in top100)

    # NOTE: pairwise-interaction and polynomial (powers 2-4) expansions of
    # the row were tried here at some point and are currently disabled.

    return list(map(str, row))
''').fetchall() biz_review_counts = {k: v for k, v in biz_review_counts} reviews = c.execute('SELECT * FROM review').fetchall() for USEFUL_THRESHOLD in [1, 3, 5]: freq = defaultdict(int) useful_reviews = c.execute( '''SELECT r.text FROM review AS r WHERE r.useful >= ? ''', (USEFUL_THRESHOLD, )).fetchall() for review in useful_reviews: for word in flesch_kincaid.words(review['text'].lower()): freq[word] += 1 top100 = sorted(freq.iteritems(), key=lambda item: -item[1]) top100 = [k for k, v in top100 if k not in common_words.WORDS][:100] print(top100, len(top100)) useful_bools = c.execute( '''SELECT r.id, r.useful >= ? FROM review AS r ''', (USEFUL_THRESHOLD, )).fetchall() useful_bools = {k: v for k, v in useful_bools} useful_ratios = c.execute( '''SELECT u.user_id,
GROUP BY b.business_id ''').fetchall() biz_review_counts = {k:v for k,v in biz_review_counts} reviews = c.execute('SELECT * FROM review').fetchall() for USEFUL_THRESHOLD in [1, 3, 5]: freq = defaultdict(int) useful_reviews = c.execute('''SELECT r.text FROM review AS r WHERE r.useful >= ? ''', (USEFUL_THRESHOLD,)).fetchall() for review in useful_reviews: for word in flesch_kincaid.words(review['text'].lower()): freq[word] += 1 top100 = sorted(freq.iteritems(), key=lambda item: -item[1]) top100 = [k for k, v in top100 if k not in common_words.WORDS][:100] print(top100, len(top100)) useful_bools = c.execute('''SELECT r.id, r.useful >= ? FROM review AS r ''', (USEFUL_THRESHOLD,)).fetchall() useful_bools = {k:v for k,v in useful_bools} useful_ratios = c.execute('''SELECT u.user_id, 1.0 * SUM(CASE WHEN r.useful >= ? THEN 1 ELSE 0 END) / COUNT(*) FROM user AS u, review AS r