import logging import numpy as np import scipy.sparse as sp import sys from pprint import pformat from lightfm import LightFM import experiments from experiments.cf_model import CFModel from experiments.lsiup_model import LsiUpModel from experiments.modelutils import fit_model from experiments.stackexchange.data import read_post_features, read_interactions, read_user_features logger = experiments.getLogger("experiments.stackexchange.model") def read_data(tags, post_ids, post_text, about, sampled_negatives_ratio): logger.debug("Reading features") features = read_post_features(tags, post_ids, post_text) item_features_matrix = features.mat.tocoo().tocsr() logger.debug("Reading interactions") interactions = read_interactions(features.item_ids) interactions.fit(min_positives=1, sampled_negatives_ratio=sampled_negatives_ratio) user_features = read_user_features(about=about, user_ids=True, user_id_mapping=interactions.user_ids) logger.debug(
import codecs
from HTMLParser import HTMLParser
from lxml import etree
import os
import re

import experiments
from experiments.data import Features, Interactions


# Data description here: http://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede

# NOTE(review): the logger name ends in '.model' although this module imports
# from experiments.data -- confirm the name is intentional and not a copy-paste.
logger = experiments.getLogger('experiments.stackexchange.model')

# Directory that contains this module file.
DATA_DIR = os.path.dirname(__file__)

# Presumably PostTypeId values from the schema linked above (1 = question,
# 2 = answer) -- TODO confirm against the data dump documentation.
QUESTION_POST_TYPE_ID = 1
QUESTION_POST_TYPE_ANSWER = 2


class MLStripper(HTMLParser):
    """HTMLParser subclass that accumulates the data chunks it is handed.

    Every string passed to ``handle_data`` is appended to ``self.fed``;
    ``get_data`` returns those chunks joined together with no separator.
    """

    def __init__(self):
        # Initialises parser state by calling reset() directly; the base-class
        # __init__ is not invoked here.
        self.reset()
        # Data chunks collected so far, in the order they were received.
        self.fed = []

    def handle_data(self, d):
        """Record one data chunk ``d`` by appending it to ``self.fed``."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded chunks concatenated into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
import logging import numpy as np from pprint import pformat import scipy.sparse as sp import sys from lightfm import LightFM import experiments from experiments.cf_model import CFModel from experiments.lsiup_model import LsiUpModel from experiments.modelutils import fit_model from experiments.movielens.data import read_movie_features, read_interaction_data logger = experiments.getLogger('experiments.movielens.model') def read_data(titles, genres, genome_tag_threshold, positive_threshold): logger.debug('Reading features') features = read_movie_features(titles=titles, genres=genres, genome_tag_threshold=genome_tag_threshold) item_features_matrix = features.mat.tocoo().tocsr() logger.debug('Reading interactions') interactions = read_interaction_data(features.item_ids, positive_threshold=positive_threshold) interactions.fit(min_positives=1, sampled_negatives_ratio=0, use_observed_negatives=True)