Example #1
0
import logging
import numpy as np
import scipy.sparse as sp
import sys
from pprint import pformat

from lightfm import LightFM

import experiments
from experiments.cf_model import CFModel
from experiments.lsiup_model import LsiUpModel
from experiments.modelutils import fit_model
from experiments.stackexchange.data import read_post_features, read_interactions, read_user_features


logger = experiments.getLogger("experiments.stackexchange.model")


def read_data(tags, post_ids, post_text, about, sampled_negatives_ratio):

    logger.debug("Reading features")
    features = read_post_features(tags, post_ids, post_text)
    item_features_matrix = features.mat.tocoo().tocsr()

    logger.debug("Reading interactions")
    interactions = read_interactions(features.item_ids)
    interactions.fit(min_positives=1, sampled_negatives_ratio=sampled_negatives_ratio)

    user_features = read_user_features(about=about, user_ids=True, user_id_mapping=interactions.user_ids)

    logger.debug(
Example #2
0
import codecs
from HTMLParser import HTMLParser
from lxml import etree
import os
import re

import experiments
from experiments.data import Features, Interactions

# Data description here: http://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede
# Module-level logger obtained through the project's `experiments.getLogger` helper.
# NOTE(review): the logger name ends in ".model" -- confirm that is the intended
# name for this module.
logger = experiments.getLogger('experiments.stackexchange.model')

# Directory component of this module's `__file__` path.
DATA_DIR = os.path.dirname(__file__)

# Numeric post-type identifiers.
# NOTE(review): presumably the PostTypeId values of the Stack Exchange data dump
# (1 = question, 2 = answer) -- confirm against the schema linked above.
# NOTE(review): QUESTION_POST_TYPE_ANSWER reads like it was meant to be called
# ANSWER_POST_TYPE_ID; the name is left untouched because other code may use it.
QUESTION_POST_TYPE_ID = 1
QUESTION_POST_TYPE_ANSWER = 2


class MLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text content.

    Push markup in with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the base-class initialiser rather than calling reset() directly.
        # On Python 2 the initialiser does nothing but call reset(), so the
        # behaviour is unchanged there; on Python 3 it additionally sets
        # attributes (e.g. convert_charrefs) that feed() depends on.
        # The explicit call form is used because Python 2's HTMLParser is an
        # old-style class, for which super() does not work.
        HTMLParser.__init__(self)
        # Text fragments, in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text that the parser found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return every text fragment collected so far as a single string."""
        return ''.join(self.fed)


def strip_tags(html):
Example #3
0
import logging
import numpy as np
from pprint import pformat
import scipy.sparse as sp
import sys

from lightfm import LightFM

import experiments
from experiments.cf_model import CFModel
from experiments.lsiup_model import LsiUpModel
from experiments.modelutils import fit_model
from experiments.movielens.data import read_movie_features, read_interaction_data


logger = experiments.getLogger('experiments.movielens.model')


def read_data(titles, genres,
              genome_tag_threshold,
              positive_threshold):

    logger.debug('Reading features')
    features = read_movie_features(titles=titles, genres=genres, genome_tag_threshold=genome_tag_threshold)
    item_features_matrix = features.mat.tocoo().tocsr()

    logger.debug('Reading interactions')
    interactions = read_interaction_data(features.item_ids,
                                         positive_threshold=positive_threshold)
    interactions.fit(min_positives=1, sampled_negatives_ratio=0, use_observed_negatives=True)