def test_score(self):
    u_words = {'pancakes': 3, 'syrup': 12, 'communism': -10}

    # an article matching a negatively-weighted user word should score
    # lower than one matching only positively-weighted words
    a_words = {'communism': 0.2, 'syrup': 0.1}
    sticky_marxism = score(a_words, u_words)
    a_words2 = {'pancakes': 0.2, 'syrup': 0.1}
    trip_to_canada = score(a_words2, u_words)
    self.assertTrue(sticky_marxism < trip_to_canada)

    # the same ordering should hold for words that only loosely match the
    # user's words ('marxism' vs 'communism', 'cake' vs 'pancakes')
    a_words = {'marxism': 0.2, 'sugar': 0.1}
    sticky_marxism = score(a_words, u_words)
    a_words2 = {'cake': 0.2, 'honey': 0.1}
    trip_to_canada = score(a_words2, u_words)
    self.assertTrue(sticky_marxism < trip_to_canada)
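# score itself isn't shown in this section. The second pair of assertions
# uses article words that never appear verbatim in u_words, which implies
# the real kw_score.score does some kind of fuzzy word matching. Below is a
# minimal sketch that satisfies both assertions, using difflib similarity as
# a stand-in; the 0.5 cutoff and the similarity weighting are assumptions,
# not necessarily how kw_score implements it.

import difflib

def score(article_words, user_words):
    # hypothetical sketch: weight each article word's relevance (0-1) by the
    # user's score for the closest-matching user word, scaled by how close
    # the match is; words with no match at least 50% similar contribute 0
    total = 0.0
    for word, weight in article_words.items():
        matches = difflib.get_close_matches(
            word, list(user_words.keys()), n=1, cutoff=0.5)
        if matches:
            similarity = difflib.SequenceMatcher(None, word, matches[0]).ratio()
            total += weight * user_words[matches[0]] * similarity
    return total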
def main():
    gearman_client = gearman.GearmanClient(['localhost:4730'])

    if len(sys.argv) < 2:
        print 'Please specify a user to get the error of. See README.md'
        return
    username = get_username_from_input(sys.argv)
    print 'Getting model error of {}'.format(username)

    print "Loading user's votes from database"
    # get the user's votes on articles
    db_result = db_get('vote', {'username': username}, {
        'article_url': 1,
        'feed_url': 1,
        'positive_opinion': 1,
        'vote_datetime': 1
    })
    if db_result['status'] != 'ok':
        print 'Error'
        print 'Could not get user data from vote collection'
        print db_result['description']
        return
    articles = db_result['docs']
    print len(articles), 'article opinions found in vote db for given user'

    # map each article url to 1 or -1, depending on whether the user liked
    # or disliked it
    article_opinions = {}
    vote_datetimes = {}
    for article in articles:
        # make sure all the required fields are there
        req_fields = ['article_url', 'positive_opinion', 'vote_datetime']
        if not all(s in article for s in req_fields):
            print 'Error'
            print 'Vote is missing some fields: {}'.format(article)
            continue
        url = article['article_url']
        # set the classes for the votes to 1 for positive and -1 for negative
        vote = 1 if article['positive_opinion'] else -1
        article_opinions[url] = vote
        vote_datetimes[url] = article['vote_datetime']

    # split the articles into the feeds they belong to, to minimise db
    # lookups; the dict maps feed urls to a list of article urls from that
    # feed
    feeds = {}
    for article in articles:
        if article['feed_url'] in feeds:
            feeds[article['feed_url']].append(article['article_url'])
        else:
            feeds[article['feed_url']] = [article['article_url']]

    # get a set of the unique article urls
    article_url_set = set(article_opinions.keys())
    print len(article_url_set), 'unique articles in set'
    if len(article_url_set) < 1:
        print 'Error'
        print 'Not enough articles in data set'
        return

    # get the words the user is interested in
    db_result = db_get('user', {'username': username}, {'words': 1})
    if db_result['status'] != 'ok':
        print 'Error'
        print 'Could not load data from user collection'
        print db_result['description']
        return
    if len(db_result['docs']) < 1:
        print 'Error'
        print 'No such user in user collection'
        return
    user_data = db_result['docs'][0]
    user_words = user_data['words']

    # the model needs at least two classes to train on, so seed the data
    # with two inputs with extreme values, one for each class
    data_x = [[10.0, 1], [0.0, 10000000]]
    data_y = [1, -1]

    # get the data from the db for each feed the user voted on an article in
    for feed in feeds:
        db_result = db_get('feed', {'url': feed}, {'items': 1})
        if db_result['status'] != 'ok':
            print 'Error'
            print 'Could not get data from feed collection'
            print db_result['description']
            return
        if 'docs' not in db_result or len(db_result['docs']) < 1:
            print 'Error'
            print 'No feed returned for url', feed
            return
        items = db_result['docs'][0]['items']

        # check the items in that feed for ones the user voted on
        for item in items:
            if item['link'] not in article_url_set:
                continue
            print 'adding', item['link']
            if 'topics' not in item:
                print 'Error'
                print 'No topics for given item, skipping'
                continue
            words = item['topics']
            topic_crossover = kw_score.score(words, user_words)
            if 'pub_date' not in item:
                print 'Error'
                print 'No pub_date for given item, skipping'
                continue
            time_diff = vote_datetimes[item['link']] - item['pub_date']
            x = [topic_crossover, time_diff.total_seconds()]
            y = article_opinions[item['link']]
            data_x.append(x)
            data_y.append(y)
        print 'Articles from feed', feed, 'added to data'

    print data_x
    print data_y

    if len(data_x) < 3:
        print 'Error'
        print 'Not enough data points'
        return

    data_points = [(data_x[i], data_y[i]) for i in xrange(len(data_x))]
    n = 0
    score = 0

    # start the 2-fold cross-validation, doing up to 10 folds of the data
    repetitions = min(len(data_points), 10)
    for k in xrange(repetitions):
        print 'Iteration {} out of {} ({}% finished)'.format(
            k, repetitions, 100 * (float(k) / repetitions))
        shuffle(data_points)
        training = data_points[:len(data_points) / 2]
        validation = data_points[len(data_points) / 2:]

        if has_enough_classes(training):
            curr_score = get_model_score(training, validation)
            print '- Score 1 this fold: {}'.format(curr_score)
            score += curr_score
            n += 1
        else:
            print '- Not enough training classes, skipping'
            continue

        # swap the training and validation data
        training, validation = validation, training
        if has_enough_classes(training):
            curr_score = get_model_score(training, validation)
            print '- Score 2 this fold: {}'.format(curr_score)
            score += curr_score
            n += 1
        else:
            print '- Not enough training classes, skipping'
            continue

    if n == 0:
        print 'Error'
        print 'Not enough valid data points'
        return
    print 'Score: {:.6f}, based on {} divisions of the data.'.format(
        float(score) / n, n)
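# has_enough_classes and get_model_score aren't defined in this section.
# Minimal sketches of what they might look like; the choice of scikit-learn's
# LogisticRegression is an assumption, and the project's actual classifier
# may differ.

from sklearn.linear_model import LogisticRegression

def has_enough_classes(data_points):
    # training is only possible if both the positive (1) and negative (-1)
    # classes are present in the training half
    return len(set(y for _, y in data_points)) >= 2

def get_model_score(training, validation):
    # hypothetical sketch: fit on the training half, score on the held-out
    # half; model.score returns the mean accuracy on the validation data
    train_x = [x for x, _ in training]
    train_y = [y for _, y in training]
    model = LogisticRegression()
    model.fit(train_x, train_y)
    valid_x = [x for x, _ in validation]
    valid_y = [y for _, y in validation]
    return model.score(valid_x, valid_y)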
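# db_get and get_username_from_input are also defined elsewhere. Sketches of
# plausible implementations: db_get is assumed to route the query through a
# 'db-get' gearman worker via a module-level client like the one main()
# creates; the task name and the JSON wire format here are guesses, grounded
# only in the 'status'/'docs'/'description' fields main() expects back.

import json
import gearman

gearman_client = gearman.GearmanClient(['localhost:4730'])

def db_get(collection, query, projection):
    # submit the job and block until the worker responds
    request = gearman_client.submit_job('db-get', json.dumps({
        'collection': collection,
        'query': query,
        'projection': projection,
    }))
    return json.loads(request.result)

def get_username_from_input(argv):
    # one plausible reading: join all arguments after the script name so
    # usernames containing spaces can be passed unquoted
    return ' '.join(argv[1:])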