def test_rank(self):
    """The trained ranker labels a read article READ and an unread one UNREAD."""
    self.trainer.train()

    # One known-unread and one known-read article from the test fixtures.
    unread_doc = Article.objects(headline=u"Sony = Bad").first()
    read_doc = Article.objects(headline=u"Apple").first()

    rank_unread_doc = self.trainer.rank(unread_doc)
    rank_read_doc = self.trainer.rank(read_doc)

    self.assertEqual(rank_unread_doc, UserModelBayes.UNREAD)
    self.assertEqual(rank_read_doc, UserModelBayes.READ)
def get_articles(self, date):
    """
    Return the user's subscribed articles published between date 0:00 and
    date 24:00, as a list of dicts with an added 'id' field and a 'read'
    flag (True iff the article has Read feedback in the database).
    """
    day_start = date.date()
    # select_related(2) fetches all vendor data in one go instead of lazily
    # per article.
    # NOTE(review): filters by current_user but checks feedback against
    # self.mongodb_user -- presumably the same user; verify against callers.
    articles_ = Article.objects(
        vendor__in=current_user.mongodb_user.subscriptions,
        date__gte=day_start,
        date__lt=day_start + timedelta(days=1)).select_related(2)

    articles_as_dict = []
    for article in articles_:
        entry = article._data
        feedback = ReadArticleFeedback.objects(user_id=self.mongodb_user.id,
                                               article=article).first()
        entry['read'] = feedback is not None
        entry['id'] = article.id
        articles_as_dict.append(entry)

    return articles_as_dict
def get_top_articles(self, date, min_rating):
    """
    Return the articles from *date* whose rating is at least *min_rating*
    for this user, as a list.
    """
    day_start = date.date()
    # all articles published on that day
    same_day_articles = Article.objects(date__gte=day_start,
                                        date__lt=day_start + timedelta(days=1))
    # this user's rankings restricted to those articles
    ranked = RankedArticle.objects(user_id=self.mongodb_user.id,
                                   rating__gte=min_rating,
                                   article__in=same_day_articles)
    return [r.article for r in ranked]
def ajax_add_user():
    """
    Called remotely to add a new user (admin only).

    Expects form fields: name, email, new_password, new_password_repeat.
    Aborts with 403 for unauthenticated or non-admin callers, 400 on
    password mismatch / empty password / duplicate email, 500 when the
    user cannot be saved. Returns an empty string on success.
    """
    if not current_user.is_authenticated():
        abort(403)

    name = request.form['name']
    email = request.form['email'].lower()
    new_password = request.form['new_password']
    new_password_repeat = request.form['new_password_repeat']

    # only the admin account may create users
    if current_user.mongodb_user.email != "*****@*****.**":
        abort(403)

    # passwords must match and must not be empty
    if new_password != new_password_repeat:
        abort(400)
    if new_password == "":
        abort(400)

    # salted SHA-256 password hash
    m = hashlib.sha256()
    m.update(new_password.encode("UTF-8"))
    m.update(SALT.encode("UTF-8"))

    # refuse duplicate email addresses
    if User.objects(email=email).count() > 0:
        abort(400)

    try:
        app.logger.debug("Adding new user %s" % name)

        # seed the learned profile with the first article and record it as
        # positive (read) feedback so the model has one training point
        first_article = Article.objects().first()
        first_profile = UserModel(features=first_article.features)

        new_user = User(name=name, password=m.hexdigest(), email=email,
                        learned_profile=[first_profile])
        new_user.save(safe=True)

        first_feedback = ReadArticleFeedback(user_id=new_user.id,
                                             article=first_article,
                                             score=1.0)
        first_feedback.save()

        app.logger.debug("...done.")
    except Exception as inst:
        # BUG FIX: the original logged the builtin ``type`` instead of the
        # exception instance, producing useless "<type 'type'>" messages.
        app.logger.error("Could not add new user: %s: %s" % (type(inst), inst))
        abort(500)

    return ""
def read(key):
    """Render the read-article view for article *key*, saving read feedback."""
    try:
        article_ = Article.objects(id=key).first()
    except ValidationError as ve:
        # a malformed id is treated the same as a missing article
        app.logger.error("Error on reading %s (%s): %s" % (key, type(ve), ve))
        article_ = None

    if article_ is None:
        return render_template('no_article.html', date=datetime.now())

    # record that the current user fully read this article
    current_user.save_read_article_feedback(article=article_, score=1.0)

    return render_template('read.html', article=article_, date=datetime.now())
def test_constructor_with_file_wikicorpus(self): #load tf-idf model tfidf_model = tfidfmodel.TfidfModel.load("/vagrant/data/test_tfidf.model") extractor = TfidfFeatureExtractor("/vagrant/data/test") #load tf-idf corpus tfidf_corpus = MmCorpus('/vagrant/data/test_tfidf_corpus.mm') #load lda corpus #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm') #load dictionary id2token = Dictionary.load("/vagrant/data/test_wordids.dict") #load article titles document_titles = DocumentTitles.load("/vagrant/test_articles.txt") #Connect to mongo database connect(self.config_['database']['db-name'], username=self.config_['database']['user'], password=self.config_['database']['passwd'], port=self.config_['database']['port']) #Load articles as test corpus user = User.objects(email=u"*****@*****.**").first() ranked_article_ids = (a.article.id for a in RankedArticle.objects(user_id=user.id).only("article")) all_article_ids = set(a.id for a in Article.objects(id__in=ranked_article_ids).only("id")) read_article_ids = set(a.article.id for a in ReadArticleFeedback.objects(user_id=user.id).only("article")) unread_article_ids = all_article_ids - read_article_ids #sample test articles X, y = get_samples(extractor, read_article_ids, unread_article_ids) s,f = X.shape logger.debug("Training with %d samples, %d features, %d marks" % (s, f, len(y))) #train esa model esa_model = CosineEsaModel(tfidf_corpus, document_titles=document_titles, test_corpus=X, test_corpus_targets=y, num_test_corpus=len(y), num_best_features=15, num_features=len(id2token)) for line in esa_model: print repr(line) esa_model.save('/vagrant/data/test_cesa.model') tmp_esa = CosineEsaModel.load('/vagrant/data/test_cesa.model') print tmp_esa
def get_samples(extractor, read_article_ids, unread_article_ids,
                p_synthetic_samples=300, p_majority_samples=500, k=5):
    """
    Build a balanced training sample from read (minority) and unread
    (majority) articles, using borderline-SMOTE to over-sample the
    minority class.

    Parameters
    ----------
    extractor : feature extractor providing get_feature_number()
    read_article_ids : Set of article ids with Read feedback (minority)
    unread_article_ids : Set of article ids without feedback (majority)
    p_synthetic_samples : Percentage of synthetic samples, 300 for 300%
    p_majority_samples : Percentage cap of majority samples relative to
        the minority class size (500 keeps at most 5x as many unread ids)
    k : neighbourhood for k nearest neighbour, standard 5

    Returns
    -------
    array-like full vector samples, shape = [n_samples, n_features]
    array-like marks, shape = [n_samples]
    """
    # Under-sample the majority class to at most p_majority_samples% of the
    # minority size. BUG FIX: the original docstring claimed the sample
    # matrix is [n_features, n_samples]; it is [n_samples, n_features].
    # ``//`` keeps the Python 2 integer-division semantics under Python 3,
    # and list() is required because random.sample needs a sequence.
    n_majority = min(p_majority_samples // 100 * len(read_article_ids),
                     len(unread_article_ids))
    unread_article_ids = set(sample(list(unread_article_ids), n_majority))

    def _vectorize(article_ids, mark):
        # Feature matrix + mark vector for the given article ids.
        # NOTE(review): rows whose article raises AttributeError keep the
        # uninitialized np.empty contents -- preserved from the original.
        marks = np.empty(len(article_ids))
        marks.fill(mark)
        vectors = np.empty(shape=(len(article_ids),
                                  extractor.get_feature_number()))
        for i, article in enumerate(Article.objects(id__in=article_ids)):
            try:
                vectors[i, :] = get_features(article, extractor)[:]
            except AttributeError as e:
                logger.error("Article %s does not have attribute: %s."
                             % (article.id, e))
        return vectors, marks

    unread_articles, unread_marks = _vectorize(unread_article_ids, UNREAD)
    read_articles, read_marks = _vectorize(read_article_ids, READ)

    # borderline-SMOTE over-samples the minority (read) class and also
    # returns the "danger" minority samples near the class border.
    X = np.concatenate((read_articles, unread_articles))
    y = np.concatenate((read_marks, unread_marks))
    new_read_articles, synthetic_read_articles, danger_read_articles = \
        borderlineSMOTE(X=X, y=y, minority_target=READ,
                        N=p_synthetic_samples, k=k)

    # All minority-derived samples carry the READ mark.
    synthetic_marks = np.empty(len(synthetic_read_articles))
    synthetic_marks.fill(READ)
    read_marks = np.empty(len(new_read_articles))
    read_marks.fill(READ)
    danger_read_marks = np.empty(len(danger_read_articles))
    danger_read_marks.fill(READ)

    logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples."
                % (len(read_marks), len(unread_marks),
                   len(danger_read_marks), len(synthetic_marks)))

    return (np.concatenate((new_read_articles,
                            synthetic_read_articles,
                            danger_read_articles,
                            unread_articles)),
            np.concatenate((read_marks,
                            synthetic_marks,
                            danger_read_marks,
                            unread_marks)))