def train(self, read_article_ids=None, unread_article_ids=None):
    """
    Trains the Bayes Classifier.
    read_article_ids should be an iterable over read article ids
    unread_article_ids should be an iterable over unread article ids

    If one is None it will be loaded from database.
    """
    #Load user feedback if needed
    if read_article_ids is None:
        read_article_ids = set(r.article.id
                               for r in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
    else:
        read_article_ids = set(read_article_ids)

    logger.info("Use %d read articles for learning." % len(read_article_ids))
    read_articles = Article.objects(id__in=read_article_ids)

    #Get all articles the user did not read.
    if unread_article_ids is None:
        ranked_article_ids = (a.article.id
                              for a in RankedArticle.objects(user_id=self.user.id).only("article"))
        all_article_ids = set(a.id
                              for a in Article.objects(id__in=ranked_article_ids).only("id"))
        unread_article_ids = all_article_ids - read_article_ids

    #undersample unreads
    logger.info("Use %d unread articles for learning." % len(unread_article_ids))
    unread_articles = Article.objects(id__in=unread_article_ids)

    #convert all article features
    all_articles = UserModelBayes.AllArticles(read_articles, unread_articles, self.get_features)

    self.clf.fit(np.array(list(all_articles)), np.array(list(all_articles.get_marks())))
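
# The AllArticles helper used above is not shown in this snippet. A minimal sketch of
# what it presumably does, assuming it chains both query sets and keeps the class marks
# aligned with the yielded feature vectors (hypothetical reconstruction, not the
# project's actual implementation):
class AllArticles(object):
    """Iterates read then unread articles as feature vectors and remembers their marks."""

    def __init__(self, read_articles, unread_articles, get_features):
        self.read_articles = list(read_articles)
        self.unread_articles = list(unread_articles)
        self.get_features = get_features

    def __iter__(self):
        #read articles first, then unread ones
        for article in self.read_articles + self.unread_articles:
            yield self.get_features(article)

    def get_marks(self):
        #same order as __iter__: READ marks first, then UNREAD marks
        return ([UserModelBayes.READ] * len(self.read_articles) +
                [UserModelBayes.UNREAD] * len(self.unread_articles))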
def test_rank(self):
    self.trainer.train()

    unread_doc = Article.objects(headline=u"Sony = Bad").first()
    read_doc = Article.objects(headline=u"Apple").first()

    rank_unread_doc = self.trainer.rank(unread_doc)
    rank_read_doc = self.trainer.rank(read_doc)

    self.assertEqual(rank_unread_doc, UserModelBayes.UNREAD)
    self.assertEqual(rank_read_doc, UserModelBayes.READ)
def train(self, read_article_ids=None, unread_article_ids=None):
    #Load user feedback if needed
    if read_article_ids is None:
        read_article_ids = (r.article.id
                            for r in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))

    user_feedback = Article.objects(id__in=read_article_ids)

    #TODO: cluster feedback articles and save more than one profile
    num_loaded_articles = 0
    centroid = numpy.zeros(self.num_features_, dtype=numpy.float32)

    for article in user_feedback:
        try:
            article_features_as_full_vec = self.get_features(article)
        except Exception as inst:
            logger.error("Could not get features for article %s: %s" % (article.id, inst))
            continue

        #do we need this?
        tmp_doc = matutils.unitvec(article_features_as_full_vec)

        #add up tmp_doc
        centroid = numpy.add(centroid, tmp_doc)
        num_loaded_articles += 1

    #average each element
    if num_loaded_articles != 0:
        centroid = centroid / num_loaded_articles

    centroid = matutils.full2sparse(centroid)

    #set user model data
    self.user_model_features = [centroid]
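
# This model stores a single sparse centroid in self.user_model_features; the ranking
# side is not shown here. A minimal sketch, assuming cosine similarity via gensim's
# matutils against a hypothetical threshold attribute self.min_similarity (names are
# illustrative, not taken from the original code):
from gensim import matutils

def rank(self, article):
    """Compare an article's feature vector to the learned centroid."""
    #convert the full feature vector to the same sparse format as the centroid
    article_vec = matutils.full2sparse(self.get_features(article))
    similarity = matutils.cossim(article_vec, self.user_model_features[0])
    return similarity >= self.min_similarity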
def get_article_samples(config_):
    #Connect to mongo database
    logger.info("Connect to database...")
    connect(config_['database']['db-name'],
            username=config_['database']['user'],
            password=config_['database']['passwd'],
            port=config_['database']['port'])

    #get user
    user = User.objects(email=u"*****@*****.**").first()

    ranked_article_ids = (a.article.id
                          for a in RankedArticle.objects(user_id=user.id).only("article"))
    all_article_ids = Set(a.id
                          for a in Article.objects(id__in=ranked_article_ids).only("id"))
    read_article_ids = Set(a.article.id
                           for a in ReadArticleFeedback.objects(user_id=user.id).only("article"))
    unread_article_ids = all_article_ids - read_article_ids

    #sample test articles
    X, y = get_samples(extractor, read_article_ids, unread_article_ids)

    return X, y
def train(self, read_article_ids=None, unread_article_ids=None):
    """
    Trains the DecisionTree Classifier.
    read_article_ids should be an iterable over read article ids
    unread_article_ids should be an iterable over unread article ids

    If one is None it will be loaded from database.
    """
    #Load user feedback if needed
    if read_article_ids is None:
        read_article_ids = set(r.article.id
                               for r in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
    else:
        read_article_ids = set(read_article_ids)

    #Get all articles the user did not read.
    if unread_article_ids is None:
        ranked_article_ids = (a.article.id
                              for a in RankedArticle.objects(user_id=self.user.id).only("article"))
        all_article_ids = set(a.id
                              for a in Article.objects(id__in=ranked_article_ids).only("id"))
        unread_article_ids = all_article_ids - read_article_ids

    #convert all article features
    all_articles, marks = self._get_samples(read_article_ids, unread_article_ids,
                                            p_synthetic_samples=self.p_synthetic_samples,
                                            p_majority_samples=self.p_majority_samples)

    logger.debug("Learn on %d samples." % len(marks))

    self.clf = tree.DecisionTreeClassifier()
    self.clf.fit(all_articles, marks)
def get_articles(self, date):
    '''
    Returns list of articles between date 0:00 and date 24:00
    '''
    #use select_related = 2 to fetch all vendor data
    articles_ = Article.objects(vendor__in=current_user.mongodb_user.subscriptions,
                                date__gte=date.date(),
                                date__lt=date.date() + timedelta(days=1)).select_related(2)

    #mark articles as read/unread and add id field
    articles_as_dict = []
    for a in articles_:
        #check in database if article has Read Feedback
        feedback = ReadArticleFeedback.objects(user_id=self.mongodb_user.id, article=a).first()

        tmp_article = a._data
        if feedback is None:
            tmp_article['read'] = False
        else:
            tmp_article['read'] = True
        tmp_article['id'] = a.id

        articles_as_dict.append(tmp_article)

    return articles_as_dict
def ajax_add_user():
    '''
    Called remotely to add a new user.
    '''
    if not current_user.is_authenticated():
        abort(403)

    name = request.form['name']
    email = request.form['email'].lower()
    new_password = request.form['new_password']
    new_password_repeat = request.form['new_password_repeat']

    if current_user.mongodb_user.email != "*****@*****.**":
        abort(403)

    #check passwords
    if new_password != new_password_repeat:
        abort(400)

    if new_password == "":
        abort(400)

    #hash password
    m = hashlib.sha256()
    m.update(new_password.encode("UTF-8"))
    m.update(SALT.encode("UTF-8"))

    #check if user with email address already exists
    users_with_same_email = User.objects(email=email)
    if len(users_with_same_email) > 0:
        abort(400)

    try:
        app.logger.debug("Adding new user %s" % name)

        #just pick the first article as feedback
        first_article = Article.objects().first()
        first_profile = LearnedProfile(features=first_article.features)

        new_user = User(name=name, password=m.hexdigest(), email=email,
                        learned_profile=[first_profile])
        new_user.save(safe=True)

        first_feedback = ReadArticleFeedback(user_id=new_user.id,
                                             article=first_article,
                                             score=1.0)
        first_feedback.save()

        app.logger.debug("...done.")
    except Exception as inst:
        #log the exception type and the exception itself
        app.logger.error("Could not add new user: %s: %s" % (type(inst), inst))
        abort(500)

    return ""
def get_top_articles(self, date, min_rating):
    '''
    Returns a list of articles from date with a rating of at least min_rating.
    '''
    #get all articles from specific date
    articles_from_date = Article.objects(date__gte=date.date(),
                                         date__lt=date.date() + timedelta(days=1))

    #get all ranked articles from loaded articles
    return [a.article for a in RankedArticle.objects(user_id=self.mongodb_user.id,
                                                     rating__gte=min_rating,
                                                     article__in=articles_from_date)]
def read(key):
    try:
        article_ = Article.objects(id=key).first()
    except ValidationError as ve:
        app.logger.error("Error on reading %s (%s): %s" % (key, type(ve), ve))
        article_ = None

    if article_ is None:
        return render_template('no_article.html', date=datetime.now())

    #save user feedback
    current_user.save_read_article_feedback(article=article_, score=1.0)

    #render read article view
    return render_template('read.html', article=article_, date=datetime.now())
def get_texts(self):
    '''
    Files are processed in parallel.
    See wikicorpus.py by Radim Rehurek
    '''
    logger = logging.getLogger("feature_extractor")

    processed_articles = 0
    for article in Article.objects():
        if processed_articles % 1000 == 0:
            logger.info("Processing article #%d..." % processed_articles)
        processed_articles += 1

        try:
            doc = article.clean_content
            tokens = utils.lemmatize(doc)
            yield tokens
        except Exception as e:
            logger.error("Could not process article %s (%s): %s" % (article.id, type(e), e))

    logger.info("Processed %d articles." % processed_articles)
def get_samples(extractor,
                read_article_ids,
                unread_article_ids,
                p_synthetic_samples=300,
                p_majority_samples=500,
                k=5):
    '''
    read_article_ids : Set
    unread_article_ids : Set
    p_synthetic_samples : Percentage of synthetic samples, 300 for 300%
    p_majority_samples : Size of majority sample = p_majority_samples/n_minority_sample, 500 for 500%
    k : neighbourhood for k nearest neighbour, standard 5

    Returns
    -------
    array-like full vector samples, shape = [n_samples, n_features]
    array-like marks, shape = [n_samples]
    '''
    #Under-sample unread ids
    unread_article_ids = Set(sample(unread_article_ids,
                                    min(p_majority_samples / 100 * len(read_article_ids),
                                        len(unread_article_ids))))

    #Create unread article vectors
    unread_marks = np.empty(len(unread_article_ids))
    unread_marks.fill(UNREAD)
    unread_articles = np.empty(shape=(len(unread_article_ids), extractor.get_feature_number()))

    for i, article in enumerate(Article.objects(id__in=unread_article_ids)):
        try:
            article_features_as_full_vec = get_features(article, extractor)
            unread_articles[i, :] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s." % (article.id, e))

    #Create read article vectors
    read_marks = np.empty(len(read_article_ids))
    read_marks.fill(READ)
    read_articles = np.empty(shape=(len(read_article_ids), extractor.get_feature_number()))

    for i, article in enumerate(Article.objects(id__in=read_article_ids)):
        try:
            article_features_as_full_vec = get_features(article, extractor)
            read_articles[i, :] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s." % (article.id, e))

    #SMOTE sample minorities
    #synthetic_read_articles = SMOTE(read_articles, p_synthetic_samples, k)

    #borderlineSMOTE sample minorities
    X = np.concatenate((read_articles, unread_articles))
    y = np.concatenate((read_marks, unread_marks))

    new_read_articles, synthetic_read_articles, danger_read_articles = \
        borderlineSMOTE(X=X, y=y, minority_target=READ, N=p_synthetic_samples, k=k)

    #Create synthetic read samples
    synthetic_marks = np.zeros(len(synthetic_read_articles))
    synthetic_marks.fill(READ)

    read_marks = np.empty(len(new_read_articles))
    read_marks.fill(READ)

    danger_read_marks = np.empty(len(danger_read_articles))
    danger_read_marks.fill(READ)

    logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples." %
                (len(read_marks), len(unread_marks), len(danger_read_marks), len(synthetic_marks)))

    return (np.concatenate((new_read_articles, synthetic_read_articles,
                            danger_read_articles, unread_articles)),
            np.concatenate((read_marks, synthetic_marks, danger_read_marks, unread_marks)))
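
# A worked example of the sampling arithmetic above (illustrative numbers only):
# with 40 read and 1000 unread articles, p_majority_samples=500 keeps
# min(500/100 * 40, 1000) = 200 unread samples, and p_synthetic_samples=300 asks
# borderlineSMOTE for roughly 3 synthetic neighbours per borderline read sample.
n_read, n_unread = 40, 1000
p_synthetic_samples, p_majority_samples = 300, 500

n_unread_kept = min(p_majority_samples / 100 * n_read, n_unread)   # 200
n_synthetic_per_read = p_synthetic_samples / 100                   # ~3

print("keep %d unread samples, ~%d synthetic samples per borderline read article"
      % (n_unread_kept, n_synthetic_per_read))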
password=config_["database"]["passwd"], port=config_["database"]["port"], ) # Load feature extractor # feature_extractor = EsaFeatureExtractor(prefix = config_['prefix']) # feature_extractor = TfidfFeatureExtractor(prefix = config_['prefix']) # feature_extractor = LdaFeatureExtractor(prefix = config_['prefix']) # feature_extractor = LdaBowFeatureExtractor(prefix = config_['prefix']) feature_extractor = cEsaFeatureExtractor(prefix=config_["prefix"]) # get user user = User.objects(email=u"*****@*****.**").first() ranked_article_ids = (a.article.id for a in RankedArticle.objects(user_id=user.id).only("article")) all_article_ids = set(a.id for a in Article.objects(id__in=ranked_article_ids).only("id")) read_article_ids = set(a.article.id for a in ReadArticleFeedback.objects(user_id=user.id).only("article")) unread_article_ids = all_article_ids - read_article_ids for p_synthetic in xrange(100, 700, 100): for p_majority in xrange(100, 700, 100): logger.info("Synthetic over-sampling %d and majority undersampling %d" % (p_synthetic, p_majority)) # run test N_ITERATIONS precisions_read = np.zeros((N_ITERATIONS)) recalls_read = np.zeros((N_ITERATIONS)) f1_scores_read = np.zeros((N_ITERATIONS)) precisions_unread = np.zeros((N_ITERATIONS))
#Load feature extractor
#feature_extractor = EsaFeatureExtractor(prefix = config_['prefix'])
#feature_extractor = TfidfFeatureExtractor(prefix = config_['prefix'])
#feature_extractor = LdaFeatureExtractor(prefix = config_['prefix'])
#feature_extractor = LdaBowFeatureExtractor(prefix = config_['prefix'])
feature_extractor = cEsaFeatureExtractor(prefix=config_['prefix'])

#get user
user = User.objects(email=u"*****@*****.**").first()

ranked_article_ids = (a.article.id
                      for a in RankedArticle.objects(user_id=user.id).only("article"))
all_article_ids = Set(a.id
                      for a in Article.objects(id__in=ranked_article_ids).only("id"))
read_article_ids = Set(a.article.id
                       for a in ReadArticleFeedback.objects(user_id=user.id).only("article"))
unread_article_ids = all_article_ids - read_article_ids

for p_synthetic in xrange(100, 700, 100):
    for p_majority in xrange(100, 700, 100):
        logger.info("Synthetic over-sampling %d and majority undersampling %d"
                    % (p_synthetic, p_majority))

        #run test N_ITERATIONS
        precisions_read = np.zeros((N_ITERATIONS))
def test_constructor_with_file_wikicorpus(self):
    #load tf-idf model
    tfidf_model = tfidfmodel.TfidfModel.load("/media/sdc1/test_dump/result/test_tfidf.model")

    extractor = TfidfFeatureExtractor("/media/sdc1/test_dump/result/test")

    #load tf-idf corpus
    tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

    #load lda corpus
    #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

    #load dictionary
    id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

    #load article titles
    document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

    #Connect to mongo database
    connect(self.config_['database']['db-name'],
            username=self.config_['database']['user'],
            password=self.config_['database']['passwd'],
            port=self.config_['database']['port'])

    #Load articles as test corpus
    user = User.objects(email=u"*****@*****.**").first()

    ranked_article_ids = (a.article.id
                          for a in RankedArticle.objects(user_id=user.id).only("article"))
    all_article_ids = Set(a.id
                          for a in Article.objects(id__in=ranked_article_ids).only("id"))
    read_article_ids = Set(a.article.id
                           for a in ReadArticleFeedback.objects(user_id=user.id).only("article"))
    unread_article_ids = all_article_ids - read_article_ids

    #sample test articles
    X, y = get_samples(extractor, read_article_ids, unread_article_ids)

    s, f = X.shape
    logger.debug("Training with %d samples, %d features, %d marks" % (s, f, len(y)))

    #train esa model
    esa_model = CosineEsaModel(tfidf_corpus,
                               document_titles=document_titles,
                               test_corpus=X,
                               test_corpus_targets=y,
                               num_test_corpus=len(y),
                               num_best_features=15,
                               num_features=len(id2token))

    print esa_model

    esa_model.save('/media/sdc1/test_dump/result/test_cesa.model')

    tmp_esa = CosineEsaModel.load('/media/sdc1/test_dump/result/test_cesa.model')
    print tmp_esa
def _get_samples(self,
                 read_article_ids,
                 unread_article_ids,
                 p_synthetic_samples=300,
                 p_majority_samples=500,
                 k=5):
    '''
    read_article_ids : Set
    unread_article_ids : Set
    p_synthetic_samples : Percentage of synthetic samples, 300 for 300%.
                          If None, no synthetic samples are created.
    p_majority_samples : Size of majority sample = p_majority_samples/n_minority_sample,
                         500 for 500%. If None, undersampling is not done.
    k : neighbourhood for k nearest neighbour, standard 5

    Returns
    -------
    array-like full vector samples, shape = [n_samples, n_features]
    array-like marks, shape = [n_samples]
    '''
    #Under-sample unread ids
    if p_majority_samples is not None:
        unread_article_ids = Set(sample(unread_article_ids,
                                        min(p_majority_samples / 100 * len(read_article_ids),
                                            len(unread_article_ids))))

    #Create unread article vectors
    unread_marks = numpy.empty(len(unread_article_ids))
    unread_marks.fill(UserModelSVM.UNREAD)
    unread_articles = numpy.empty(shape=(len(unread_article_ids), self.num_features_))

    for i, article in enumerate(Article.objects(id__in=unread_article_ids)):
        try:
            article_features_as_full_vec = self.get_features(article)
            unread_articles[i, :] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s." % (article.id, e))

    #Create read article vectors
    read_marks = numpy.empty(len(read_article_ids))
    read_marks.fill(UserModelSVM.READ)
    read_articles = numpy.empty(shape=(len(read_article_ids), self.num_features_))

    for i, article in enumerate(Article.objects(id__in=read_article_ids)):
        try:
            article_features_as_full_vec = self.get_features(article)
            read_articles[i, :] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s." % (article.id, e))

    #SMOTE sample minorities
    #synthetic_read_articles = SMOTE(read_articles, p_synthetic_samples, k)

    #borderlineSMOTE sample minorities if p_synthetic_samples is not None
    X = numpy.concatenate((read_articles, unread_articles))
    self._calculate_mean_and_std_deviation(X)
    X = self._normalize(X)
    y = numpy.concatenate((read_marks, unread_marks))

    if p_synthetic_samples is None:
        return X, y
    else:
        new_read_articles, synthetic_read_articles, danger_read_articles = \
            borderlineSMOTE(X=X, y=y,
                            minority_target=UserModelSVM.READ,
                            N=p_synthetic_samples, k=k)

        #Create synthetic read samples
        synthetic_marks = numpy.zeros(len(synthetic_read_articles))
        synthetic_marks.fill(UserModelSVM.READ)

        read_marks = numpy.empty(len(new_read_articles))
        read_marks.fill(UserModelSVM.READ)

        danger_read_marks = numpy.empty(len(danger_read_articles))
        danger_read_marks.fill(UserModelSVM.READ)

        logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples." %
                    (len(read_marks), len(unread_marks), len(danger_read_marks), len(synthetic_marks)))

        return (numpy.concatenate((new_read_articles, synthetic_read_articles,
                                   danger_read_articles, unread_articles)),
                numpy.concatenate((read_marks, synthetic_marks, danger_read_marks, unread_marks)))
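
# _calculate_mean_and_std_deviation and _normalize are not part of this snippet.
# A minimal sketch of what they presumably do, assuming plain per-feature z-score
# normalization whose statistics are stored on the model so rank() can reuse them
# later (hypothetical reconstruction, not the original code):
import numpy

def _calculate_mean_and_std_deviation(self, X):
    """Remember per-feature mean and standard deviation of the training matrix."""
    self.mean_ = numpy.mean(X, axis=0)
    self.std_ = numpy.std(X, axis=0)
    self.std_[self.std_ == 0] = 1.0  # avoid division by zero for constant features

def _normalize(self, X):
    """Scale features to zero mean and unit variance using the stored statistics."""
    return (X - self.mean_) / self.std_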
def train(self, read_article_ids=None, unread_article_ids=None):
    '''
    Trains several SVM and Naive Bayes classifiers.
    read_article_ids should be an iterable over read article ids
    unread_article_ids should be an iterable over unread article ids

    If one is None it will be loaded from database.
    '''
    #Load user feedback if needed
    if read_article_ids is None:
        read_article_ids = Set(r.article.id
                               for r in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
    else:
        read_article_ids = Set(read_article_ids)

    #Get all articles the user did not read.
    if unread_article_ids is None:
        ranked_article_ids = (a.article.id
                              for a in RankedArticle.objects(user_id=self.user.id).only("article"))
        all_article_ids = Set(a.id
                              for a in Article.objects(id__in=ranked_article_ids).only("id"))
        unread_article_ids = all_article_ids - read_article_ids

    classifiers = [lambda: svm.SVC(kernel='rbf'),
                   lambda: svm.SVC(kernel='rbf'),
                   lambda: svm.SVC(kernel='rbf'),
                   lambda: svm.SVC(kernel='rbf'),
                   lambda: svm.SVC(kernel='rbf'),
                   GaussianNB,
                   GaussianNB,
                   GaussianNB,
                   GaussianNB]

    parameters = [#SVM
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 100,
                   'p_majority_samples': 200,
                   'k': 10},
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 200,
                   'p_majority_samples': 300,
                   'k': 10},
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 300,
                   'p_majority_samples': 400,
                   'k': 10},
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 400,
                   'p_majority_samples': 500,
                   'k': 10},
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 500,
                   'p_majority_samples': 600,
                   'k': 10},
                  #Naive Bayes
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 100,
                   'p_majority_samples': 100,
                   'k': 10},
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 100,
                   'p_majority_samples': 200,
                   'k': 10},
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 300,
                   'p_majority_samples': 500,
                   'k': 10},
                  {'read_article_ids': read_article_ids,
                   'unread_article_ids': unread_article_ids,
                   'p_synthetic_samples': 600,
                   'p_majority_samples': 600,
                   'k': 10}]

    self._call_classifiers(classifiers, parameters)
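
# _call_classifiers is referenced above but not shown. A minimal sketch of the likely
# pattern, assuming each classifier factory is paired with one sampling configuration
# and fitted on its own resampled data; the attribute name self.classifiers_ and the
# ensemble bookkeeping are illustrative, not taken from the original code:
def _call_classifiers(self, classifiers, parameters):
    """Fit one classifier per sampling configuration and keep all of them."""
    self.classifiers_ = []
    for make_clf, params in zip(classifiers, parameters):
        samples, marks = self._get_samples(params['read_article_ids'],
                                           params['unread_article_ids'],
                                           p_synthetic_samples=params['p_synthetic_samples'],
                                           p_majority_samples=params['p_majority_samples'],
                                           k=params['k'])
        clf = make_clf()
        clf.fit(samples, marks)
        self.classifiers_.append(clf)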
#Connect to mongo database
try:
    connect(config['database']['db-name'],
            username=config['database']['user'],
            password=config['database']['passwd'],
            port=config['database']['port'])
except connection.ConnectionError as e:
    logger.error("Could not connect to mongodb: %s" % e)
    sys.exit(1)

feature_extractor = EsaFeatureExtractor(prefix=config['prefix'])

#go through each article and convert features
count = 0
for article in Article.objects(features__version__ne=feature_extractor.get_version()):
    if count % 10 == 0:
        logger.info("PROGRESS: processing article #%d" % count)
    count += 1

    if article.features.version == EsaFeatureExtractor.get_version():
        continue

    clean_content = article.clean_content

    #get new features
    new_features = feature_extractor.get_features(clean_content)

    #save new features
    features = Features(version=feature_extractor.get_version(), data=new_features)
    article.features = features