Example #1
0
    def test_rank(self):
        """After training, the model labels a read and an unread article correctly."""
        self.trainer.train()

        #one article the user never read and one he did
        sony_article = Article.objects(headline=u"Sony = Bad").first()
        apple_article = Article.objects(headline=u"Apple").first()

        #the trained ranker should reproduce the user's read/unread split
        self.assertEqual(self.trainer.rank(sony_article), UserModelBayes.UNREAD)
        self.assertEqual(self.trainer.rank(apple_article), UserModelBayes.READ)
Example #2
0
    def get_articles(self, date):
        """
        Returns list of articles between date 0:00 and date 24:00.

        Each entry is the article's raw data dict augmented with:
        - 'id': the article's primary key
        - 'read': True iff a ReadArticleFeedback record exists for this user
        """

        #use select_related = 2 to fetch all vendor data
        articles_ = Article.objects(vendor__in=current_user.mongodb_user.subscriptions,
                                    date__gte=date.date(),
                                    date__lt=date.date() + timedelta(days=1)).select_related(2)

        #mark articles as read/unread and add id field
        articles_as_dict = []
        for a in articles_:
            tmp_article = a._data

            #an article counts as read iff the user left Read feedback for it
            feedback = ReadArticleFeedback.objects(user_id=self.mongodb_user.id,
                                                   article=a).first()
            tmp_article['read'] = feedback is not None
            tmp_article['id'] = a.id

            articles_as_dict.append(tmp_article)

        return articles_as_dict
Example #3
0
    def get_top_articles(self, date, min_rating):
        """
        Returns the articles from the given date whose ranking for this user
        has a rating of at least min_rating.
        """

        #restrict to articles published on that day (0:00 .. 24:00)
        day_start = date.date()
        day_articles = Article.objects(date__gte=day_start,
                                       date__lt=day_start + timedelta(days=1))

        #resolve the user's matching rankings back into plain articles
        rankings = RankedArticle.objects(user_id=self.mongodb_user.id,
                                         rating__gte=min_rating,
                                         article__in=day_articles)
        return [ranked.article for ranked in rankings]
Example #4
0
def ajax_add_user():
    """
    Called remotely to add a new user.

    Expects form fields: name, email, new_password, new_password_repeat.
    Only the admin account may create users.

    Aborts with 403 when unauthenticated or not admin, 400 on invalid
    input (password mismatch, empty password, duplicate email), and 500
    when the database writes fail.  Returns an empty string on success.
    """
    if not current_user.is_authenticated():
        abort(403)

    name = request.form['name']
    email = request.form['email'].lower()
    new_password = request.form['new_password']
    new_password_repeat = request.form['new_password_repeat']

    #only the admin account is allowed to create users
    if current_user.mongodb_user.email != "*****@*****.**":
        abort(403)

    #check passwords
    if new_password != new_password_repeat:
        abort(400)

    if new_password == "":
        abort(400)

    #hash password with the application-wide salt
    #NOTE(review): a per-user salt (or a KDF such as PBKDF2) would be stronger
    m = hashlib.sha256()
    m.update(new_password.encode("UTF-8"))
    m.update(SALT.encode("UTF-8"))

    #reject duplicate email addresses (first() avoids materializing the queryset)
    if User.objects(email=email).first() is not None:
        abort(400)

    try:
        app.logger.debug("Adding new user %s" % name)

        #just pick the first article as feedback
        first_article = Article.objects().first()
        first_profile = UserModel(features=first_article.features)

        new_user = User(name=name, password=m.hexdigest(), email=email, learned_profile=[first_profile])
        new_user.save(safe=True)

        first_feedback = ReadArticleFeedback(user_id=new_user.id, article=first_article, score=1.0)
        first_feedback.save()

        app.logger.debug("...done.")
    except Exception as inst:
        #log the exception itself (the original logged the builtin 'type' by mistake)
        app.logger.error("Could not add new user: %s: %s" % (type(inst), inst))
        abort(500)

    return ""
Example #5
0
def read(key):
    """Show the read view for article *key* and record read feedback for the user."""
    article_ = None
    try:
        article_ = Article.objects(id=key).first()
    except ValidationError as ve:
        #malformed key: log and fall through to the not-found page
        app.logger.error("Error on reading %s (%s): %s" % (key, type(ve), ve))

    if article_ is None:
        return render_template('no_article.html', date=datetime.now())

    #save user feedback
    current_user.save_read_article_feedback(article=article_, score=1.0)

    #render read article view
    return render_template('read.html', article=article_, date=datetime.now())
Example #6
0
    def test_constructor_with_file_wikicorpus(self):
        """
        End-to-end smoke test: build a CosineEsaModel from a tf-idf wiki
        corpus plus user feedback stored in MongoDB, then save and reload it.

        NOTE(review): this is an integration test — it needs the /vagrant
        fixture files and a reachable MongoDB configured in self.config_.
        """
        #load tf-idf model
        tfidf_model = tfidfmodel.TfidfModel.load("/vagrant/data/test_tfidf.model")
        extractor = TfidfFeatureExtractor("/vagrant/data/test")
        
        #load tf-idf corpus
        tfidf_corpus = MmCorpus('/vagrant/data/test_tfidf_corpus.mm')
        
        #load lda corpus
        #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')
        
        #load dictionary (maps token ids to tokens)
        id2token = Dictionary.load("/vagrant/data/test_wordids.dict")
        
        #load article titles
        document_titles = DocumentTitles.load("/vagrant/test_articles.txt")
        
        #Connect to mongo database
        connect(self.config_['database']['db-name'], 
                username=self.config_['database']['user'],
                password=self.config_['database']['passwd'],
                port=self.config_['database']['port'])
        
        #Load articles as test corpus
        user = User.objects(email=u"*****@*****.**").first()
        
        #all articles this user has ranked (generator of article ids)
        ranked_article_ids = (a.article.id 
                              for a 
                              in RankedArticle.objects(user_id=user.id).only("article"))
        all_article_ids = set(a.id
                              for a 
                              in Article.objects(id__in=ranked_article_ids).only("id"))
        
        #the subset the user actually read
        read_article_ids = set(a.article.id
                               for a 
                               in ReadArticleFeedback.objects(user_id=user.id).only("article"))
        
        #ranked but never read
        unread_article_ids = all_article_ids - read_article_ids

        #sample test articles (X: feature matrix, y: read/unread marks)
        X, y = get_samples(extractor, read_article_ids, unread_article_ids)
        
        s,f = X.shape
        logger.debug("Training with %d samples, %d features, %d marks" % (s, f, len(y)))

        #train esa model
        esa_model = CosineEsaModel(tfidf_corpus, 
                                   document_titles=document_titles,
                                   test_corpus=X,
                                   test_corpus_targets=y,
                                   num_test_corpus=len(y),
                                   num_best_features=15,
                                   num_features=len(id2token))

        for line in esa_model:
            print repr(line)
        
        #round-trip the model through disk to verify save/load works
        esa_model.save('/vagrant/data/test_cesa.model')
        
        tmp_esa = CosineEsaModel.load('/vagrant/data/test_cesa.model')
        print tmp_esa  
Example #7
0
def get_samples(extractor,
                read_article_ids, 
                unread_article_ids,
                p_synthetic_samples=300,
                p_majority_samples=500,
                k=5):
    """
    Build a balanced training set from read (minority) and unread (majority)
    articles: under-sample the unread class, then oversample the read class
    with borderline-SMOTE.

    extractor : feature extractor providing get_feature_number()
    read_article_ids : Set
    unread_article_ids : Set
    p_synthetic_samples : Percentage of synthetic samples, 300 for 300%
    p_majority_samples : Cap on unread samples as a percentage of the read count
    k : neighbourhood for k nearest neighbour, standard 5

    Returns
    -------
    array-like full vector samples, shape = [n_features, n_samples]
    array-like marks, shape = [n_samples]
    """
    
    #Under-sample unread ids: keep at most p_majority_samples% of the read count
    #NOTE(review): '/' here looks like Python-2 integer division — confirm
    #before running under Python 3, where it yields a float.
    unread_article_ids = set(sample(unread_article_ids,
                                    min(p_majority_samples/100 * len(read_article_ids), 
                                        len(unread_article_ids))
                                    )
                             )
    
    #Create unread article vectors
    unread_marks = np.empty(len(unread_article_ids))
    unread_marks.fill(UNREAD)
    unread_articles = np.empty(shape=(len(unread_article_ids), extractor.get_feature_number()))
    
    for i, article in enumerate(Article.objects(id__in=unread_article_ids)):
        try:
            article_features_as_full_vec = get_features(article, extractor)
            unread_articles[i,:] = article_features_as_full_vec[:]
        except AttributeError as e:
            #NOTE(review): on failure row i keeps np.empty garbage but is
            #still included in the training data — consider skipping/zeroing
            logger.error("Article %s does not have attribute: %s." % (article.id, e))
            
    #Create read article vectors
    read_marks = np.empty(len(read_article_ids))
    read_marks.fill(READ)  
    read_articles = np.empty(shape=(len(read_article_ids),
                                    extractor.get_feature_number()))
    
    for i, article in enumerate(Article.objects(id__in = read_article_ids)):
        try:
            article_features_as_full_vec = get_features(article, extractor)
            read_articles[i,:] = article_features_as_full_vec[:]
        except AttributeError as e:
            #same caveat as above: failed rows remain uninitialized
            logger.error("Article %s does not have attribute: %s." % (article.id, e))
    
    #SMOTE sample minorities
    #synthetic_read_articles = SMOTE(read_articles, p_synthetic_samples, k) 
    
    #borderlineSMOTE sample minorities: splits the read class into safe,
    #synthetic and danger samples
    X = np.concatenate((read_articles, unread_articles)) 
    y = np.concatenate((read_marks, unread_marks))
    new_read_articles, synthetic_read_articles, danger_read_articles = borderlineSMOTE(X=X, y=y, minority_target=READ,
                                                                                       N=p_synthetic_samples,
                                                                                       k=k)
    
    #Create synthetic read samples (all labelled READ)
    synthetic_marks = np.zeros(len(synthetic_read_articles))
    synthetic_marks.fill(READ)  
    
    read_marks = np.empty(len(new_read_articles))
    read_marks.fill(READ)  
    
    danger_read_marks = np.empty(len(danger_read_articles))
    danger_read_marks.fill(READ)   
    
    logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples." %
                (len(read_marks), len(unread_marks), 
                 len(danger_read_marks), len(synthetic_marks)))
    
    #stack all sample groups and their marks in matching order
    return (np.concatenate((new_read_articles, synthetic_read_articles, danger_read_articles, unread_articles)),
            np.concatenate((read_marks, synthetic_marks, danger_read_marks, unread_marks))
            )