Example #1
    def train(self, read_article_ids=None, unread_article_ids=None):
        """
        Trains the Bayes Classifier.
        read_article_ids should be an iterable over read article ids
        unread_article_ids should be an iterable over unread article ids
        
        If one is None it will be loaded from database.
        """
        
        #Load user feedback if needed
        if read_article_ids is None:
            read_article_ids = set(r.article.id for r
                                   in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
        else:
            read_article_ids = set(read_article_ids)
        
        logger.info("Use %d read articles for learning." % len(read_article_ids))
        read_articles = Article.objects(id__in=read_article_ids)

        #Get all articles the user did not read.
        if unread_article_ids is None:
            ranked_article_ids = (a.article.id for a in RankedArticle.objects(user_id=self.user.id).only("article"))
            all_article_ids = set(a.id for a in Article.objects(id__in=ranked_article_ids).only("id"))
            unread_article_ids = all_article_ids - read_article_ids
            
        #undersample unreads
        logger.info("Use %d unread articles for learning." % (len(unread_article_ids)))
        
        unread_articles = Article.objects(id__in=unread_article_ids)
        
        #convert all article features
        all_articles = UserModelBayes.AllArticles(read_articles, unread_articles, self.get_features)

        self.clf.fit(np.array(list(all_articles)), np.array(list(all_articles.get_marks())))
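
A minimal usage sketch for the trainer above. The constructor arguments are an assumption; only train(), rank(), and the READ/UNREAD marks appear in these examples:

    model = UserModelBayes(user)   #hypothetical constructor
    model.train()                  #ids are loaded from the database

    article = Article.objects().first()
    if model.rank(article) == UserModelBayes.READ:
        print("classifier predicts: read")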
Example #2
 def test_rank(self):
     self.trainer.train()
     
     unread_doc = Article.objects(headline = u"Sony = Bad").first()
     read_doc = Article.objects(headline = u"Apple").first()
     
     rank_unread_doc = self.trainer.rank(unread_doc)
     rank_read_doc = self.trainer.rank(read_doc)
     
     self.assertEqual(rank_unread_doc, UserModelBayes.UNREAD) 
     self.assertEqual(rank_read_doc, UserModelBayes.READ) 
Example #3
 def train(self, read_article_ids = None, unread_article_ids = None):
     #Load user feedback if needed
     if read_article_ids is None:
         read_article_ids = (r.article.id for r in ReadArticleFeedback.objects(user_id = self.user.id).only("article"))
         
     user_feedback = Article.objects(id__in = read_article_ids)
     
     #TODO: cluster feedback articles and save more than one profile
     
     num_loaded_articles = 0
     centroid = numpy.zeros(self.num_features_, dtype=numpy.float32)
     
     for article in user_feedback:
         try:
             article_features_as_full_vec = self.get_features(article)
         except Exception as inst:
             logger.error("Could not get features for article %s: %s" %
                          (article.id, inst))
             continue
         
         #do we need this?
         tmp_doc = matutils.unitvec(article_features_as_full_vec)
         
         #add up tmp_doc
         centroid = numpy.add(centroid, tmp_doc)
         num_loaded_articles += 1 
         
     #average each element
     if num_loaded_articles != 0:
         centroid = centroid / num_loaded_articles
         
     centroid = matutils.full2sparse(centroid)
     
     #set user model data
     self.user_model_features = [centroid]
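
Once user_model_features holds the sparse centroid, ranking can reduce to a similarity lookup. A sketch of that idea, assuming gensim's matutils.cossim for sparse cosine similarity (the actual rank() of this model is not shown here):

    from gensim import matutils

    def rank_by_centroid(model, article, threshold=0.5):
        #Compare a candidate article against the learned centroid profile
        vec = matutils.full2sparse(model.get_features(article))
        similarity = matutils.cossim(model.user_model_features[0], vec)
        return similarity >= threshold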
Example #4
def get_article_samples(config_):
    #Connect to mongo database
    logger.info("Connect to database...")
    connect(config_['database']['db-name'], 
            username= config_['database']['user'], 
            password= config_['database']['passwd'], 
            port = config_['database']['port'])
    
    #get user
    user = User.objects(email=u"*****@*****.**").first()
    
    ranked_article_ids = (a.article.id 
                          for a 
                          in RankedArticle.objects(user_id = user.id).only("article"))
    all_article_ids = Set(a.id 
                          for a 
                          in Article.objects(id__in = ranked_article_ids).only("id"))
    
    read_article_ids = Set(a.article.id 
                           for a 
                           in ReadArticleFeedback.objects(user_id = user.id).only("article"))
    
    unread_article_ids = all_article_ids - read_article_ids
    
    #sample test articles
    X, y = get_samples(extractor, read_article_ids, unread_article_ids)
    
    return X, y
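
A sketch of wiring get_article_samples() to a config dict; the key names mirror those used inside the function, the values are placeholders:

    config_ = {'database': {'db-name': 'newsfeed',
                            'user': 'reader',
                            'passwd': 'secret',
                            'port': 27017}}
    X, y = get_article_samples(config_)
    print("%d samples with %d features" % X.shape)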
Example #5
    def train(self, read_article_ids=None, unread_article_ids=None):
        """
        Trains the DecisionTree Classifier.
        read_article_ids should be an iterable over read article ids
        unread_article_ids should be an iterable over unread article ids
        
        If one is None it will be loaded from database.
        """
        
        #Load user feedback if needed
        if read_article_ids is None:
            read_article_ids = set(r.article.id
                                   for r in ReadArticleFeedback.objects(user_id=self.user.id).only("article"))
        else:
            read_article_ids = set(read_article_ids)

        #Get all articles the user did not read.
        if unread_article_ids is None:
            ranked_article_ids = (a.article.id
                                  for a in RankedArticle.objects(user_id=self.user.id).only("article"))
            all_article_ids = set(a.id
                                  for a in Article.objects(id__in=ranked_article_ids).only("id"))
            unread_article_ids = all_article_ids - read_article_ids
        
        #convert all article features
        all_articles, marks = self._get_samples(read_article_ids, 
                                                unread_article_ids,
                                                p_synthetic_samples=self.p_synthetic_samples,
                                                p_majority_samples=self.p_majority_samples)

        logger.debug("Learn on %d samples." % len(marks))            

        self.clf = tree.DecisionTreeClassifier()
        self.clf.fit(all_articles, marks)
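
After train() has run, classification is a plain scikit-learn predict call. A sketch, assuming get_features() matches what _get_samples() produced (the model and article objects here are hypothetical):

    model.train()
    vec = model.get_features(article)
    prediction = model.clf.predict([vec])[0]
    is_read = (prediction == model.READ)   #READ mark name is an assumption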
Example #6
 def get_articles(self, date):
     '''
     Returns a list of articles published between 0:00 and 24:00 on the given date.
     '''
     
     #use select_related = 2 to fetch all vendor data
     articles_ = Article.objects(vendor__in=current_user.mongodb_user.subscriptions, 
                             date__gte = date.date(), 
                             date__lt = date.date() + timedelta(days=1)).select_related(2)
     
     #mark articles as read/unread and add id field
     articles_as_dict = []
     for a in articles_:
         #check in database if article has Read Feedback
         feedback = ReadArticleFeedback.objects(user_id = self.mongodb_user.id,
                                                article = a).first()
         
         tmp_article = a._data
         
         if feedback is None:
             tmp_article['read'] = False
         else:
             tmp_article['read'] = True 
             
         tmp_article['id'] = a.id
         
         articles_as_dict.append(tmp_article)
 
     return articles_as_dict
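
A sketch of consuming the read/unread flags produced above; view stands in for whatever object exposes get_articles():

    from datetime import datetime

    articles = view.get_articles(datetime.now())
    unread = [a for a in articles if not a['read']]
    print("%d unread of %d articles today" % (len(unread), len(articles)))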
Example #7
def ajax_add_user():
    '''
    Called remotely to add a new user.
    '''
    if not current_user.is_authenticated():
        abort(403)

    name = request.form['name']
    email = request.form['email'].lower()
    new_password = request.form['new_password']
    new_password_repeat = request.form['new_password_repeat'] 
        
    if current_user.mongodb_user.email != "*****@*****.**":
        abort(403)
        
    #check passwords
    if new_password != new_password_repeat:
        abort(400)
        
    if new_password == "":
        abort(400)
        
    #hash password
    m = hashlib.sha256()
    m.update(new_password.encode("UTF-8"))
    m.update(SALT.encode("UTF-8"))
        
    #check if user with email address already exists
    users_with_same_email = User.objects(email = email)
    if len(users_with_same_email) > 0:
        abort(400)
        
    try:
        app.logger.debug("Adding new user %s" % name)
            
        #just pick the first article as feedback
        first_article = Article.objects().first()
        first_profile = LearnedProfile(features = first_article.features)
            
        new_user = User(name = name, password = m.hexdigest(),
                        email = email,
                        learned_profile = [first_profile])
        new_user.save(safe=True)
        
        first_feedback = ReadArticleFeedback(user_id = new_user.id,
                                            article = first_article, 
                                            score = 1.0)
        first_feedback.save()
            
        app.logger.debug("...done.")
    except Exception as inst:
        app.logger.error("Could not add new user: %s: %s" % (type(inst), type))
        abort(500)
        
    return ""
Example #8
 def get_top_articles(self, date, min_rating):
     '''
     Returns a list of articles from the given date whose rating is at least
     min_rating.
     '''
     
     #get all articles from specific date
     articles_from_date = Article.objects(date__gte = date.date(), 
                     date__lt = date.date() + timedelta(days=1))
     
     #get all ranked article form loaded articles
     return [a.article for a in RankedArticle.objects(user_id = self.mongodb_user.id, 
                                  rating__gte = min_rating,
                                  article__in = articles_from_date)]
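
A sketch of listing today's top headlines with the helper above; the rating scale is assumed to match RankedArticle.rating as stored:

    from datetime import datetime

    for article in view.get_top_articles(datetime.now(), min_rating=0.5):
        print(article.headline)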
Example #9
def read(key):
    try:
        article_ = Article.objects(id = key).first()
    except ValidationError as ve:
        app.logger.error("Error on reading %s (%s): %s" % (key, type(ve), ve))
        article_ = None
        
    if article_ is None:
        return render_template('no_article.html', 
                               date=datetime.now())           
            
    #save user feedback
    current_user.save_read_article_feedback(article = article_,
                                            score = 1.0)
            
    #render read article view
    return render_template('read.html', 
                           article= article_,
                           date=datetime.now())
Example #10
 def get_texts(self):
     '''
     Files are processed parallel.
     
     See wikicorpus.py by Radim Rehurek
     '''
     logger = logging.getLogger("feature_extractor")
     
     processed_articles = 0
      for article in Article.objects():
         if processed_articles % 1000 == 0:
             logger.info("Processing article #%d..." % processed_articles)
             
         processed_articles += 1
         
         try:
             doc = article.clean_content
             tokens = utils.lemmatize(doc)
             yield tokens
         except Exception as e:
             logger.error("Could not process article %s (%s): %s" %
                          (article.id, type(e), e))
     
     logger.info("Processed %d articles." % processed_articles)
Example #11
def get_samples(extractor,
                read_article_ids, 
                unread_article_ids,
                p_synthetic_samples = 300,
                p_majority_samples = 500,
                k = 5):
    '''
    read_article_ids : Set
    unread_article_ids : Set
    p_synthetic_samples : Percentage of synthetic samples, 300 for 300%
    k : neighbourhood for k nearest neighbour, default 5

    Returns
    -------
    array-like full vector samples, shape = [n_samples, n_features]
    array-like marks, shape = [n_samples]
    '''
    
    #Under-sample unread ids
    unread_article_ids = Set(sample(unread_article_ids, 
                                    min(p_majority_samples/100 * len(read_article_ids), 
                                        len(unread_article_ids))
                                    )
                             )
    
    #Create unread article vectors
    unread_marks = np.empty(len(unread_article_ids))
    unread_marks.fill(UNREAD)
    unread_articles = np.empty(shape=(len(unread_article_ids), 
                                         extractor.get_feature_number()))
    
    
    for i, article in enumerate(Article.objects(id__in = unread_article_ids)):
        try:
            article_features_as_full_vec = get_features(article, extractor)
            unread_articles[i,:] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s." 
                         % (article.id, e))  
            
    #Create read article vectors
    read_marks = np.empty(len(read_article_ids))
    read_marks.fill(READ)  
    read_articles = np.empty(shape=(len(read_article_ids), 
                                         extractor.get_feature_number()))
    
    for i, article in enumerate(Article.objects(id__in = read_article_ids)):
        try:
            article_features_as_full_vec = get_features(article, extractor)
            read_articles[i,:] = article_features_as_full_vec[:]
        except AttributeError as e:
            logger.error("Article %s does not have attribute: %s." 
                         % (article.id, e))           
    
    #SMOTE sample minorities
    #synthetic_read_articles = SMOTE(read_articles, p_synthetic_samples, k) 
    
    #borderlineSMOTE sample minorities
    X = np.concatenate((read_articles, unread_articles)) 
    y = np.concatenate((read_marks, unread_marks))
    new_read_articles, synthetic_read_articles, danger_read_articles = borderlineSMOTE(X = X,
                                                                                    y = y,
                                                                                    minority_target = READ,
                                                                                    N = p_synthetic_samples, k = k)
    
    #Create synthetic read samples
    synthetic_marks = np.zeros(len(synthetic_read_articles))
    synthetic_marks.fill(READ)  
    
    read_marks = np.empty(len(new_read_articles))
    read_marks.fill(READ)  
    
    danger_read_marks = np.empty(len(danger_read_articles))
    danger_read_marks.fill(READ)   
    
    logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples." %
                (len(read_marks), len(unread_marks), 
                 len(danger_read_marks), len(synthetic_marks)))
    
    return (np.concatenate((new_read_articles, 
                              synthetic_read_articles, 
                              danger_read_articles,
                              unread_articles)),
            np.concatenate((read_marks, 
                              synthetic_marks, 
                              danger_read_marks,
                              unread_marks))
            )
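
A sketch of feeding the balanced samples into a classifier, matching the rbf-kernel SVMs used in Example #16; extractor and the id sets come from the surrounding setup:

    from sklearn import svm

    X, y = get_samples(extractor, read_article_ids, unread_article_ids,
                       p_synthetic_samples=300, p_majority_samples=500, k=5)
    clf = svm.SVC(kernel='rbf')
    clf.fit(X, y)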
Example #12
        password=config_["database"]["passwd"],
        port=config_["database"]["port"],
    )

    # Load feature extractor
    # feature_extractor = EsaFeatureExtractor(prefix = config_['prefix'])
    # feature_extractor = TfidfFeatureExtractor(prefix = config_['prefix'])
    # feature_extractor = LdaFeatureExtractor(prefix = config_['prefix'])
    # feature_extractor = LdaBowFeatureExtractor(prefix = config_['prefix'])
    feature_extractor = cEsaFeatureExtractor(prefix=config_["prefix"])

    # get user
    user = User.objects(email=u"*****@*****.**").first()

    ranked_article_ids = (a.article.id for a in RankedArticle.objects(user_id=user.id).only("article"))
    all_article_ids = set(a.id for a in Article.objects(id__in=ranked_article_ids).only("id"))

    read_article_ids = set(a.article.id for a in ReadArticleFeedback.objects(user_id=user.id).only("article"))

    unread_article_ids = all_article_ids - read_article_ids

    for p_synthetic in xrange(100, 700, 100):
        for p_majority in xrange(100, 700, 100):

            logger.info("Synthetic over-sampling %d and majority undersampling %d" % (p_synthetic, p_majority))

            # run test N_ITERATIONS
            precisions_read = np.zeros((N_ITERATIONS))
            recalls_read = np.zeros((N_ITERATIONS))
            f1_scores_read = np.zeros((N_ITERATIONS))
            precisions_unread = np.zeros((N_ITERATIONS))
Example #13
    #Load feature extractor
    #feature_extractor = EsaFeatureExtractor(prefix = config_['prefix'])
    #feature_extractor = TfidfFeatureExtractor(prefix = config_['prefix'])
    #feature_extractor = LdaFeatureExtractor(prefix = config_['prefix'])
    #feature_extractor = LdaBowFeatureExtractor(prefix = config_['prefix'])
    feature_extractor = cEsaFeatureExtractor(prefix = config_['prefix'])
    
    #get user
    user = User.objects(email=u"*****@*****.**").first()
    
    ranked_article_ids = (a.article.id 
                          for a 
                          in RankedArticle.objects(user_id = user.id).only("article"))
    all_article_ids = Set(a.id 
                          for a 
                          in Article.objects(id__in = ranked_article_ids).only("id"))
    
    read_article_ids = Set(a.article.id 
                           for a 
                           in ReadArticleFeedback.objects(user_id = user.id).only("article"))
    
    unread_article_ids = all_article_ids - read_article_ids

    for p_synthetic in xrange(100, 700, 100):
        for p_majority in xrange(100, 700, 100): 
            
            logger.info("Synthetic over-sampling %d and majority undersampling %d" %
                        (p_synthetic, p_majority))
            
            #run test N_ITERATIONS
            precisions_read = np.zeros((N_ITERATIONS))
Example #14
    def test_constructor_with_file_wikicorpus(self):
        
        #load tf-idf model
        tfidf_model = tfidfmodel.TfidfModel.load("/media/sdc1/test_dump/result/test_tfidf.model")
        extractor = TfidfFeatureExtractor("/media/sdc1/test_dump/result/test")
        
        #load tf-idf corpus
        tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')
        
        #load lda corpus
        #lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')
        
        #load dictionary
        id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")
        
        #load article titles
        document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")
        
        #Connect to mongo database
        connect(self.config_['database']['db-name'], 
                username= self.config_['database']['user'], 
                password= self.config_['database']['passwd'], 
                port = self.config_['database']['port'])
        
        #Load articles as test corpus
        user = User.objects(email=u"*****@*****.**").first()
        
        ranked_article_ids = (a.article.id 
                              for a 
                              in RankedArticle.objects(user_id = user.id).only("article"))
        all_article_ids = Set(a.id 
                              for a 
                              in Article.objects(id__in = ranked_article_ids).only("id"))
        
        read_article_ids = Set(a.article.id 
                               for a 
                               in ReadArticleFeedback.objects(user_id = user.id).only("article"))
        
        unread_article_ids = all_article_ids - read_article_ids

        #sample test articles
        X, y = get_samples(extractor, read_article_ids, unread_article_ids)
        
        s,f = X.shape
        logger.debug("Traning with %d samples, %d features, %d marks" % 
                     (s,f, len(y)))

        #train esa model
        esa_model = CosineEsaModel(tfidf_corpus, 
                                   document_titles = document_titles,
                                   test_corpus = X, 
                                   test_corpus_targets = y, 
                                   num_test_corpus = len(y),
                                   num_best_features = 15,
                                   num_features = len(id2token))
        
        print esa_model
        
        esa_model.save('/media/sdc1/test_dump/result/test_cesa.model')
        
        tmp_esa = CosineEsaModel.load('/media/sdc1/test_dump/result/test_cesa.model') 
        print tmp_esa  
Example #15
    def _get_samples(self, 
                     read_article_ids, 
                     unread_article_ids,
                     p_synthetic_samples = 300,
                     p_majority_samples = 500,
                     k = 5):
        '''
        read_article_ids : Set
        unread_article_ids : Set
        p_synthetic_samples : Percentage of synthetic samples, 300 for 300%.
                              If None, no synthetic samples are created.
        p_majority_samples : Size of majority sample = p_majority_samples/n_minority_sample,
                             500 for 500%.
                             If None, under-sampling is not done.
        k : neighbourhood for k nearest neighbour, default 5

        Returns
        -------
        array-like full vector samples, shape = [n_samples, n_features]
        array-like marks, shape = [n_samples]
        '''
        
        #Under-sample unread ids
        if p_majority_samples is not None:
            unread_article_ids = Set(sample(unread_article_ids, 
                                            min(p_majority_samples/100 * len(read_article_ids), 
                                                len(unread_article_ids))
                                            )
                                     )
        
        #Create unread article vectors
        unread_marks = numpy.empty(len(unread_article_ids))
        unread_marks.fill(UserModelSVM.UNREAD)
        unread_articles = numpy.empty(shape=(len(unread_article_ids), 
                                             self.num_features_))
        
        
        for i, article in enumerate(Article.objects(id__in = unread_article_ids)):
            try:
                article_features_as_full_vec = self.get_features(article)
                unread_articles[i,:] = article_features_as_full_vec[:]
            except AttributeError as e:
                logger.error("Article %s does not have attribute: %s." 
                             % (article.id, e))  
                
        #Create read article vectors
        read_marks = numpy.empty(len(read_article_ids))
        read_marks.fill(UserModelSVM.READ)  
        read_articles = numpy.empty(shape=(len(read_article_ids), 
                                             self.num_features_))
        
        for i, article in enumerate(Article.objects(id__in = read_article_ids)):
            try:
                article_features_as_full_vec = self.get_features(article)
                read_articles[i,:] = article_features_as_full_vec[:]
            except AttributeError as e:
                logger.error("Article %s does not have attribute: %s." 
                             % (article.id, e))           
        
        #SMOTE sample minorities
        #synthetic_read_articles = SMOTE(read_articles, p_synthetic_samples, k) 
        
        #borderlineSMOTE sample minorities if p_synthetic_samples is not None
        X = numpy.concatenate((read_articles, unread_articles)) 
        
        self._calculate_mean_and_std_deviation(X)
        X = self._normalize(X)
        
        y = numpy.concatenate((read_marks, unread_marks))
        if p_synthetic_samples is None:
            return X, y
        else:
            new_read_articles, synthetic_read_articles, danger_read_articles = borderlineSMOTE(X = X,
                                                                                               y = y,
                                                                                               minority_target = UserModelSVM.READ,
                                                                                               N = p_synthetic_samples, k = k)
            
            #Create synthetic read samples
            synthetic_marks = numpy.zeros(len(synthetic_read_articles))
            synthetic_marks.fill(UserModelSVM.READ)  
            
            read_marks = numpy.empty(len(new_read_articles))
            read_marks.fill(UserModelSVM.READ)  
            
            danger_read_marks = numpy.empty(len(danger_read_articles))
            danger_read_marks.fill(UserModelSVM.READ)   
            
            logger.info("Use %d read, %d unread, %d danger reads and %d synthetic samples." %
                        (len(read_marks), len(unread_marks), 
                         len(danger_read_marks), len(synthetic_marks)))
        
            return (numpy.concatenate((new_read_articles, 
                                       synthetic_read_articles, 
                                       danger_read_articles,
                                       unread_articles)),
                    numpy.concatenate((read_marks, 
                                      synthetic_marks, 
                                      danger_read_marks,
                                      unread_marks))
                    )  
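
A sketch of a train() body built on _get_samples(), mirroring the DecisionTree variant in Example #5 (the SVM model's real train() is not shown, and resolving None ids from the database is omitted):

    def train(self, read_article_ids, unread_article_ids):
        X, y = self._get_samples(read_article_ids,
                                 unread_article_ids,
                                 p_synthetic_samples=300,
                                 p_majority_samples=500)
        self.clf = svm.SVC(kernel='rbf')
        self.clf.fit(X, y)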
Example #16
    def train(self, read_article_ids = None, unread_article_ids = None):
        '''
        Trains several SVM and Naive Bayes classifiers.
        read_article_ids should be an iterable over read article ids
        unread_article_ids should be an iterable over unread article ids
        
        If one is None it will be loaded from database.
        '''
        
        #Load user feedback if needed
        if read_article_ids is None:
            read_article_ids = Set(r.article.id 
                                for r 
                                in ReadArticleFeedback.objects(user_id = self.user.id).only("article"))
        else:
            read_article_ids = Set(read_article_ids)

        #Get all articles the user did not read.
        if unread_article_ids is None:
            ranked_article_ids = (a.article.id 
                               for a 
                               in RankedArticle.objects(user_id = self.user.id).only("article"))
            all_article_ids = Set(a.id 
                                  for a 
                                  in Article.objects(id__in = ranked_article_ids).only("id"))
            unread_article_ids = all_article_ids - read_article_ids
        
        classifiers = [lambda: svm.SVC(kernel='rbf'), 
                       lambda: svm.SVC(kernel='rbf'),
                       lambda: svm.SVC(kernel='rbf'),
                       lambda: svm.SVC(kernel='rbf'),
                       lambda: svm.SVC(kernel='rbf'),
                       GaussianNB, 
                       GaussianNB, 
                       GaussianNB, 
                       GaussianNB]
        
        parameters = [#SVM
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 100,
                       'p_majority_samples': 200,
                       'k': 10},
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 200,
                       'p_majority_samples': 300,
                       'k': 10},
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 300,
                       'p_majority_samples': 400,
                       'k': 10},
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 400,
                       'p_majority_samples': 500,
                       'k': 10},
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 500,
                       'p_majority_samples': 600,
                       'k': 10},
                      #Naive Bayes
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 100,
                       'p_majority_samples': 100,
                       'k': 10},
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 100,
                       'p_majority_samples': 200,
                       'k': 10},
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 300,
                       'p_majority_samples': 500,
                       'k': 10},
                      {'read_article_ids': read_article_ids, 
                       'unread_article_ids': unread_article_ids,
                       'p_synthetic_samples': 600,
                       'p_majority_samples': 600,
                       'k': 10}]
        
        self._call_classifiers(classifiers, parameters)
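
_call_classifiers() itself is not shown; a plausible sketch is one fitted classifier per parameter set:

    def _call_classifiers(self, classifiers, parameters):
        #Hypothetical: train each classifier on its own sample mix
        self.classifiers = []
        for make_clf, params in zip(classifiers, parameters):
            clf = make_clf()
            X, y = self._get_samples(**params)
            clf.fit(X, y)
            self.classifiers.append(clf)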
Example #17
 #Connect to mongo database
 try:
     connect(config['database']['db-name'], 
             username= config['database']['user'], 
             password= config['database']['passwd'], 
             port = config['database']['port'])
 except connection.ConnectionError as e:
     logger.error("Could not connect to mongodb: %s" % e)
     sys.exit(1)
 
 
 feature_extractor = EsaFeatureExtractor(prefix = config['prefix'] )
 
 #go through each article and convert features
 count = 0
 for article in Article.objects(features__version__ne = feature_extractor.get_version()):
     if count % 10 == 0:
         logger.info("PROGRESS: processing article #%d" % count)
     count += 1
     
     if article.features.version == EsaFeatureExtractor.get_version():
         continue
     
     clean_content = article.clean_content
     
     #get new features
     new_features = feature_extractor.get_features(clean_content)
     
     #save new features
     features = Features(version = feature_extractor.get_version(), data = new_features)
     article.features = features