Beispiel #1
0
def parse_data(tablename, site, url_list):
    """Fetch every feed in *url_list* and store its entries in *tablename*.

    For each entry the description is taken from the first ``content``
    element when the entry has one, otherwise from ``summary``.  Markup
    tags are stripped from the description before the entry is saved via
    ``CollectionMapping(tablename).load_json`` with the keys ``site``,
    ``slug``, ``name``, ``description`` and ``link``.

    Args:
        tablename: Name handed to ``CollectionMapping`` for every entry.
        site: Value stored under the ``site`` key of every saved entry.
        url_list: Iterable of feed URLs passed to ``feedparser.parse``.

    Returns:
        None.  Entries are persisted as a side effect.

    Raises:
        KeyError: If an entry lacks ``title`` or ``link``, or has neither
            a usable ``content`` element nor a ``summary``.
    """
    # Match a single tag at a time.  The previous pattern '<.*>' was
    # greedy and removed everything between the first '<' and the last
    # '>' on a line, wiping out the text enclosed by the tags as well.
    tag_pattern = re.compile(r'<[^>]*>')

    for url in url_list:
        feed = feedparser.parse(url)
        for news in feed.entries:
            title = news['title']
            link = news['link']
            try:
                description = news['content'][0]['value']
            except (KeyError, IndexError, TypeError):
                # No usable full-content element: fall back to the summary.
                description = news['summary']
            description = tag_pattern.sub('', description)

            # A fresh mapping object is created per entry, as before.
            collection_obj = CollectionMapping(tablename)
            collection_obj.load_json({
                'site': site,
                'slug': slugify(title),
                'name': title,
                'description': description,
                'link': link,
            })
Beispiel #2
0
            except:
                description=news['summary']
            description=re.sub('<.*>','',description)
            slug=slugify(title)
            collection_obj=CollectionMapping(tablename)
            collection_obj.load_json({'site':site,'slug':slug,'name':title,'description':description,'link':link,})

if __name__ == "__main__":
    # Script entry point: rebuild the stored news and the per-category
    # counts from scratch on every run.

    # delete old news
    CollectionMapping('news_news').delete_all()
    CollectionMapping('news_category').delete_all()

    # fetch news feed; each value of the inner dict is handed to
    # parse_data as its url_list argument
    for tablename, url_dict in news_dict.items():
        for site, url_list in url_dict.items():
            parse_data(tablename, site, url_list)

    # add training data
    category_set = TrainClassifier.train_classifier(news_training_dict)
    # Every training category starts with a count of 1.
    # NOTE(review): presumably so no category ends up at zero -- confirm.
    category_dict = {slugify(category): 1 for category in news_training_dict}

    # classify each document
    for news in CollectionMapping('news_news').objects.all():
        bayes_obj = BayesClassifier(category_set)
        # returns each obj with category_list attribute ordered according to their score
        obj = bayes_obj.find_posterior("%s %s" % (news.name, news.description))
        # Build a real list: on Python 3 ``map`` returns a lazy iterator,
        # which is not the list of category names that should be stored.
        news.update(
            category_list=[category for category, _score in obj.category_list]
        )
        # NOTE(review): assumes every returned category is already a key of
        # category_dict (i.e. a slugified training category) -- confirm.
        for category, _score in obj.category_list:
            category_dict[category] += 1

    category_obj = CollectionMapping('news_category')
    category_obj.save(type="education", category=category_dict)