# Disabled relative-path variant (the config-based path below is used instead): sys.path.append('../db')

from configobj import ConfigObj
# Load the project configuration and extend the import path so that the
# project-local database helpers can be imported further down.
#
# `sys` is referenced below but is not imported anywhere in the visible part
# of this file; importing it here is harmless if it was already imported.
import sys

# NOTE(review): '../allds.config' is relative to the current working
# directory, so this presumably expects the script to be launched from its
# own directory -- confirm.
config = ConfigObj('../allds.config')
# Presumably the root directory of the allDS checkout; a missing key raises
# KeyError right here.
allds_home = config['allDS_home']
# Presumably the directory that holds the local `my_mongo` module -- verify.
sys.path.append(allds_home + '/code/db')

from my_mongo import MyMongo
from httplib import BadStatusLine
from boilerpipe.extract import Extractor

if __name__ == '__main__':

    my_mongo = MyMongo()
    query = {'raw_html': {'$exists': 1}, 'body_text': {'$exists': 0}}
    cur_articles = my_mongo.get_articles(query=query)

    articles = list(cur_articles)
    print '%d articles to be processed. ' % (len(articles))
    for a in articles:
        try:
            extractor = Extractor(
                extractor='ArticleExtractor', html=a['raw_html'])
            extracted_text = extractor.getText()
            l = extracted_text.split('\n')
            a_id = a['_id']
            my_mongo.update_record(
                'articles', a_id, 'body_text', extracted_text)
            # print(extracted_text)
# do something with page
        except BadStatusLine: