def test_get_article_attri():
    '''
    smoke test for MyMongo.get_article_attri on the 'nytimes' database
        - INPUT: None
        - OUTPUT: None. prints a sample of the two returned collections and
                  asserts that more than 2 items were retrieved
    '''
    print('\nfunction: %s ' % inspect.stack()[0][3])

    my_mongo = MyMongo(dbname='nytimes')
    try:
        # print my_mongo.get_one_article()
        article_atrri, article_dt = my_mongo.get_article_attri(testing=1)
        print('%i items retrieved' % (len(article_atrri)))
        # zip() over a single iterable wraps every element in a 1-tuple; for
        # a dict the elements are its keys.  list() keeps the slice valid on
        # interpreters where zip() returns an iterator.
        print('title:')
        print(list(zip(article_atrri))[:5])
        print('publication date')
        print(list(zip(article_dt))[:5])
    finally:
        # always release the connection, even when the query or one of the
        # diagnostic prints above raises
        my_mongo.close()

    n.assert_greater(len(article_atrri), 2)
def read_articles():
    '''
    read all articles as dataframe from mongodb collection 'articles'
        - INPUT: None
        - OUTPUT: df.   columns: url, body_text, title, uri, dt
                  title falls back to 'Unknown' and dt to '' for urls that
                  get_article_attri does not know about
    '''
    my_mongo = MyMongo()
    try:
        t0 = time.time()
        cur_articles = my_mongo.get_article_body_text(testing=0)

        # clean_articles is expected to fill this dict in place (its return
        # value is not used); keys become the 'url' column
        articles_cleaned = {}
        clean_articles(cur_articles, articles_cleaned)
        print('%d unique articles with body_text' % len(articles_cleaned))

        t1 = time.time()  # time it
        print("finished in  %4.4fmin for %s " %
              ((t1 - t0) / 60, 'read/clean articles'))

        # body text is taken from index 1 of each cleaned value
        df = pd.DataFrame([{'url': k, 'body_text': v[1]}
                           for k, v in articles_cleaned.items()])

        # reuse the connection that is already open instead of creating a
        # second MyMongo() that would never be closed
        article_dict, article_dt = my_mongo.get_article_attri()
        df['title'] = df['url'].map(lambda x: article_dict.get(x, 'Unknown'))
        df['uri'] = df['url'].map(lambda x: parse_url(x).host)
        df['dt'] = df['url'].map(lambda x: article_dt.get(x, ''))
    finally:
        # close the connection on both the success and the failure path
        my_mongo.close()

    return df
import sys
# sys.path.append('../db')

# Read the project home directory from the shared config file.  The path is
# relative, so it is resolved against the directory the script is started
# from, not against this file's location.
from configobj import ConfigObj
config = ConfigObj('../allds.config')
allds_home = config['allDS_home']
# Extend the import path BEFORE importing my_mongo below; presumably
# <allDS_home>/code/db is where my_mongo.py lives -- TODO confirm.
sys.path.append(allds_home + '/code/db')

from my_mongo import MyMongo
# httplib is the Python 2 module name (http.client on Python 3)
from httplib import BadStatusLine
from boilerpipe.extract import Extractor

if __name__ == '__main__':

    my_mongo = MyMongo()
    query = {'raw_html': {'$exists': 1}, 'body_text': {'$exists': 0}}
    cur_articles = my_mongo.get_articles(query=query)

    articles = list(cur_articles)
    print '%d articles to be processed. ' % (len(articles))
    for a in articles:
        try:
            extractor = Extractor(
                extractor='ArticleExtractor', html=a['raw_html'])
            extracted_text = extractor.getText()
            l = extracted_text.split('\n')
            a_id = a['_id']
            my_mongo.update_record(
                'articles', a_id, 'body_text', extracted_text)
            # print(extracted_text)