def read_articles():
    '''
    Read all articles from the mongodb collection 'articles' as a dataframe.

    - INPUT: None
    - OUTPUT: df. columns:
        url        key of each cleaned article
        body_text  element [1] of the cleaned-article entry for that url
        title      looked up by url, 'Unknown' when there is no entry
        uri        host part of the parsed url
        dt         looked up by url, '' when there is no entry

    Prints the number of cleaned articles and the time spent reading and
    cleaning them.
    '''
    my_mongo = MyMongo()
    try:
        t0 = time.time()
        cur_articles = my_mongo.get_article_body_text(testing=0)

        # clean_articles fills this dict in place; it is consumed below as
        # url -> entry, where entry[1] is the body text.
        articles_cleaned = {}
        clean_articles(cur_articles, articles_cleaned)

        # Single-argument parenthesised prints produce the same output under
        # the Python 2 print statement and the Python 3 print function.
        print('%d unique articles with body_text' % len(articles_cleaned))
        t1 = time.time()  # time it
        print("finished in %4.4fmin for %s "
              % ((t1 - t0) / 60, 'read/clean articles'))

        rows = [{'url': url, 'body_text': entry[1]}
                for url, entry in articles_cleaned.items()]
        if rows:
            df = pd.DataFrame(rows)
        else:
            # With no rows pandas cannot infer the columns, and the
            # df['url'] lookups below would raise KeyError.
            df = pd.DataFrame(columns=['url', 'body_text'])

        # Reuse the open handle rather than creating a second MyMongo()
        # that would never be closed.
        article_dict, article_dt = my_mongo.get_article_attri()

        df['title'] = df['url'].map(lambda x: article_dict.get(x, 'Unknown'))
        df['uri'] = df['url'].map(lambda x: parse_url(x).host)
        df['dt'] = df['url'].map(lambda x: article_dt.get(x, ''))
        return df
    finally:
        # Always release the handle, even if reading or cleaning fails.
        my_mongo.close()