def test_bbc_story():
    """Run get_story on a fixed BBC article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = 'http://www.bbc.com/news/world-us-canada-41973952'
    result = get_story(make_browser(), url, sites['bbc']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_nola_story():
    """Run get_story on a fixed nola.com article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    session = make_browser()
    url = ('http://www.nola.com/northshore/index.ssf/2018/02/'
           'three_st_tammany_students_accu.html'
           '#incart_2box_nola_river_orleans_news')
    result = get_story(session, url, sites['nola']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_usa_story():
    """Run get_story on a fixed USA Today article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('https://www.usatoday.com/story/money/nation-now/2018/02/26/'
           'trump-just-claimed-u-s-makes-better-solar-panels-than-china-'
           'thats-not-quite-right/375307002/')
    result = get_story(make_browser(), url, sites['usa']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_guardian_story():
    """Run get_story on a fixed Guardian article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('https://www.theguardian.com/us-news/2017/nov/09/'
           'one-year-later-trump-takes-a-grand-tour-of-asia-as-clinton-'
           'visits-wisconsin-finally')
    result = get_story(make_browser(), url, sites['guardian']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_nyTimes_story():
    """Run get_story on a fixed New York Times article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    # The query string previously contained a registered-sign character where
    # "&region=" belongs (an "&reg" entity had been decoded); the separator
    # is restored here so the URL is plain ASCII and well-formed.
    url = ('https://www.nytimes.com/2017/11/09/opinion/'
           'nuisance-ordinances-eviction-violence.html'
           '?action=click&pgtype=Homepage&clickSource=story-heading'
           '&module=opinion-c-col-left-region'
           '&region=opinion-c-col-left-region'
           '&WT.nav=opinion-c-col-left-region')
    browser = make_browser()
    story = get_story(browser, url, sites['nyTimes']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(story[field]) > 0
def test_npr_story():
    """Run get_story on a fixed NPR article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('https://www.npr.org/2018/02/27/585133064/'
           'lawmakers-agree-on-paid-family-leave-but-not-the-details')
    result = get_story(make_browser(), url, sites['npr']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_nbc_story():
    """Run get_story on a fixed NBC News article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('https://www.nbcnews.com/news/us-news/'
           'parkland-school-shooting-stoneman-douglas-students-prepare-'
           'confront-memories-they-n851656')
    result = get_story(make_browser(), url, sites['nbc']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_la_times_story():
    """Run get_story on a fixed LA Times article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = 'http://www.latimes.com/politics/la-na-pol-jared-kushner-20180227-story.html'
    result = get_story(make_browser(), url, sites['la_times']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_cbs_story():
    """Run get_story on a fixed CBS News article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('https://www.cbsnews.com/news/'
           'brad-parscale-trump-2020-campagin-manager-announced-today-2018-02-27/')
    result = get_story(make_browser(), url, sites['cbs']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_metro_story():
    """Run get_story on a fixed Metro (metro.co.uk) article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('http://metro.co.uk/2018/02/27/'
           'kevin-spacey-foundation-shut-uk-actor-faces-sexual-assault-'
           'allegations-7347287/')
    result = get_story(make_browser(), url, sites['metro']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_verge_story():
    """Run get_story on a fixed The Verge article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('https://www.theverge.com/2018/2/27/17060092/'
           'blackberry-world-app-store-paid-apps-discontinuation-removal-april-1')
    result = get_story(make_browser(), url, sites['verge']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_fox_story():
    """Run get_story on a fixed Fox News article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('http://www.foxnews.com/politics/2018/02/27/'
           'supreme-court-rules-that-detained-immigrants-dont-get-automatic-'
           'bond-hearings.html')
    result = get_story(make_browser(), url, sites['fox']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def test_eOnline_story():
    """Run get_story on a fixed E! Online article and require non-empty fields.

    The 'title', 'desc', 'story' and 'image' entries of the result must each
    have a length greater than zero.
    """
    url = ('http://www.eonline.com/news/893550/'
           'did-kylie-jenner-have-a-private-baby-shower-all-the-details-on-'
           'her-pink-filled-celebration')
    result = get_story(make_browser(), url, sites['eOnline']['story_xpath'])
    for field in ('title', 'desc', 'story', 'image'):
        assert len(result[field]) > 0
def main():
    """Collect story links for every configured site and publish the stories.

    For each job in ``config.WORK`` the links matching the job's
    ``link_regex`` are gathered with ``get_links``.  The combined
    ``(site name, link)`` pairs are de-duplicated, shuffled, and then scraped
    one at a time with ``get_story``; every story whose ``'story'`` field is
    non-empty is handed to ``publish_story`` on ``config.QUEUE_NAME``.

    Failures while collecting links, scraping, or publishing are printed and
    the browser / queue connection is re-created; the failing item is skipped
    instead of aborting the whole run.
    """
    browser = config.connect_browser()
    db = Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)

    # To look less like a bot, gather every candidate link first and visit
    # them in random order instead of walking one site at a time.
    unique_links = set()
    for name, job in config.WORK.items():
        try:
            print(job['url'])
            new_links = get_links(browser, job['url'], job['link_regex'])
        except Exception as e:
            # Browser sessions get a little funky; report the error and
            # refresh the connection before moving on to the next site.
            print(e)
            browser = config.connect_browser()
            continue
        unique_links.update((name, link) for link in new_links)

    # De-duplicate BEFORE shuffling: iterating a set of the shuffled list
    # would throw the shuffled order away again.
    links = list(unique_links)
    random.shuffle(links)

    channel = config.setup_mq(config.QUEUE_NAME)
    for entry in links:
        site, url = entry

        # Avoid doing unnecessary duplicate work.
        # NOTE(review): the key checked is the (site, url) tuple itself, as
        # before; confirm this matches how the keys are written to Redis.
        if db.exists(entry):
            continue

        # Random pause between requests, just in case.
        time.sleep(random.randint(1, 8))

        try:
            print(entry)
            story = get_story(browser, url, config.WORK[site]['story_xpath'])
        except Exception as e:
            print(e)
            browser = config.connect_browser()
            continue

        # Quick filtering to avoid invalid (empty) stories.
        if not story['story']:
            continue

        # Connections are often lost; report the failure and reconnect so the
        # next story can be published.  The current story is not re-sent.
        try:
            publish_story(channel, config.QUEUE_NAME, story)
        except Exception as e:
            print(e)
            channel = config.setup_mq(config.QUEUE_NAME)