def test_generate_story():
    """Test generate_story()."""
    db = connect_to_db()

    story_content = '<title>foo bar</title><meta content="2016-01-12T03:55:46Z" itemprop="datePublished"/>'
    story_url = 'http://foo.com/foo/bar'
    story = generate_story(db=db, url=story_url, content=story_content)

    assert 'stories_id' in story
    assert story['title'] == 'foo bar'
    assert story['publish_date'] == '2016-01-12 03:55:46'
    assert story['url'] == story_url
    assert story['guid'] == story_url

    medium = db.require_by_id('media', story['media_id'])

    assert medium['name'] == 'foo.com'
    assert medium['url'] == 'http://foo.com/'

    feed = db.query(
        "select f.* from feeds f join feeds_stories_map fsm using ( feeds_id ) where stories_id = %(a)s",
        {'a': story['stories_id']}).hash()

    assert feed is not None
    assert feed['name'] == SPIDER_FEED_NAME

    (date_tag, date_tag_set) = get_story_date_tag(db, story)

    assert date_tag['tag'] == 'guess_by_tag_meta'
    assert date_tag_set['name'] == GUESS_METHOD_TAG_SET

    download = db.query("select * from downloads where stories_id = %(a)s", {'a': story['stories_id']}).hash()

    assert download is not None
    assert download['url'] == story['url']

    content = fetch_content(db, download)

    assert content == story_content

    story = generate_story(
        db=db,
        url='http://fallback.date',
        content='foo',
        fallback_date='2011-11-11',
    )

    assert story['publish_date'] == '2011-11-11 00:00:00'

    matched_story = generate_story(db, story['url'], 'foo')
    assert matched_story['stories_id'] == story['stories_id']

    story = generate_story(db=db, url='invalid url', content='foo')
    assert story is not None
def _add_tweet_story(db: DatabaseHandler,
                     topic: Dict[str, Any],
                     tweet: dict,
                     topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    title = f"{screen_name}: {content}"
    tweet_date = tweet['created_at']
    url = f"https://twitter.com/{screen_name}/status/{tweet['id']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)

    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    urls = get_tweet_urls(tweet)
    for url in urls:
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], url):
            log.debug("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url,
        }

        db.create('topic_links', topic_link)
        increment_domain_links(db, topic_link)

    return story
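# A minimal sketch of the tweet payload that _add_tweet_story() expects, as a
# usage reference: only the keys the function reads directly are shown, and all
# values here are invented for illustration (get_tweet_urls() may look at more
# of the raw twitter api payload).
sample_tweet = {
    'id': 1234567890,                     # used to build the status URL
    'text': 'check this https://t.co/x',  # becomes the story content
    'created_at': '2016-01-12 03:55:46',  # passed as the publish date
    'user': {
        'screen_name': 'someuser',        # used in both the title and the URL
    },
}

# story = _add_tweet_story(db, topic, sample_tweet, topic_fetch_urls=[])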
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    title = f"{user['name']} ({user['screen_name']}) | Twitter"
    tweet_date = sql_now()
    url = f"https://twitter.com/{user['screen_name']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)

    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)

    stories_id = story['stories_id']
    tags_id = undateable_tag['tags_id']

    db.query("""
        INSERT INTO public.stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """, {
        'stories_id': stories_id,
        'tags_id': tags_id,
    })

    return story
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)

    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': undateable_tag['tags_id']})

    return story
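# Likewise, a minimal sketch of the user payload that _add_user_story() expects;
# the keys match the reads above and the values are invented for illustration.
sample_user = {
    'name': 'Some User',                  # display name, used in content and title
    'screen_name': 'someuser',            # handle, used in content, title and URL
    'description': 'An example profile',  # bio, appended to the story content
}

# story = _add_user_story(db, topic, sample_user, topic_fetch_urls=[])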
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""
    log.info(f"Trying to fetch topic URL {topic_fetch_url['url']}...")

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        log.info(f"URL's state '{topic_fetch_url['state']}' is not pending or requeued, not refetching")
        return

    log.info("Checking ignore links...")
    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        log.info("Link is to be ignored, returning")
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    log.info("Checking failed URL...")
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        log.info("URL is failed, returning")
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    log.info("Checking self-linked domain...")
    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        log.info("Link is self-linked domain, returning")
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    log.info(f"Fetching topic {topic_fetch_url['topics_id']}...")
    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    log.info("Checking story match...")
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        log.info("URL is in pending state, getting story match...")
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            log.info(f"Matched story {story_match['stories_id']}, returning")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, e.g. fetch_twitter_urls
    log.info("Checking for pending state...")
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        log.info("URL is in pending state, returning")
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    log.info("Checking seeded content...")
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        log.info("No seeded content found, fetching URL...")
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.info(f"{response.code} response returned")
    else:
        log.debug(f"Seeded content found for URL: {topic_fetch_url['url']}")

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        log.info(f"Fetched URL {fetched_url} is not the same as response URL {response_url}, "
                 f"testing for ignore link pattern")
        if _ignore_link_pattern(response_url):
            log.info("Ignore link pattern matched, returning")
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        log.info("Checking story match for redirect URL...")
        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    log.info("Checking content match...")
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        log.info("Request failed")
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        log.info(f"Story {story_match['stories_id']} matched")
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        log.info("Content does not match topic")
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        log.info("Nothing matched, generating story...")
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url

            log.info("Creating story...")
            story = generate_story(db=db, content=content, url=url)
            log.info(f"Created story {story['stories_id']}")

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:
            log.info("Duplicate story found, checking for story match on unique constraint error...")

            # may get a unique constraint error for the story addition within the media source. that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)

            if story_match is None:
                message = "Unable to find matching story after unique constraint error."
                log.error(message)
                raise McTMFetchLinkException(message)

            log.info(f"Matched story {story_match['stories_id']}")
            topic_fetch_url['stories_id'] = story_match['stories_id']

        log.info("Done generating story")

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")

    log.info(f"Done trying to fetch topic URL {topic_fetch_url['url']}.")
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""
    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, e.g. fetch_twitter_urls
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" % (response.code, topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            story = generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source. that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db, url=fetched_url, redirect_url=response_url)

            if story_match is None:
                raise McTMFetchLinkException("Unable to find matching story after unique constraint error.")

            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
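# To summarize the state machine implemented above, a sketch of the
# topic_fetch_url row that _try_fetch_topic_url() operates on. The keys mirror
# what the function reads ('topics_id', 'url', 'state', 'assume_match') and
# writes ('state', 'code', 'message', 'fetch_date', 'stories_id'); the values
# are illustrative.
sample_topic_fetch_url = {
    'topics_id': 42,
    'url': 'http://example.com/article',
    'state': FETCH_STATE_PENDING,  # only pending/requeued rows get processed
    'assume_match': False,
}

# _try_fetch_topic_url(db, sample_topic_fetch_url)
#
# Afterwards, 'state' holds one of the terminal states assigned above:
# FETCH_STATE_IGNORED, FETCH_STATE_SKIPPED, FETCH_STATE_STORY_MATCH,
# FETCH_STATE_REQUEST_FAILED, FETCH_STATE_CONTENT_MATCH_FAILED,
# FETCH_STATE_STORY_ADDED, or a delayed state from _get_pending_state().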
def test_generate_story():
    """Test generate_story()."""
    db = connect_to_db()

    story_content = '<title>foo bar</title><meta content="2016-01-12T03:55:46Z" itemprop="datePublished"/>'
    story_url = 'http://foo.com/foo/bar'
    story = generate_story(db=db, url=story_url, content=story_content)

    assert 'stories_id' in story
    assert story['title'] == 'foo bar'
    assert story['publish_date'] == '2016-01-12 03:55:46'
    assert story['url'] == story_url
    assert story['guid'] == story_url

    medium = db.require_by_id('media', story['media_id'])

    assert medium['name'] == 'foo.com'
    assert medium['url'] == 'http://foo.com/'

    feed = db.query("""
        SELECT f.*
        FROM feeds_stories_map AS fsm
            INNER JOIN feeds AS f ON fsm.feeds_id = f.feeds_id
        WHERE fsm.stories_id = %(stories_id)s
    """, {
        'stories_id': story['stories_id'],
    }).hash()

    assert feed is not None
    assert feed['name'] == SPIDER_FEED_NAME

    (date_tag, date_tag_set) = get_story_date_tag(db, story)

    assert date_tag['tag'] == 'guess_by_tag_meta'
    assert date_tag_set['name'] == GUESS_METHOD_TAG_SET

    download = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
    """, {
        'stories_id': story['stories_id'],
    }).hash()

    assert download is not None
    assert download['url'] == story['url']

    content = fetch_content(db, download)

    assert content == story_content

    story = generate_story(
        db=db,
        url='http://fallback.date',
        content='foo',
    )

    assert story['publish_date'] is None

    matched_story = generate_story(db, story['url'], 'foo')
    assert matched_story['stories_id'] == story['stories_id']

    story = generate_story(db=db, url='invalid url', content='foo')
    assert story is not None