def assign_date_guess_tag(
        db: DatabaseHandler,
        story: dict,
        date_guess: GuessDateResult,
        fallback_date: Optional[str]) -> None:
    """Tag the story with the method used to guess its date.

    When the date guess succeeded, assign a date_guess_method:guess_by_url,
    guess_by_tag_*, or guess_by_unknown tag.  When it failed but a fallback_date
    is available, assign date_guess_method:fallback_date.  Otherwise assign
    date_invalid:date_invalid.

    Arguments:
    db - db handle
    story - story dict from db
    date_guess - GuessDateResult from guess_date() call

    Returns: None
    """
    if date_guess.found:
        tag_set = GUESS_METHOD_TAG_SET
        method = date_guess.guess_method
        if method.startswith('Extracted from url'):
            tag = 'guess_by_url'
        elif method.startswith('Extracted from tag'):
            # Name the tag after the HTML element the date came from, e.g. guess_by_tag_meta.
            tag_match = re2.search(r'\<(\w+)', method)
            element_name = tag_match.group(1) if tag_match is not None else 'unknown'
            tag = 'guess_by_tag_' + str(element_name)
        else:
            tag = 'guess_by_unknown'
    elif fallback_date is not None:
        tag_set = GUESS_METHOD_TAG_SET
        tag = 'fallback_date'
    else:
        tag_set = INVALID_TAG_SET
        tag = INVALID_TAG

    tag_set_row = db.find_or_create('tag_sets', {'name': tag_set})
    tag_row = db.find_or_create('tags', {'tag': tag, 'tag_sets_id': tag_set_row['tag_sets_id']})

    story_id = story['stories_id']
    new_tag_id = tag_row['tags_id']

    # Clear existing tags for the story before recording the guess-method tag.
    db.query(
        """ DELETE FROM stories_tags_map WHERE stories_id = %(stories_id)s """,
        {'stories_id': story_id}
    )
    db.query(
        """ INSERT INTO stories_tags_map (stories_id, tags_id) VALUES (%(stories_id)s, %(tags_id)s) ON CONFLICT (stories_id, tags_id) DO NOTHING """,
        {
            'stories_id': story_id,
            'tags_id': new_tag_id,
        })
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created (story is a duplicate or its GUID
    already exists).

    Raises McAddStoryException when called from within a transaction or when the insert fails
    for a reason other than a GUID conflict.
    """
    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    # Serialize story inserts so concurrent writers can't add the same story twice.
    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        # An empty description means there is no full text in the RSS item to rely on.
        # ("or ''" guards against an explicit None description, which would crash len().)
        if len(story.get('description', '') or '') == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            # BUGFIX: message used to read "to GUID conflict"; it describes the cause.
            log.warning(
                "Failed to add story for '{}' due to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None
        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
def update_extractor_version_tag(db: DatabaseHandler, stories_id: int, extractor_version: str) -> None:
    """Add extractor version tag to the story."""
    # FIXME no caching because unit tests run in the same process so a cached tag set / tag will not be recreated.
    # Purging such a cache manually is very error-prone.

    if isinstance(stories_id, bytes):
        stories_id = decode_object_from_bytes_if_needed(stories_id)
    stories_id = int(stories_id)
    extractor_version = decode_object_from_bytes_if_needed(extractor_version)

    version_tag_set = db.find_or_create(table='tag_sets', insert_hash={'name': extractor_version_tag_sets_name()})

    # Drop any previous extractor-version tag for this story before recording the current one.
    db.query(""" DELETE FROM stories_tags_map AS stm USING tags AS t JOIN tag_sets AS ts ON ts.tag_sets_id = t.tag_sets_id WHERE t.tags_id = stm.tags_id AND ts.tag_sets_id = %(tag_sets_id)s AND stm.stories_id = %(stories_id)s """, {
        'tag_sets_id': version_tag_set['tag_sets_id'],
        'stories_id': stories_id,
    })

    version_tag = db.find_or_create(
        table='tags',
        insert_hash={'tag': extractor_version, 'tag_sets_id': version_tag_set['tag_sets_id']},
    )

    db.query(""" INSERT INTO stories_tags_map (stories_id, tags_id) VALUES (%(stories_id)s, %(tags_id)s) """, {'stories_id': stories_id, 'tags_id': version_tag['tags_id']})
def update_extractor_version_tag(db: DatabaseHandler, story: dict) -> None:
    """Add extractor version tag to the story."""
    # FIXME no caching because unit tests run in the same process so a cached tag set / tag will not be recreated.
    # Purging such a cache manually is very error-prone.

    story = decode_object_from_bytes_if_needed(story)

    tag_set = db.find_or_create(table='tag_sets', insert_hash={'name': extractor_version_tag_sets_name()})

    stories_id = story['stories_id']

    # Remove whatever extractor-version tag the story carried before adding the current one.
    db.query(""" DELETE FROM stories_tags_map AS stm USING tags AS t JOIN tag_sets AS ts ON ts.tag_sets_id = t.tag_sets_id WHERE t.tags_id = stm.tags_id AND ts.tag_sets_id = %(tag_sets_id)s AND stm.stories_id = %(stories_id)s """, {
        'tag_sets_id': tag_set['tag_sets_id'],
        'stories_id': stories_id,
    })

    tag = db.find_or_create(
        table='tags',
        insert_hash={'tag': extractor_name(), 'tag_sets_id': tag_set['tag_sets_id']},
    )

    db.query(""" INSERT INTO stories_tags_map (stories_id, tags_id) VALUES (%(stories_id)s, %(tags_id)s) """, {'stories_id': stories_id, 'tags_id': tag['tags_id']})
def guess_medium(db: DatabaseHandler, story_url: str) -> dict:
    """Guess the media source for a story with the given url.

    The guess is based on a normalized version of the host part of the url.  The guess takes
    into account the duplicate media relationships included in the postgres database through
    the media.dup_media_id fields.  If no appropriate media source exists, this function will
    create a new one and return it.
    """
    (medium_url, medium_name) = generate_medium_url_and_name_from_url(story_url)

    existing_medium = lookup_medium(db, medium_url, medium_name)
    if existing_medium is not None:
        return existing_medium

    normalized_medium_url = _normalize_url(medium_url)
    normalized_story_url = _normalize_url(story_url)

    candidate_urls = [
        normalized_medium_url,
        medium_url,
        normalized_story_url,
        story_url
    ]

    # avoid conflicts with existing media names and urls that are missed
    # by the above query b/c of dups feeds or foreign_rss_links
    medium_name = get_unique_medium_name(db, [medium_name] + candidate_urls)
    medium_url = get_unique_medium_url(db, candidate_urls)

    # a race condition with another thread can cause this to fail sometimes, but after the medium in the
    # other process has been created, all should be fine
    medium = None
    attempts_left = _GUESS_MEDIUM_RETRIES
    while attempts_left > 0:
        medium = db.find_or_create('media', {
            'name': medium_name,
            'url': medium_url,
            'normalized_url': normalized_medium_url
        })
        if medium is not None:
            break
        attempts_left -= 1
        time.sleep(1)

    if medium is None:
        raise McTopicMediaUniqueException(
            "Unable to find or create medium for %s / %s" % (medium_name, medium_url))

    log.info("add medium: %s / %s / %d" % (medium_name, medium_url, medium['media_id']))

    spidered_tag = get_spidered_tag(db)
    db.find_or_create('media_tags_map', {
        'media_id': medium['media_id'],
        'tags_id': spidered_tag['tags_id']
    })

    return medium
def assign_date_guess_tag(
        db: DatabaseHandler,
        story: dict,
        date_guess: GuessDateResult,
        fallback_date: typing.Optional[str]) -> None:
    """Tag the story with the method that produced its date guess.

    A successful guess yields a date_guess_method:guess_by_url, guess_by_tag_*, or
    guess_by_unknown tag.  A failed guess with a fallback_date yields
    date_guess_method:fallback_date; otherwise the story is tagged
    date_invalid:date_invalid.

    Arguments:
    db - db handle
    story - story dict from db
    date_guess - GuessDateResult from guess_date() call

    Returns: None
    """
    if not date_guess.found:
        if fallback_date is None:
            tag_set = mediawords.tm.guess_date.INVALID_TAG_SET
            tag = mediawords.tm.guess_date.INVALID_TAG
        else:
            tag_set = mediawords.tm.guess_date.GUESS_METHOD_TAG_SET
            tag = 'fallback_date'
    else:
        tag_set = mediawords.tm.guess_date.GUESS_METHOD_TAG_SET
        method_description = date_guess.guess_method
        if method_description.startswith('Extracted from url'):
            tag = 'guess_by_url'
        elif method_description.startswith('Extracted from tag'):
            # Name the tag after the HTML element the date came from, e.g. guess_by_tag_meta.
            element_match = re2.search(r'\<(\w+)', method_description)
            element_name = 'unknown' if element_match is None else element_match.group(1)
            tag = 'guess_by_tag_' + str(element_name)
        else:
            tag = 'guess_by_unknown'

    tag_set_row = db.find_or_create('tag_sets', {'name': tag_set})
    tag_row = db.find_or_create('tags', {
        'tag': tag,
        'tag_sets_id': tag_set_row['tag_sets_id']
    })

    # Clear existing tags for the story before recording the guess-method tag.
    db.query("delete from stories_tags_map where stories_id = %(a)s", {'a': story['stories_id']})
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': tag_row['tags_id']})
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Find or create the 'Spider Feed' feed for the media source."""
    existing_feed = db.query(
        """ SELECT * FROM feeds WHERE media_id = %(media_id)s AND name = %(name)s """,
        {
            'media_id': medium['media_id'],
            'name': SPIDER_FEED_NAME,
        }
    ).hash()

    if existing_feed is not None:
        return existing_feed

    # No such feed yet -- create an inactive placeholder feed for spidered stories.
    return db.find_or_create(
        'feeds',
        {
            'media_id': medium['media_id'],
            'url': medium['url'] + '#spiderfeed',
            'name': SPIDER_FEED_NAME,
            'active': False,
        },
    )
def guess_medium(db: DatabaseHandler, story_url: str) -> dict:
    """Guess the media source for a story with the given url.

    The guess is based on a normalized version of the host part of the url.  The guess takes
    into account the duplicate media relationships included in the postgres database through
    the media.dup_media_id fields.  If no appropriate media source exists, this function will
    create a new one and return it.
    """
    (medium_url, medium_name) = generate_medium_url_and_name_from_url(story_url)

    known_medium = lookup_medium(db, medium_url, medium_name)
    if known_medium is not None:
        return known_medium

    normalized_medium_url = _normalize_url(medium_url)
    normalized_story_url = _normalize_url(story_url)
    url_candidates = [normalized_medium_url, medium_url, normalized_story_url, story_url]

    # avoid conflicts with existing media names and urls that are missed
    # by the above query b/c of dups feeds or foreign_rss_links
    medium_name = get_unique_medium_name(db, [medium_name] + url_candidates)
    medium_url = get_unique_medium_url(db, url_candidates)

    # a race condition with another thread can cause this to fail sometimes, but after the medium in the
    # other process has been created, all should be fine
    medium = None
    for _ in range(_GUESS_MEDIUM_RETRIES):
        medium = db.find_or_create(
            'media',
            {'name': medium_name, 'url': medium_url, 'normalized_url': normalized_medium_url},
        )
        if medium is not None:
            break
        time.sleep(1)

    if medium is None:
        raise McTopicMediaUniqueException(
            "Unable to find or create medium for %s / %s" % (medium_name, medium_url))

    log.info("add medium: %s / %s / %d" % (medium_name, medium_url, medium['media_id']))

    spidered_tag = get_spidered_tag(db)
    db.find_or_create('media_tags_map', {'media_id': medium['media_id'], 'tags_id': spidered_tag['tags_id']})

    return medium
def assign_date_guess_tag(
        db: DatabaseHandler,
        story: dict,
        date_guess: GuessDateResult,
        fallback_date: typing.Optional[str]) -> None:
    """Assign a guess method tag to the story based on the date_guess result.

    If date_guess found a result, assign a date_guess_method:guess_by_url, guess_by_tag_*,
    or guess_by_unknown tag.  Otherwise if there is a fallback_date, assign
    date_guess_method:fallback_date.  Else assign date_invalid:date_invalid.

    Arguments:
    db - db handle
    story - story dict from db
    date_guess - GuessDateResult from guess_date() call

    Returns: None
    """
    # Pick the (tag set, tag) pair describing how the date was determined.
    if date_guess.found:
        guess_method = date_guess.guess_method
        if guess_method.startswith('Extracted from url'):
            chosen = (mediawords.tm.guess_date.GUESS_METHOD_TAG_SET, 'guess_by_url')
        elif guess_method.startswith('Extracted from tag'):
            # Extract the HTML element name from the method description.
            m = re2.search(r'\<(\w+)', guess_method)
            chosen = (
                mediawords.tm.guess_date.GUESS_METHOD_TAG_SET,
                'guess_by_tag_' + str(m.group(1) if m is not None else 'unknown'),
            )
        else:
            chosen = (mediawords.tm.guess_date.GUESS_METHOD_TAG_SET, 'guess_by_unknown')
    elif fallback_date is not None:
        chosen = (mediawords.tm.guess_date.GUESS_METHOD_TAG_SET, 'fallback_date')
    else:
        chosen = (mediawords.tm.guess_date.INVALID_TAG_SET, mediawords.tm.guess_date.INVALID_TAG)

    (tag_set_name, tag_name) = chosen

    tag_set_row = db.find_or_create('tag_sets', {'name': tag_set_name})
    tag_row = db.find_or_create('tags', {'tag': tag_name, 'tag_sets_id': tag_set_row['tag_sets_id']})

    # Clear existing tags for the story before recording the guess-method tag.
    db.query("delete from stories_tags_map where stories_id = %(a)s", {'a': story['stories_id']})
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': tag_row['tags_id']})
def get_spidered_tag(db: DatabaseHandler) -> dict:
    """Return the spidered:spidered tag dict."""
    tag = db.query(
        """ select t.* from tags t join tag_sets ts using ( tag_sets_id ) where t.tag = %(a)s and ts.name = %(b)s """,
        {'a': SPIDERED_TAG_TAG, 'b': SPIDERED_TAG_SET}).hash()

    if tag is not None:
        return tag

    # Tag does not exist yet -- create both the tag set and the tag.
    tag_set = db.find_or_create('tag_sets', {'name': SPIDERED_TAG_SET})
    return db.find_or_create('tags', {'tag': SPIDERED_TAG_TAG, 'tag_sets_id': tag_set['tag_sets_id']})
def _import_ap_story(db: DatabaseHandler, ap_story: dict) -> None:
    """Given a ap story return by get_new_stories(), add it to the database."""
    ap_medium = db.query(
        """ SELECT * FROM media WHERE name = %(medium_name)s """,
        {
            'medium_name': AP_MEDIUM_NAME,
        }).hash()

    # NOTE(review): assumes the AP medium row already exists; a missing row would make
    # ap_medium None and crash below -- confirm the medium is seeded elsewhere.
    feed = db.find_or_create('feeds', {
        'media_id': ap_medium['media_id'],
        'name': 'API Feed',
        'active': False,
        'type': 'syndicated',
        'url': 'http://ap.com'
    })

    story = add_story(db, {
        'guid': ap_story['guid'],
        'url': ap_story['url'],
        'publish_date': ap_story['publish_date'],
        'title': ap_story['title'],
        'description': ap_story['description'],
        'media_id': ap_medium['media_id']
    }, feed['feeds_id'])

    # add_story() returns None when the story already exists; nothing more to do then.
    if not story:
        return

    download = create_download_for_new_story(db, story, feed)

    db.query(
        """ INSERT INTO download_texts (downloads_id, download_text, download_text_length) VALUES (%(downloads_id)s, %(download_text)s, %(download_text_length)s) """,
        {
            'downloads_id': download['downloads_id'],
            'download_text': ap_story['text'],
            'download_text_length': len(ap_story['text'])
        })

    # Send to the extractor for it to do vectorization, language detection, etc.
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
        stories_id=story['stories_id'],
        use_existing=True,
    )
def get_spidered_tag(db: DatabaseHandler) -> dict:
    """Return the spidered:spidered tag dict."""
    spidered = db.query(
        """ select t.* from tags t join tag_sets ts using ( tag_sets_id ) where t.tag = %(a)s and ts.name = %(b)s """,
        {
            'a': SPIDERED_TAG_TAG,
            'b': SPIDERED_TAG_SET
        }).hash()

    if spidered is None:
        # Tag does not exist yet -- create the tag set first, then the tag itself.
        ts = db.find_or_create('tag_sets', {'name': SPIDERED_TAG_SET})
        spidered = db.find_or_create('tags', {
            'tag': SPIDERED_TAG_TAG,
            'tag_sets_id': ts['tag_sets_id']
        })

    return spidered
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Find or create the 'Spider Feed' feed for the media source."""
    existing_feed = db.query(
        "select * from feeds where media_id = %(a)s and name = %(b)s",
        {'a': medium['media_id'], 'b': SPIDER_FEED_NAME}).hash()

    if existing_feed is not None:
        return existing_feed

    # No such feed yet -- create an inactive placeholder feed for spidered stories.
    return db.find_or_create('feeds', {
        'media_id': medium['media_id'],
        'url': medium['url'] + '#spiderfeed',
        'name': SPIDER_FEED_NAME,
        'feed_status': 'inactive'
    })
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Find or create the 'Spider Feed' feed for the media source."""
    feed_row = db.query(
        "select * from feeds where media_id = %(a)s and name = %(b)s",
        {'a': medium['media_id'], 'b': SPIDER_FEED_NAME}).hash()

    if feed_row is None:
        # No such feed yet -- create an inactive placeholder feed for spidered stories.
        feed_row = db.find_or_create('feeds', {
            'media_id': medium['media_id'],
            'url': medium['url'] + '#spiderfeed',
            'name': SPIDER_FEED_NAME,
            'active': False,
        })

    return feed_row
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created (story is a duplicate or its GUID
    already exists).

    Raises McAddStoryException when called from within a transaction or when the insert fails
    for a reason other than a GUID conflict.
    """
    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    # Serialize story inserts so concurrent writers can't add the same story twice.
    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        # An empty description means there is no full text in the RSS item to rely on.
        # ("or ''" guards against an explicit None description, which would crash len().)
        if len(story.get('description', '') or '') == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            # BUGFIX: message used to read "to GUID conflict"; it describes the cause.
            log.warning(
                "Failed to add story for '{}' due to GUID conflict (guid = '{}')".format(
                    story['url'], story['guid']))
            return None
        else:
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story)))

    db.find_or_create(table='feeds_stories_map',
                      insert_hash={
                          'stories_id': story['stories_id'],
                          'feeds_id': feeds_id,
                      })

    db.commit()

    return story