def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL.

    The child download inherits feed, priority and parent linkage from the
    parent (feed) download.  If the story's medium has a content_delay set,
    the download is scheduled that many hours into the future.
    """
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    # NOTE(review): flat()[0] assumes the media row always exists -- confirm callers guarantee this.
    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay download of content this many hours. This is useful for sources that are likely to significantly
        # change content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""
    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    # All test fields are derived from the label so repeated calls with
    # distinct labels produce distinct stories.
    story_row = {
        'media_id': int(feed['media_id']),
        'url': "http://story.test/%s" % label,
        'guid': "guid://story.test/%s" % label,
        'title': "story %s" % label,
        'description': "description %s" % label,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
        'full_text_rss': True,
    }
    story = db.create(table='stories', insert_hash=story_row)

    # Attach the new story to the given feed.
    feed_story_row = {
        'feeds_id': int(feed['feeds_id']),
        'stories_id': int(story['stories_id']),
    }
    db.create(table='feeds_stories_map', insert_hash=feed_story_row)

    return story
def validate_remote_integration(db: DatabaseHandler, source: str, query: str, day: str) -> None:
    """Run sanity test on remote APIs."""
    topic = create_test_topic(db, "test_remote_integration")

    db.create('topic_seed_queries', {
        'topics_id': topic['topics_id'],
        'platform': 'twitter',
        'source': source,
        'query': query
    })

    # Reconfigure the test topic as a single-day twitter url_sharing topic.
    topic.update({
        'platform': 'twitter',
        'pattern': '.*',
        'start_date': day,
        'end_date': day,
        'mode': 'url_sharing',
    })
    db.update_by_id('topics', topic['topics_id'], topic)

    fetch_topic_posts(db, topic['topics_id'])

    fetched_posts = db.query("select * from topic_posts").hashes()

    # for old ch monitors, lots of the posts may be deleted
    assert len(fetched_posts) > 20

    assert len(fetched_posts[0]['content']) > MIN_TEST_POST_LENGTH
    assert len(fetched_posts[0]['author']) > MIN_TEST_AUTHOR_LENGTH
def _add_timespans_to_stories(db: DatabaseHandler, stories: List[Dict[str, Any]]) -> None:
    """Add timespans to stories for solr indexing.

    Creates a test topic with one snapshot and five timespans, then assigns
    each story to a timespan (rotating through the five) via a
    snap.story_link_counts row.
    """
    stories = decode_object_from_bytes_if_needed(stories)

    topic = create_test_topic(db=db, label="solr dump test")

    snapshot = db.create(table='snapshots', insert_hash={
        'topics_id': topic['topics_id'],
        'snapshot_date': '2018-01-01',
        'start_date': '2018-01-01',
        'end_date': '2018-01-01',
    })

    # Create five identical "overall" timespans to spread the stories across.
    timespans = []
    for i in range(1, 5 + 1):
        timespan = db.create(table='timespans', insert_hash={
            'topics_id': topic['topics_id'],
            'snapshots_id': snapshot['snapshots_id'],
            'start_date': '2018-01-01',
            'end_date': '2018-01-01',
            'story_count': 1,
            'story_link_count': 1,
            'medium_count': 1,
            'medium_link_count': 1,
            'post_count': 1,
            'period': 'overall',
        })
        timespans.append(timespan)

    for story in stories:
        assert isinstance(story, dict)

        # Rotate the list so each story gets the "next" timespan round-robin.
        timespan = timespans.pop()
        timespans.insert(0, timespan)

        db.query(
            """
            INSERT INTO snap.story_link_counts (
                topics_id,
                timespans_id,
                stories_id,
                media_inlink_count,
                inlink_count,
                outlink_count
            ) VALUES (
                %(topics_id)s,
                %(timespans_id)s,
                %(stories_id)s,
                1,
                1,
                1
            )
            """, {
                'topics_id': timespan['topics_id'],
                'timespans_id': timespan['timespans_id'],
                'stories_id': story['stories_id'],
            })
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls.

    For each matching story: copy its first download's content into a new
    topic_seed_urls row, reset any topic_links pointing at the story so they
    can be spidered again, and delete the topic_stories row.  Each story is
    handled in its own transaction.
    """
    topic = decode_object_from_bytes_if_needed(topic)

    # Stories in this topic whose medium has foreign_rss_links set and that
    # are not explicitly marked as valid foreign RSS stories.
    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id
                and s.media_id = m.media_id
                and m.foreign_rss_links = true
                and ts.topics_id = %(a)s
                and not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        # Best effort: the seed URL is still useful even without content.
        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            log.warning(
                f"Unable to fetch content for download {download['downloads_id']}: {ex}"
            )

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        db.begin()
        db.create(
            'topic_seed_urls',
            {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        # Detach links referencing this story and mark them unspidered.
        db.query(
            """
            update topic_links set ref_stories_id = null, link_spidered = 'f'
                where topics_id = %(b)s and ref_stories_id = %(a)s
            """,
            {'a': story['stories_id'], 'b': topic['topics_id']})

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
def get_consistent_color(db: DatabaseHandler, item_set: str, item_id: str) -> str:
    """Return the same hex color (e.g. "ff0000") for the same set / ID combination every time this function is called."""
    item_set = decode_object_from_bytes_if_needed(item_set)
    item_id = decode_object_from_bytes_if_needed(item_id)

    # Always return grey for null or not typed values
    if item_id.lower() in {'null', 'not typed'}:
        return '999999'

    # Reuse the color previously assigned to this set/id pair, if any.
    assigned = db.query(
        """SELECT color FROM color_sets WHERE color_set = %(item_set)s AND id = %(item_id)s""",
        {
            'item_set': item_set,
            'item_id': item_id,
        }).flat()
    if assigned:
        return assigned[0] if isinstance(assigned, list) else assigned

    # Collect the colors already used within this set.
    set_colors = db.query(
        """SELECT color FROM color_sets WHERE color_set = %(item_set)s""",
        {
            'item_set': item_set,
        }).flat()

    if set_colors is not None and not isinstance(set_colors, list):
        set_colors = [set_colors]

    existing_colors = set(set_colors) if set_colors is not None else set()

    # Use the hard coded palette of 25 colors if possible
    new_color = next((c for c in __MC_COLORS if c not in existing_colors), None)

    # Otherwise, just generate a random color
    if new_color is None:
        palette = analogous_color(color='0000ff', return_slices=256, split_slices=255)
        new_color = random.choice(palette)

    db.create(table='color_sets', insert_hash={
        'color_set': item_set,
        'id': item_id,
        'color': new_color,
    })

    return new_color
def extract_links_for_topic_story(db: DatabaseHandler, story: dict, topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.link_mined to true for that story.  Calls
    get_links_from_story on each story.

    Almost all errors are caught by this function and saved in topic_stories.link_mine_error.
    In the case of an error topic_stories.link_mined is also set to true.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None

    """
    try:
        log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))

        links = get_links_from_story(db, story)

        for link in links:
            # Skip links within a domain this story already self-links to too often.
            if mediawords.tm.domains.skip_self_linked_domain_url(
                    db, topic['topics_id'], story['url'], link):
                log.info("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            mediawords.tm.domains.increment_domain_links(db, topic_link)

        link_mine_error = ''
    except Exception:
        # Store the traceback instead of aborting the whole mining run.
        link_mine_error = traceback.format_exc()

    # Mark the story mined even on error so it is not retried forever.
    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': topic['topics_id'], 'c': link_mine_error})
def get_consistent_color(db: DatabaseHandler, item_set: str, item_id: str) -> str:
    """Return the same hex color (e.g. "ff0000") for the same set / ID combination every time this function is called."""
    item_set = decode_object_from_bytes_if_needed(item_set)
    item_id = decode_object_from_bytes_if_needed(item_id)

    # Always return grey for null or not typed values
    if item_id.lower() in {'null', 'not typed'}:
        return '999999'

    # Reuse a color previously assigned to this exact set/id pair.
    color = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s AND id = %(item_id)s""", {
        'item_set': item_set,
        'item_id': item_id,
    }).flat()
    if color is not None and len(color):
        if isinstance(color, list):
            color = color[0]
        return color

    # No assignment yet: gather colors already used within this set.
    set_colors = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s""", {
        'item_set': item_set,
    }).flat()

    if set_colors is not None:
        if not isinstance(set_colors, list):
            set_colors = [set_colors]

    existing_colors = set()
    if set_colors is not None:
        for color in set_colors:
            existing_colors.add(color)

    # Use the hard coded palette of 25 colors if possible
    new_color = None
    for color in __MC_COLORS:
        if color not in existing_colors:
            new_color = color
            break

    # Otherwise, just generate a random color
    if new_color is None:
        colors = analogous_color(color='0000ff', return_slices=256, split_slices=255)
        new_color = random.choice(colors)

    # Persist the choice so future calls return the same color.
    db.create(table='color_sets', insert_hash={
        'color_set': item_set,
        'id': item_id,
        'color': new_color,
    })

    return new_color
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Arguments:
    db - db handle
    story - story dict to insert
    feeds_id - feed to map the new story to
    skip_checking_if_new - if True, skip the is_new() duplicate check

    Returns created story or None if story wasn't created (not new, or GUID conflict).

    Raises McAddStoryException when called from within a transaction or on
    any insert error other than a "stories_guid" unique constraint violation.
    """
    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        # BUG FIX: 'description' may be present but None; len(None) used to raise TypeError.
        if not story.get('description'):
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            # BUG FIX: message previously read "to GUID conflict" instead of "due to GUID conflict".
            log.warning(
                "Failed to add story for '{}' due to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None
        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
def _store_tweet_and_urls(db: DatabaseHandler, topic_tweet_day: dict, ch_post: dict) -> None:
    """
    Store the tweet in topic_tweets and its urls in topic_tweet_urls, using the data in ch_post.

    Arguments:
    db - database handler
    topic_tweet_day - topic_tweet_day dict
    ch_post - ch_post dict

    Return:
    None
    """
    data_json = mediawords.util.parse_json.encode_json(ch_post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    data_json = data_json.replace('\x00', '')

    tweet = ch_post['tweet']

    topic_tweet = db.create('topic_tweets', {
        'topic_tweet_days_id': topic_tweet_day['topic_tweet_days_id'],
        'data': data_json,
        'content': tweet['text'],
        'tweet_id': ch_post['tweet_id'],
        'publish_date': tweet['created_at'],
        'twitter_user': tweet['user']['screen_name']
    })

    _insert_tweet_urls(db, topic_tweet, mediawords.util.twitter.get_tweet_urls(tweet))
def extract_links_for_topic_story(db: DatabaseHandler, story: dict, topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.link_mined to true for that story.  Calls
    get_links_from_story on each story.

    Almost all errors are caught by this function and saved in topic_stories.link_mine_error.
    In the case of an error topic_stories.link_mined is also set to true.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None

    """
    try:
        log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))

        links = get_links_from_story(db, story)

        for link in links:
            # Skip links within a domain this story already self-links to too often.
            if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], link):
                log.info("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            mediawords.tm.domains.increment_domain_links(db, topic_link)

        link_mine_error = ''
    except Exception:
        # Store the traceback instead of aborting the whole mining run.
        link_mine_error = traceback.format_exc()

    # Mark the story mined even on error so it is not retried forever.
    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': topic['topics_id'], 'c': link_mine_error})
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls.

    For each matching story: copy its first download's content into a new
    topic_seed_urls row and delete the topic_stories row.  Each story is
    handled in its own transaction.
    """
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id
                and s.media_id = m.media_id
                and m.foreign_rss_links = true
                and ts.topics_id = %(a)s
                and not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        # Best effort: the seed URL is still useful even without content,
        # but log the failure instead of swallowing it silently.
        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception as ex:
            log.warning(
                "Unable to fetch content for download %s: %s" % (download['downloads_id'], ex)
            )

        # BUG FIX: postgres rejects text containing null bytes, so strip them
        # before inserting (matches the other merge_foreign_rss_stories variant).
        content = content.replace('\x00', '')

        db.begin()
        db.create(
            'topic_seed_urls',
            {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
def import_feed_downloads(db: DatabaseHandler, csv_file: str) -> None:
    """Import downloads from a CSV file exported earlier, storing each row's raw content.

    All rows are imported in a single transaction; the whole import either
    commits or rolls back together.
    """
    log.info(f"Importing downloads from {csv_file}...")

    db.begin()

    with open(csv_file, mode='r', encoding='utf-8') as f:

        # Guess dialect
        sample = f.read(1024)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        f.seek(0)

        input_csv = csv.DictReader(f, dialect=dialect)

        n = 1
        for download in input_csv:
            log.info(f"Importing download {n}...")
            n += 1

            raw_download_content = download.get('_raw_download_content', None)
            if raw_download_content:
                # BUG FIX: remove the content column from the row dict; the original
                # tried to "del" a key from the content string itself.
                del download['_raw_download_content']

            # Cast some columns.
            # BUG FIX: the original subscripted the bound method ("download.get['feeds_id']"),
            # which raises TypeError at runtime.
            download['feeds_id'] = int(download['feeds_id']) if 'feeds_id' in download else None  # NULL
            download['stories_id'] = int(download['stories_id']) if 'stories_id' in download else None  # NULL
            download['parent'] = int(download['parent']) if 'parent' in download else None  # NULL
            download['priority'] = int(download['priority']) if 'priority' in download else 0  # NOT NULL
            download['sequence'] = int(download['sequence']) if 'sequence' in download else 0  # NOT NULL

            # BUG FIX: the original assigned this to 'sequence', clobbering the value set above.
            # NOTE(review): CSV values are strings, so a literal "f" is truthy here -- confirm
            # how 'extracted' is serialized in the export.
            download['extracted'] = 't' if download.get('extracted', False) else 'f'

            # Will be rewritten by handle_download()
            download['path'] = ''

            download = db.create(table='downloads', insert_hash=download)

            # Create mock response to import it
            response = FakeResponse(content=raw_download_content)
            handler = handler_for_download(db=db, download=download)
            handler.store_response(db=db, download=download, response=response)

    log.info("Committing...")
    db.commit()

    log.info(f"Done importing downloads from {csv_file}")
def _add_topic_post_single_day(db: DatabaseHandler, topic_seed_query: dict, num_posts: int, day: datetime) -> dict: """ Add a row to topic_post_day if it does not already exist. Arguments: db - database handle topic_seed_query - topic_seed_query dict day - date to fetch eg '2017-12-30' num_posts - number of posts found for that day Return: None """ # the perl-python layer was segfaulting until I added the str() around day below -hal topic_post_day = db.query( """ SELECT * FROM topic_post_days WHERE topics_id = %(topics_id)s AND topic_seed_queries_id = %(topic_seed_queries_id)s AND day = %(day)s """, { 'topics_id': topic_seed_query['topics_id'], 'topic_seed_queries_id': topic_seed_query['topic_seed_queries_id'], 'day': str(day), } ).hash() if topic_post_day is not None and topic_post_day['posts_fetched']: raise McFetchTopicPostsDataException("tweets already fetched for day " + str(day)) # if we have a ttd but had not finished fetching tweets, delete it and start over if topic_post_day is not None: db.query( """ DELETE FROM topic_post_days WHERE topics_id = %(topics_id)s AND topic_post_days_id = %(topic_post_days_id)s """, { 'topics_id': topic_post_day['topics_id'], 'topic_post_days_id': topic_post_day['topic_post_days_id'], } ) topic_post_day = db.create( 'topic_post_days', { 'topics_id': topic_seed_query['topics_id'], 'topic_seed_queries_id': topic_seed_query['topic_seed_queries_id'], 'day': day, 'num_posts_stored': num_posts, 'num_posts_fetched': num_posts, 'posts_fetched': False }) return topic_post_day
def _store_tweet_and_urls(db: DatabaseHandler, topic_tweet_day: dict, ch_post: dict) -> None:
    """
    Store the tweet in topic_tweets and its urls in topic_tweet_urls, using the data in ch_post.

    Arguments:
    db - database handler
    topic_tweet_day - topic_tweet_day dict
    ch_post - ch_post dict

    Return:
    None
    """
    data_json = mediawords.util.parse_json.encode_json(ch_post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    data_json = data_json.replace(u'\u0000', '')

    topic_tweet = {
        'topic_tweet_days_id': topic_tweet_day['topic_tweet_days_id'],
        'data': data_json,
        'content': ch_post['tweet']['text'],
        'tweet_id': ch_post['tweet_id'],
        'publish_date': ch_post['tweet']['created_at'],
        'twitter_user': ch_post['tweet']['user']['screen_name']
    }

    topic_tweet = db.create('topic_tweets', topic_tweet)

    urls_inserted = set()  # type: typing.Set[str]
    for url_data in ch_post['tweet']['entities']['urls']:
        url = url_data['expanded_url']

        # BUG FIX: a duplicate URL used to 'break' out of the loop, discarding
        # all remaining (distinct) URLs; skip just the duplicate instead.
        if url in urls_inserted:
            continue

        urls_inserted.add(url)

        db.create(
            'topic_tweet_urls',
            {
                'topic_tweets_id': topic_tweet['topic_tweets_id'],
                'url': url[0:1024]  # column is bounded; truncate long URLs
            })
def _store_post_and_urls(db: DatabaseHandler, topic_post_day: dict, post: dict) -> None:
    """
    Store the post in topic_posts and its urls in topic_post_urls, using the data in post.

    Inserting is idempotent: if a topic_posts row already exists for this
    day + post_id, it is reused instead of inserted again.

    Arguments:
    db - database handler
    topic_post_day - topic_post_day dict
    post - post dict

    Return:
    None
    """
    log.debug("remove nulls")
    _remove_json_tree_nulls(post)

    log.debug("encode json")
    data_json = encode_json(post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    # data_json = data_json.replace('\x00', '')

    data = {}
    for field in POST_FIELDS:
        data[field] = post.get(field, None)

    data['topics_id'] = topic_post_day['topics_id']
    data['topic_post_days_id'] = topic_post_day['topic_post_days_id']
    data['data'] = data_json

    # Look for an existing row first so re-running a fetch doesn't duplicate posts.
    topic_post = db.query(
        """
        SELECT *
        FROM topic_posts
        WHERE
            topics_id = %(topics_id)s AND
            topic_post_days_id = %(topic_post_days_id)s AND
            post_id = %(post_id)s::TEXT
        """,
        {
            'topics_id': topic_post_day['topics_id'],
            'topic_post_days_id': topic_post_day['topic_post_days_id'],
            'post_id': data['post_id'],
        }
    ).hash()

    if not topic_post:
        log.debug("insert topic post")
        topic_post = db.create('topic_posts', data)

    log.debug("insert tweet urls")
    _insert_post_urls(db, topic_post, post['urls'])

    log.debug("done")
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict:
    """Create test feed with a simple label belonging to medium."""
    label = decode_object_from_bytes_if_needed(label)
    medium = decode_object_from_bytes_if_needed(medium)

    feed_row = {
        'name': label,
        'url': "http://feed.test/%s" % label,
        'media_id': int(medium['media_id']),
    }

    return db.create(table='feeds', insert_hash=feed_row)
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls.

    For each matching story: copy its first download's content into a new
    topic_seed_urls row and delete the topic_stories row.  Each story is
    handled in its own transaction.
    """
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id
                and s.media_id = m.media_id
                and m.foreign_rss_links = true
                and ts.topics_id = %(a)s
                and not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception:
            # NOTE(review): fetch failures are silently ignored (seed URL is inserted
            # with empty content) -- a sibling variant logs a warning here; confirm intent.
            pass

        db.begin()
        db.create('topic_seed_urls', {
            'url': story['url'],
            'topics_id': topic['topics_id'],
            'source': 'merge_foreign_rss_stories',
            'content': content
        })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
def _add_topic_tweet_single_day( db: DatabaseHandler, topic: dict, day: datetime.datetime, ch_class: typing.Type[AbstractCrimsonHexagon]) -> dict: """ Add a row to topic_tweet_day if it does not already exist. fetch data for new row from CH. Arguments: db - database handle topic - topic dict day - date to fetch eg '2017-12-30' ch_class - AbstractCrimsonHexagon class Return: None """ # the perl-python layer was segfaulting until I added the str() around day below -hal topic_tweet_day = db.query( "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s", { 'a': topic['topics_id'], 'b': str(day) }).hash() if topic_tweet_day is not None and topic_tweet_day['tweets_fetched']: raise McFetchTopicTweetDateFetchedException( "tweets already fetched for day " + str(day)) # if we have a ttd but had not finished fetching tweets, delete it and start over if topic_tweet_day is not None: db.delete_by_id('topic_tweet_days', topic_tweet_day['topic_tweet_days_id']) ch_posts = ch_class.fetch_posts(topic['ch_monitor_id'], day) tweet_count = ch_posts['totalPostsAvailable'] num_ch_tweets = len(ch_posts['posts']) topic_tweet_day = db.create( 'topic_tweet_days', { 'topics_id': topic['topics_id'], 'day': day, 'tweet_count': tweet_count, 'num_ch_tweets': num_ch_tweets, 'tweets_fetched': False }) topic_tweet_day['ch_posts'] = ch_posts return topic_tweet_day
def create_test_medium(db: DatabaseHandler, label: str) -> dict:
    """Create test medium with a simple label."""
    label = decode_object_from_bytes_if_needed(label)

    medium_row = {
        'name': label,
        'url': "http://media.test/%s" % label,
        'moderated': True,
        'is_monitored': True,
        'public_notes': "%s public notes" % label,
        'editor_notes': "%s editor notes" % label,
    }

    return db.create(table='media', insert_hash=medium_row)
def extract_links_for_topic_story(db: DatabaseHandler, story: dict, topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, topic_stories.link_mined is set to true for that story.
    Calls get_links_from_story on each story.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None
    """
    log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))

    for url in get_links_from_story(db, story):
        db.create('topic_links', {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': url
        })

    db.query(
        "update topic_stories set link_mined = 't' where stories_id = %(a)s and topics_id = %(b)s",
        {'a': story['stories_id'], 'b': topic['topics_id']})
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    story_url = story['url']

    # Child download inherits feed, priority and parent linkage from the feed download.
    child_download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story_url,
        'host': get_url_host(story_url),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query(
        """
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
        """,
        {'media_id': story['media_id']}).flat()[0]

    if content_delay:
        # Delay download of content this many hours.  This is useful for sources that are
        # likely to significantly change content in the hours after first publication.
        now_timestamp = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        delayed_timestamp = now_timestamp + content_delay * 60 * 60
        child_download['download_time'] = get_sql_date_from_epoch(delayed_timestamp)

    db.create(table='downloads', insert_hash=child_download)
def _add_topic_tweet_single_day(
        db: DatabaseHandler,
        topic: dict,
        day: datetime.datetime,
        ch_class: typing.Type[AbstractCrimsonHexagon]) -> dict:
    """
    Add a row to topic_tweet_day if it does not already exist.  fetch data for new row from CH.

    Arguments:
    db - database handle
    topic - topic dict
    day - date to fetch eg '2017-12-30'
    ch_class - AbstractCrimsonHexagon class

    Return: the created topic_tweet_days dict with the raw 'ch_posts' attached
    """
    # the perl-python layer was segfaulting until I added the str() around day below -hal
    existing_day = db.query(
        "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
        {'a': topic['topics_id'], 'b': str(day)}).hash()

    if existing_day is not None:
        if existing_day['tweets_fetched']:
            raise McFetchTopicTweetDateFetchedException("tweets already fetched for day " + str(day))

        # unfinished fetch from an earlier run: delete it and start over
        db.delete_by_id('topic_tweet_days', existing_day['topic_tweet_days_id'])

    ch_posts = ch_class.fetch_posts(topic['ch_monitor_id'], day)

    topic_tweet_day = db.create(
        'topic_tweet_days',
        {
            'topics_id': topic['topics_id'],
            'day': day,
            'tweet_count': ch_posts['totalPostsAvailable'],
            'num_ch_tweets': len(ch_posts['posts']),
            'tweets_fetched': False
        })

    # Hand the raw CH response back to the caller alongside the new row.
    topic_tweet_day['ch_posts'] = ch_posts

    return topic_tweet_day
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""
    label = decode_object_from_bytes_if_needed(label)

    topic_row = {
        'name': label,
        'description': label,
        'pattern': label,
        'solr_seed_query': label,
        'solr_seed_query_run': True,
        'start_date': '2016-01-01',
        'end_date': '2016-03-01',
        'job_queue': 'mc',
        'max_stories': 100000,
    }

    return db.create(table='topics', insert_hash=topic_row)
def _create_queued_job_state(db: DatabaseHandler, queue_name: str, args: Dict[str, Any]) -> Dict[str, Any]:
    """Create the initial entry in the "job_states" table with a state of 'queued' and return it."""
    queue_name = decode_object_from_bytes_if_needed(queue_name)
    args = decode_object_from_bytes_if_needed(args)

    # Record which process / host enqueued the job along with its serialized arguments.
    return db.create(table='job_states', insert_hash={
        'state': STATE_QUEUED,
        'args': encode_json(args),
        'priority': 'normal',
        'class': queue_name,
        'process_id': os.getpid(),
        'hostname': socket.gethostname(),
    })
def _store_map(db: DatabaseHandler, topics_id: int, timespans_id: int, content: bytes, graph_format: str, color_by: str) -> None:
    """Create a timespans_map row.

    Replaces any existing map with the same timespan / format / options,
    stores the rendered content, and writes its public URL back to the row.
    """
    db.begin()

    options = {'color_by': color_by}
    options_json = encode_json(options)

    # Remove any previous map for this timespan with identical format/options
    # so the insert below effectively replaces it.
    db.query(
        """
        DELETE FROM timespan_maps
        WHERE timespans_id = %(a)s
          AND format = %(b)s
          AND options = %(c)s
        """,
        {'a': timespans_id, 'b': graph_format, 'c': options_json}
    )

    timespan_map = {
        'topics_id': topics_id,
        'timespans_id': timespans_id,
        'options': options_json,
        'format': graph_format
    }
    timespan_map = db.create('timespan_maps', timespan_map)

    db.commit()

    # MIME type stored alongside the rendered map content.
    content_types = {
        'svg': 'image/svg+xml',
        'gexf': 'xml/gexf'
    }
    content_type = content_types[graph_format]

    store_content(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'], content, content_type)
    url = get_content_url(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'])

    db.update_by_id('timespan_maps', timespan_map['timespan_maps_id'], {'url': url})
def create_download_for_new_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Create and return download object in database for the new story."""
    story_url = story['url']

    # The download is created already in 'success' state with a pending
    # content path, mirroring how feed-driven story downloads are recorded.
    return db.create('downloads', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id'],
        'url': story_url,
        'host': mediawords.util.url.get_url_host(story_url),
        'type': 'content',
        'sequence': 1,
        'state': 'success',
        'path': 'content:pending',
        'priority': 1,
        'extracted': 'f'
    })
def _add_topic_tweet_single_day(db: DatabaseHandler, topic: dict, num_tweets: int, day: datetime.datetime) -> dict: """ Add a row to topic_tweet_day if it does not already exist. Arguments: db - database handle topic - topic dict day - date to fetch eg '2017-12-30' num_tweets - number of tweets found for that day Return: None """ # the perl-python layer was segfaulting until I added the str() around day below -hal topic_tweet_day = db.query( "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s", { 'a': topic['topics_id'], 'b': str(day) }).hash() if topic_tweet_day is not None and topic_tweet_day['tweets_fetched']: raise McFetchTopicTweetDateFetchedException( "tweets already fetched for day " + str(day)) # if we have a ttd but had not finished fetching tweets, delete it and start over if topic_tweet_day is not None: db.delete_by_id('topic_tweet_days', topic_tweet_day['topic_tweet_days_id']) topic_tweet_day = db.create( 'topic_tweet_days', { 'topics_id': topic['topics_id'], 'day': day, 'num_tweets': num_tweets, 'tweets_fetched': False }) return topic_tweet_day
def _store_tweet_and_urls(db: DatabaseHandler, topic_tweet_day: dict, ch_post: dict) -> None:
    """
    Store the tweet in topic_tweets and its urls in topic_tweet_urls, using the data in ch_post.

    Arguments:
    db - database handler
    topic_tweet_day - topic_tweet_day dict
    ch_post - ch_post dict

    Return:
    None
    """
    data_json = mediawords.util.parse_json.encode_json(ch_post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    data_json = data_json.replace('\x00', '')

    topic_tweet = {
        'topic_tweet_days_id': topic_tweet_day['topic_tweet_days_id'],
        'data': data_json,
        'content': ch_post['tweet']['text'],
        'tweet_id': ch_post['tweet_id'],
        'publish_date': ch_post['tweet']['created_at'],
        'twitter_user': ch_post['tweet']['user']['screen_name']
    }

    topic_tweet = db.create('topic_tweets', topic_tweet)

    # for some reason I can't figure out, null characters still sneak through the data_json.replace()
    # above, so we have to tell postgres direclty to get rid of them, or else querying the row later
    # will fail
    db.query(
        """
        update topic_tweets set data = regexp_replace(data::text, '\\u0000', '', 'g')::json
            where topic_tweets_id = %(a)s and data::text ~ '\\u0000'
        """,
        {'a': topic_tweet['topic_tweets_id']})

    urls = mediawords.util.twitter.get_tweet_urls(ch_post['tweet'])
    _insert_tweet_urls(db, topic_tweet, urls)
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    """Create and return a pending feed-type download row for the given feed."""
    feed = decode_object_from_bytes_if_needed(feed)

    # Feeds that have never been attempted get priority 10; otherwise 0.
    never_attempted = 'last_attempted_download_time' not in feed
    download_priority = 10 if never_attempted else 0

    return db.create(table='downloads', insert_hash={
        'feeds_id': int(feed['feeds_id']),
        'url': feed['url'],
        'host': get_url_host(url=feed['url']),
        'type': 'feed',
        'sequence': 1,
        'state': 'pending',
        'priority': download_priority,
        'download_time': 'NOW()',
        'extracted': False,
    })
def _store_post_and_urls(db: DatabaseHandler, topic_post_day: dict, post: dict) -> None:
    """
    Store the post in topic_posts and its urls in topic_post_urls, using the data in post.

    Arguments:
    db - database handler
    topic_post_day - topic_post_day dict
    post - post dict

    Return:
    None
    """
    log.debug("remove nulls")
    # null characters are not legal in json, so strip them from the tree before encoding
    _remove_json_tree_nulls(post)

    log.debug("encode json")
    data_json = encode_json(post)

    # start from the day linkage + raw json, then copy over each known post field
    row = {
        'topic_post_days_id': topic_post_day['topic_post_days_id'],
        'data': data_json,
    }
    for post_field in POST_FIELDS:
        row[post_field] = post.get(post_field, None)

    log.debug("insert topic post")
    stored_post = db.create('topic_posts', row)

    urls = _get_post_urls(post)

    log.debug("insert tweet urls")
    _insert_post_urls(db, stored_post, urls)

    log.debug("done")
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text,
    and so on. Return the new story.
    """
    # clone the story row itself, pointed at the new medium and stamped with a fresh collect_date
    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }
    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # copy all tag associations from the old story to the new one
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """,
        {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    # attach the new story to the new medium's spider feed
    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    # oldest download for the old story, if any
    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            # fetch/store failed: mirror the old download's state/error onto the new download
            download_update = dict([
                (f, old_download[f]) for f in ['state', 'error_message', 'download_time']
            ])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # NOTE(review): both the inserted downloads_id and the select filter use the NEW
        # download's id, so this copies nothing unless texts already exist for the new
        # download — presumably the filter should reference the old download's id; verify
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """,
            {'a': download['downloads_id']})

    # copy the extracted sentences over to the new story
    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """,
        {'b': old_story['stories_id']})

    return story
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        title: str = None,
        publish_date: datetime.datetime = None,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - optional story title; when None, the title is parsed from the content html
    publish_date - optional publish date; when None, the date is guessed from url and content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:mediawords.dbi.stories.stories.MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)

    if title is None:
        title = mediawords.util.parse_html.html_title(content, url, mediawords.dbi.stories.stories.MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    if publish_date is None:
        # fall back from guessed date, to fallback_date, to "now"
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException:
        # duplicate media_id + url: return the already-existing story
        return mediawords.tm.stories.get_story_match(db=db, url=story['url'])
    except Exception:
        raise McTMStoriesException("Error adding story: %s" % traceback.format_exc())

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    if publish_date is None:
        # date_guess is only defined when the date guesser actually ran above
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    db.create('feeds_stories_map', {'stories_id': story['stories_id'], 'feeds_id': feed['feeds_id']})

    # fetchless download: the content we already have is stored as the story's download
    download = create_download_for_new_story(db, story, feed)
    mediawords.dbi.downloads.store_content(db, download, content)

    _extract_story(db, story)

    return story
def login_with_email_password(db: DatabaseHandler, email: str, password: str, ip_address: str = None) -> CurrentUser:
    """Log in with email and password; raise McAuthLoginException on unsuccessful login.

    When ip_address is given, ensure the user has an API key for that IP address,
    creating one if necessary.
    """
    email = decode_object_from_bytes_if_needed(email)
    password = decode_object_from_bytes_if_needed(password)

    if not (email and password):
        raise McAuthLoginException("Email and password must be defined.")

    # Try-except block because we don't want to reveal the specific reason why the login has failed
    try:

        user = user_info(db=db, email=email)

        # Check if user has tried to log in unsuccessfully before and now is trying
        # again too fast
        if __user_is_trying_to_login_too_soon(db=db, email=email):
            raise McAuthLoginException(
                "User '%s' is trying to log in too soon after the last unsuccessful attempt." % email
            )

        if not password_hash_is_valid(password_hash=user.password_hash(), password=password):
            raise McAuthLoginException("Password for user '%s' is invalid." % email)

    except Exception as ex:
        log.info(
            "Login failed for %(email)s, will delay any successive login attempt for %(delay)d seconds: %(exc)s" % {
                'email': email,
                'delay': __POST_UNSUCCESSFUL_LOGIN_DELAY,
                'exc': str(ex),
            }
        )

        # Set the unsuccessful login timestamp
        # (TIMESTAMP 'now' returns "current transaction's start time", so using LOCALTIMESTAMP instead)
        db.query("""
            UPDATE auth_users
            SET last_unsuccessful_login_attempt = LOCALTIMESTAMP
            WHERE email = %(email)s
        """, {'email': email})

        # It might make sense to time.sleep() here for the duration of $POST_UNSUCCESSFUL_LOGIN_DELAY seconds to
        # prevent legitimate users from trying to log in too fast. However, when being actually brute-forced through
        # multiple HTTP connections, this approach might end up creating a lot of processes that would time.sleep()
        # and take up memory.
        #
        # So, let's return the error page ASAP and hope that a legitimate user won't be able to reenter his / her
        # password before the $POST_UNSUCCESSFUL_LOGIN_DELAY amount of seconds pass.

        # Don't give out a specific reason for the user to not be able to find
        # out which user emails are registered
        raise McAuthLoginException("User '%s' was not found or password is incorrect." % email)

    if not user.active():
        raise McAuthLoginException("User with email '%s' is not active." % email)

    # Reset password reset token (if any)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = NULL
        WHERE email = %(email)s
          AND password_reset_token_hash IS NOT NULL
    """, {'email': email})

    if ip_address:
        if not user.api_key_for_ip_address(ip_address):
            db.create(
                table='auth_user_api_keys',
                insert_hash={
                    'auth_users_id': user.user_id(),
                    'ip_address': ip_address,
                })

            # Fetch user again so the just-created per-IP API key is visible
            user = user_info(db=db, email=email)

            if not user.api_key_for_ip_address(ip_address):
                raise McAuthLoginException("Unable to create per-IP API key for IP %s" % ip_address)

    return user
def extract_links_for_topic_story(
        db: DatabaseHandler,
        stories_id: int,
        topics_id: int,
        test_throw_exception: bool = False,
) -> None:
    """
    Extract links from the story and insert them into topic_links for the given topic.

    Calls _get_links_from_story() on the story. Almost all errors are caught and saved in
    topic_stories.link_mine_error; whether or not an error occurred, topic_stories.link_mined
    is set to true for the story once processing finishes.

    Arguments:
    db - db handle
    stories_id - id of the story to mine
    topics_id - id of the topic to mine the story for
    test_throw_exception - when true, raise a test exception to exercise error logging

    Returns: None
    """
    story = db.require_by_id(table='stories', object_id=stories_id)
    topic = db.require_by_id(table='topics', object_id=topics_id)

    link_mine_error = ''
    try:
        if test_throw_exception:
            raise McExtractLinksForTopicStoryTestException("Testing whether errors get logged.")

        log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))

        for link in _get_links_from_story(db, story):
            # links back into the story's own domain are not topic links
            if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], link):
                log.debug("skipping self linked domain url...")
                continue

            new_topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }
            db.create('topic_links', new_topic_link)
            increment_domain_links(db, new_topic_link)
    except Exception as ex:
        log.error(f"Link mining error: {ex}")
        link_mine_error = traceback.format_exc()

    # mark the story as mined regardless of success, recording any error text
    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """,
        {
            'a': story['stories_id'],
            'b': topic['topics_id'],
            'c': link_mine_error
        })
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created (not new, or GUID conflict).

    Raises McAddStoryException when called from within a transaction or when the insert fails
    for a reason other than a "stories_guid" unique constraint violation.
    """
    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
        skip_checking_if_new = bool(int(skip_checking_if_new))

    # this function manages its own transaction around the table lock below, so it
    # must not run inside an existing one
    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    # presumably to keep the is_new() check and the insert atomic against
    # concurrent story inserts — confirm against DB access layer
    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        # an empty description can't serve as full text
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".format(story['url'], story['guid']))
            return None
        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(table='feeds_stories_map', insert_hash={
        'stories_id': story['stories_id'],
        'feeds_id': feeds_id,
    })

    db.commit()

    return story
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text,
    and so on. Return the new story.
    """
    # clone the story row, pointed at the new medium and stamped with a fresh collect_date
    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': mediawords.util.sql.sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }
    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    # copy all tag associations from the old story to the new one
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    # attach the new story to the new medium's spider feed
    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    # oldest download of the old story, if any
    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': old_story['stories_id']}).hash()

    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = mediawords.dbi.downloads.fetch_content(db, old_download)
            download = mediawords.dbi.downloads.store_content(db, download, content)
        except (mediawords.dbi.downloads.McDBIDownloadsException,
                mediawords.key_value_store.amazon_s3.McAmazonS3StoreException):
            # fetch/store failed: mirror the old download's state/error onto the new download
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # NOTE(review): the select filter uses the NEW download's id (%(a)s), the same value
        # being inserted, so nothing is copied unless texts already exist for the new
        # download — presumably the filter should use the old download's id; verify
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """,
            {'a': download['downloads_id']})

    # copy the extracted sentences over to the new story
    db.query(
        """
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select %(a)s, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    return story
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    The medium, feed, title, and date of the story are all guessed from the url and content.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date

    Raises McTMStoriesDuplicateException on an attempt to insert a duplicate story url.
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[:_MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)
    title = mediawords.util.parse_html.html_title(content, url, _MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for text_field in ('url', 'guid', 'title'):
        story[text_field] = re2.sub('\x00', '', story[text_field])

    # fall back from the guessed date, to fallback_date, to "now"
    date_guess = guess_date(url, content)
    guessed_publish_date = date_guess.date if date_guess.found else fallback_date
    if guessed_publish_date is None:
        guessed_publish_date = datetime.datetime.now().isoformat()
    story['publish_date'] = guessed_publish_date

    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException:
        raise McTMStoriesDuplicateException("Attempt to insert duplicate story url %s" % url)
    except Exception:
        raise McTMStoriesException("Error adding story: %s" % traceback.format_exc())

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    db.create('feeds_stories_map', {'stories_id': story['stories_id'], 'feeds_id': feed['feeds_id']})

    # fetchless download: the content we already have is stored as the story's download
    download = create_download_for_new_story(db, story, feed)
    mediawords.dbi.downloads.store_content(db, download, content)

    _extract_story(db, story)

    return story
def add_user(db: DatabaseHandler, new_user: NewUser) -> None:
    """Add new user.

    Creates the auth_users row, assigns roles, applies optional weekly request /
    requested-items limits, optionally subscribes the user to the newsletter, and
    sends an activation token if the user is not yet active.

    Raises McAuthRegisterException if the user already exists, the password can't
    be hashed, or any of the inserts fail.
    """
    if not new_user:
        raise McAuthRegisterException("New user is undefined.")

    # Check if user already exists
    user_exists = db.query("""
        SELECT auth_users_id
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': new_user.email()}).hash()

    if user_exists is not None and 'auth_users_id' in user_exists:
        raise McAuthRegisterException("User with email '%s' already exists." % new_user.email())

    # Hash + validate the password (before opening the transaction, so a hash
    # failure never leaves a transaction open)
    try:
        password_hash = generate_secure_hash(password=new_user.password())
        if not password_hash:
            raise McAuthRegisterException("Password hash is empty.")
    except Exception as ex:
        log.error("Unable to hash a new password: {}".format(ex))
        raise McAuthRegisterException('Unable to hash a new password.')

    db.begin()

    # Create the user
    db.create(
        table='auth_users',
        insert_hash={
            'email': new_user.email(),
            'password_hash': password_hash,
            'full_name': new_user.full_name(),
            'notes': new_user.notes(),
            'active': bool(int(new_user.active())),
        }
    )

    # Fetch the user's ID
    try:
        user = user_info(db=db, email=new_user.email())
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException("I've attempted to create the user but it doesn't exist: %s" % str(ex))

    # Create roles
    try:
        for auth_roles_id in new_user.role_ids():
            db.create(table='auth_users_roles_map', insert_hash={
                'auth_users_id': user.user_id(),
                'auth_roles_id': auth_roles_id,
            })
    except Exception as ex:
        # BUGFIX: roll back the open transaction before raising; previously this path
        # (unlike the user_info() failure path above) left the transaction open
        db.rollback()
        raise McAuthRegisterException("Unable to create roles: %s" % str(ex))

    # Update limits (if they're defined)
    if new_user.weekly_requests_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'auth_users_id': user.user_id(),
            'weekly_requests_limit': new_user.weekly_requests_limit(),
        })

    if new_user.weekly_requested_items_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'auth_users_id': user.user_id(),
            'weekly_requested_items_limit': new_user.weekly_requested_items_limit(),
        })

    # Subscribe to newsletter
    if new_user.subscribe_to_newsletter():
        db.create(table='auth_users_subscribe_to_newsletter', insert_hash={'auth_users_id': user.user_id()})

    # Inactive users get an activation email before the transaction is committed
    if not new_user.active():
        send_user_activation_token(
            db=db,
            email=new_user.email(),
            activation_link=new_user.activation_url(),
            subscribe_to_newsletter=new_user.subscribe_to_newsletter(),
        )

    db.commit()