Code example #1
File: stories.py Project: vishalbelsare/mediacloud
def _add_missing_normalized_title_hashes(db: DatabaseHandler, topic: dict) -> None:
    """Add a normalized_title_hash field for every stories row that is missing it for the given topic."""
    db.begin()
    db.query(
        """
        DECLARE c CURSOR FOR
            SELECT stories_id
            FROM snap.live_stories
            WHERE
                topics_id = %(topics_id)s AND
                normalized_title_hash IS NULL
        """,
        {
            'topics_id': topic['topics_id']
        }
    )

    log.info('adding normalized story titles ...')

    # break this up into chunks instead of doing all topic stories at once via a simple sql query because we don't
    # want to do a single giant transaction with millions of stories
    while True:
        stories_ids = db.query("fetch 100 from c").flat()
        if len(stories_ids) < 1:
            break

        db.query("""
            UPDATE stories
            SET normalized_title_hash = md5(get_normalized_title(title, media_id))::UUID
            WHERE stories_id = ANY(%(story_ids)s)
        """, {
            'story_ids': stories_ids,
        })

    db.commit()
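
The DECLARE/FETCH loop above keeps each UPDATE small while one server-side cursor streams the full id list. Outside MediaCloud's DatabaseHandler wrapper, the same chunked-cursor pattern looks like this with plain psycopg2 (a sketch; the DSN and topics_id value are placeholders):

import psycopg2

conn = psycopg2.connect("dbname=mediacloud")  # placeholder DSN
with conn, conn.cursor() as cur:  # "with conn" wraps the whole loop in one transaction
    cur.execute(
        """
        DECLARE c CURSOR FOR
            SELECT stories_id
            FROM snap.live_stories
            WHERE topics_id = %(topics_id)s
              AND normalized_title_hash IS NULL
        """,
        {'topics_id': 12345},  # placeholder topic
    )
    while True:
        cur.execute("FETCH 100 FROM c")
        stories_ids = [row[0] for row in cur.fetchall()]
        if not stories_ids:
            break
        cur.execute(
            """
            UPDATE stories
            SET normalized_title_hash = md5(get_normalized_title(title, media_id))::uuid
            WHERE stories_id = ANY(%(story_ids)s)
            """,
            {'story_ids': stories_ids},
        )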
Code example #2
File: stories.py Project: berkmancenter/mediacloud
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None

        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
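
The FIXME in the except branch points at a cleaner shape: let PostgreSQL resolve the GUID conflict natively instead of string-matching the exception. A hedged sketch of that upsert, assuming the unique constraint really is named "stories_guid" and using the same query()/hash() calls seen elsewhere on this page (the column list is illustrative, not necessarily MediaCloud's schema):

story = db.query(
    """
    INSERT INTO stories (media_id, url, guid, title, publish_date, collect_date)
    VALUES (%(media_id)s, %(url)s, %(guid)s, %(title)s, %(publish_date)s, NOW())
    ON CONFLICT ON CONSTRAINT stories_guid DO NOTHING
    RETURNING *
    """,
    story,
).hash()
# hash() yields None when the row already existed, mirroring the "return None" path above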
Code example #3
def _add_missing_normalized_title_hashes(db: DatabaseHandler,
                                         topic: dict) -> None:
    """Add a normalized_title_hash field for every stories row that is missing it for the given topic."""
    db.begin()
    db.query(
        """
        declare c cursor for
            select stories_id from snap.live_stories where topics_id = %(a)s and normalized_title_hash is null
        """, {'a': topic['topics_id']})

    log.info('adding normalized story titles ...')

    # break this up into chunks instead of doing all topic stories at once via a simple sql query because we don't
    # want to do a single giant transaction with millions of stories
    while True:
        stories_ids = db.query("fetch 100 from c").flat()
        if len(stories_ids) < 1:
            break

        db.query(
            """
            update stories
            set normalized_title_hash = md5(get_normalized_title(title, media_id))::uuid
            where stories_id = any( %(a)s )
        """, {'a': stories_ids})

    db.commit()
Code example #4
def _fetch_tweets_for_day(db: DatabaseHandler,
                          twitter_class: typing.Type[AbstractTwitter],
                          topic: dict,
                          topic_tweet_day: dict,
                          max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets from CH and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    twitter_class - AbstractTwitter class
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if topic_tweet_day['tweets_fetched']:
        return

    ch_posts_data = topic_tweet_day['ch_posts']

    ch_posts = ch_posts_data['posts']

    if (max_tweets is not None):
        ch_posts = ch_posts[0:max_tweets]

    log.debug("adding %d tweets for topic %s, day %s" %
              (len(ch_posts), topic['topics_id'], topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(ch_posts), 100):
        _add_tweets_to_ch_posts(twitter_class, ch_posts[i:i + 100])

    db.begin()

    log.debug("inserting into topic_tweets ...")

    for ch_post in ch_posts:
        if 'tweet' in ch_post:
            _store_tweet_and_urls(db, topic_tweet_day, ch_post)

    num_deleted_tweets = len(list(filter(lambda x: 'tweet' not in x,
                                         ch_posts)))
    topic_tweet_day['num_ch_tweets'] -= num_deleted_tweets

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_ch_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {
            'a': topic_tweet_day['num_ch_tweets'],
            'b': topic_tweet_day['topic_tweet_days_id']
        })

    db.commit()

    log.debug("done inserting into topic_tweets")
Code example #5
File: stories.py Project: berkmancenter/mediacloud
def extract_and_process_story(db: DatabaseHandler,
                              story: dict,
                              extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story()."""

    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
        ORDER BY downloads_id ASC
    """, {'stories_id': stories_id}).hashes()

    # MC_REWRITE_TO_PYTHON: Perlism
    if downloads is None:
        downloads = []

    for download in downloads:
        log.debug("Extracting download {} for story {}...".format(download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
Code example #6
def import_feed_downloads(db: DatabaseHandler, csv_file: str) -> None:
    log.info(f"Importing downloads from {csv_file}...")

    db.begin()

    with open(csv_file, mode='r', encoding='utf-8') as f:

        # Guess dialect
        sample = f.read(1024)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        f.seek(0)

        input_csv = csv.DictReader(f, dialect=dialect)

        n = 1
        for download in input_csv:
            log.info(f"Importing download {n}...")
            n += 1

            raw_download_content = download.get('_raw_download_content', None)
            if raw_download_content:
                # Don't try to INSERT the raw content as a column
                del download['_raw_download_content']

                # Cast some columns
                download['feeds_id'] = int(
                    download['feeds_id']
                ) if 'feeds_id' in download else None  # NULL
                download['stories_id'] = int(
                    download['stories_id']
                ) if 'stories_id' in download else None  # NULL
                download['parent'] = int(
                    download['parent']
                ) if 'parent' in download else None  # NULL
                download['priority'] = int(
                    download['priority']
                ) if 'priority' in download else 0  # NOT NULL
                download['sequence'] = int(
                    download['sequence']
                ) if 'sequence' in download else 0  # NOT NULL
                download['extracted'] = 't' if download.get('extracted',
                                                            False) else 'f'

                # Will be rewritten by handle_download()
                download['path'] = ''

                download = db.create(table='downloads', insert_hash=download)

                # Create mock response to import it
                response = FakeResponse(content=raw_download_content)
                handler = handler_for_download(db=db, download=download)
                handler.store_response(db=db,
                                       download=download,
                                       response=response)

    log.info("Committing...")
    db.commit()

    log.info(f"Done importing downloads from {csv_file}")
Code example #7
def _fetch_tweets_for_day(
        db: DatabaseHandler,
        twitter_class: typing.Type[AbstractTwitter],
        topic: dict,
        topic_tweet_day: dict,
        max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets from CH and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    twitter_class - AbstractTwitter class
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if topic_tweet_day['tweets_fetched']:
        return

    ch_posts_data = topic_tweet_day['ch_posts']

    ch_posts = ch_posts_data['posts']

    if (max_tweets is not None):
        ch_posts = ch_posts[0:max_tweets]

    log.info("adding %d tweets for topic %s, day %s" % (len(ch_posts), topic['topics_id'], topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(ch_posts), 100):
        _add_tweets_to_ch_posts(twitter_class, ch_posts[i:i + 100])

    ch_posts = list(filter(lambda p: _post_matches_pattern(topic, p), ch_posts))

    log.info("%d tweets remaining after match" % (len(ch_posts)))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    [_store_tweet_and_urls(db, topic_tweet_day, ch_post) for ch_post in ch_posts]

    topic_tweet_day['num_ch_tweets'] = len(ch_posts)

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_ch_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {'a': topic_tweet_day['num_ch_tweets'], 'b': topic_tweet_day['topic_tweet_days_id']})

    db.commit()

    log.debug("done inserting into topic_tweets")
Code example #8
def _fetch_tweets_for_day(db: DatabaseHandler,
                          topic_tweet_day: dict,
                          meta_tweets: list,
                          max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    topic_tweet_day - topic_tweet_day dict
    meta_tweets - list of meta tweets found for day
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if (max_tweets is not None):
        meta_tweets = meta_tweets[0:max_tweets]

    topics_id = topic_tweet_day['topics_id']
    log.info("adding %d tweets for topic %s, day %s" %
             (len(meta_tweets), topics_id, topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(meta_tweets), 100):
        _add_tweets_to_meta_tweets(meta_tweets[i:i + 100])

    topic = db.require_by_id('topics', topic_tweet_day['topics_id'])
    meta_tweets = list(
        filter(lambda p: _tweet_matches_pattern(topic, p), meta_tweets))

    log.info("%d tweets remaining after match" % (len(meta_tweets)))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    [
        _store_tweet_and_urls(db, topic_tweet_day, meta_tweet)
        for meta_tweet in meta_tweets
    ]

    topic_tweet_day['num_tweets'] = len(meta_tweets)

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {
            'a': topic_tweet_day['num_tweets'],
            'b': topic_tweet_day['topic_tweet_days_id']
        })

    db.commit()

    log.debug("done inserting into topic_tweets")
Code example #9
def print_long_running_job_states(db: DatabaseHandler, limit: int):
    media = db.query("""
        select m.*, mh.*
        from media m
            join media_health mh using ( media_id ) 
        where dup_media_id is null
        order by m.media_id asc limit %(a)s
    """, {'a': limit}).hashes()

    media_groups = {}

    num_media = len(media)
    for i, medium in enumerate(media):
        domain = get_url_distinctive_domain(medium['url'])
        log.warning("%s [%d/%d]" % (domain, i, num_media))

        if domain not in media_groups:
            media_groups[domain] = []

        media_groups[domain].append(medium)

        medium['medium_domain'] = domain
        medium['dup_domain_matches'] = True

        dup_media = db.query(
            "select m.*, mh.* from media m join media_health mh using ( media_id ) where dup_media_id = %(a)s",
            {'a': medium['media_id']}
        ).hashes()

        media_groups[domain].extend(dup_media)

        for dup_medium in dup_media:
            dup_domain = get_url_distinctive_domain(dup_medium['url'])
            dup_medium['medium_domain'] = dup_domain
            dup_medium['dup_domain_matches'] = domain == dup_domain

    db.query("DROP TABLE IF EXISTS media_dups")
    db.query(
        """
        CREATE TABLE media_dups (
            domain TEXT,
            media_id BIGINT
            )
        """)

    db.begin()
    for i, domain in enumerate(media_groups.keys()):
        log.warning("domain %s [%d/%d]" % (domain, i, len(media_groups.keys())))
        media = media_groups[domain]
        if len(media) > 1:
            for m in media:
                db.query("""
                    insert into media_dups (domain, media_id) values (%(a)s, %(b)s)
                """, {'a': domain, 'b': m['media_id']})
    db.commit()
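
The "if domain not in media_groups" bookkeeping above is exactly what collections.defaultdict removes; an equivalent sketch of the grouping step:

from collections import defaultdict

media_groups = defaultdict(list)
for medium in media:
    media_groups[get_url_distinctive_domain(medium['url'])].append(medium)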
Code example #10
def regenerate_api_key(db: DatabaseHandler, email: str) -> None:
    """Regenerate API key -- creates new non-IP limited API key, removes all IP-limited API keys."""

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception as _:
        raise McAuthProfileException(
            "User with email address '%s' does not exist." % email)

    db.begin()

    # Purge all IP-limited API keys
    db.query(
        """
        DELETE FROM auth_user_api_keys
        WHERE ip_address IS NOT NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    # Regenerate non-IP limited API key
    db.query(
        """
        UPDATE auth_user_api_keys

        -- DEFAULT points to a generation function
        SET api_key = DEFAULT

        WHERE ip_address IS NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthProfileException(
            "Unable to send email about reset API key.")

    db.commit()
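
The shape of this function generalizes: mutate inside a transaction, attempt the external side effect, roll back if it fails, commit only on success. A minimal sketch of the pattern (the names are placeholders, not MediaCloud APIs):

def run_transaction_with_notification(db, mutate, notify) -> None:
    """Commit DB changes only if the external notification succeeds."""
    db.begin()
    mutate(db)        # all DB changes stay inside one open transaction
    if not notify():  # the external side effect decides the transaction's fate
        db.rollback()
        raise RuntimeError("notification failed; changes rolled back")
    db.commit()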
Code example #11
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """
    Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])
    posts = list(filter(lambda p: content_matches_topic(p['content'], topic), posts))

    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    db.query("SET LOCAL citus.multi_shard_modify_mode TO 'sequential'")

    log.debug("inserting into topic_posts ...")

    [_store_post_and_urls(db, topic_post_day, meta_tweet) for meta_tweet in posts]

    db.query(
        """
        UPDATE topic_post_days SET
            posts_fetched = true,
            num_posts_stored = %(num_posts_stored)s,
            num_posts_fetched = %(num_posts_fetched)s
        WHERE
            topics_id = %(topics_id)s AND
            topic_post_days_id = %(topic_post_days_id)s
        """,
        {
            'num_posts_stored': len(posts),
            'num_posts_fetched': num_posts_fetched,
            'topics_id': topic_post_day['topics_id'],
            'topic_post_days_id': topic_post_day['topic_post_days_id'],
        }
    )

    db.commit()

    log.debug("done inserting into topic_posts")
Code example #12
File: stories.py Project: rleir/mediacloud
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
        from stories s, topic_stories ts, media m
        where s.stories_id = ts.stories_id
          and s.media_id = m.media_id
          and m.foreign_rss_links = true
          and ts.topics_id = %(a)s
          and not ts.valid_foreign_rss_story
        """, {
            'a': topic['topics_id']
        }).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {
                'a': story['stories_id']
            }).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            log.warning(
                f"Unable to fetch content for download {download['downloads_id']}: {ex}"
            )

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        db.begin()
        db.create(
            'topic_seed_urls', {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {
                'a': story['stories_id'],
                'b': topic['topics_id']
            })
        db.commit()
Code example #13
def regenerate_api_key(db: DatabaseHandler, email: str) -> None:
    """Regenerate API key -- creates new non-IP limited API key, removes all IP-limited API keys."""

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException(
            "User with email address '%s' does not exist." % email)

    db.begin()

    # Purge all API keys
    db.query(
        """
        DELETE FROM auth_user_api_keys
        WHERE auth_users_id = %(auth_users_id)s
    """, {'auth_users_id': user.user_id()})

    # Regenerate non-IP limited API key
    db.query(
        """
        INSERT INTO auth_user_api_keys (
            auth_users_id,
            api_key,
            ip_address
        )
        VALUES (
            %(auth_users_id)s,

            -- DEFAULT points to a generation function
            DEFAULT,

            NULL
        )
    """, {'auth_users_id': user.user_id()})

    message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthProfileException(
            "Unable to send email about reset API key.")

    db.commit()
Code example #14
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict,
                         posts: list) -> None:
    """
    Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" %
             (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries',
                           topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])
    posts = list(
        filter(lambda p: content_matches_topic(p['content'], topic), posts))

    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    log.debug("inserting into topic_posts ...")

    [
        _store_post_and_urls(db, topic_post_day, meta_tweet)
        for meta_tweet in posts
    ]

    db.query(
        """
        update topic_post_days set posts_fetched = true, num_posts_stored = %(a)s, num_posts_fetched = %(b)s
            where topic_post_days_id = %(c)s
        """, {
            'a': len(posts),
            'b': num_posts_fetched,
            'c': topic_post_day['topic_post_days_id']
        })

    db.commit()

    log.debug("done inserting into topic_posts")
Code example #15
File: profile.py Project: berkmancenter/mediacloud
def regenerate_api_key(db: DatabaseHandler, email: str) -> None:
    """Regenerate API key -- creates new non-IP limited API key, removes all IP-limited API keys."""

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    db.begin()

    # Purge all IP-limited API keys
    db.query("""
        DELETE FROM auth_user_api_keys
        WHERE ip_address IS NOT NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    # Regenerate non-IP limited API key
    db.query("""
        UPDATE auth_user_api_keys

        -- DEFAULT points to a generation function
        SET api_key = DEFAULT

        WHERE ip_address IS NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthProfileException("Unable to send email about reset API key.")

    db.commit()
Code example #16
File: solr.py Project: vishalbelsare/mediacloud
def queue_all_stories(db: DatabaseHandler) -> None:
    db.begin()

    db.query("TRUNCATE TABLE solr_import_stories")

    # "SELECT FROM processed_stories" because only processed stories should get imported. "ORDER BY" so that the
    # import is more efficient when pulling blocks of stories out.
    db.query("""
        INSERT INTO solr_import_stories (stories_id)
            SELECT stories_id
            FROM processed_stories
            GROUP BY stories_id
            ORDER BY stories_id
    """)

    db.commit()
Code example #17
File: stories.py Project: tidehc/mediacloud
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id and
                s.media_id = m.media_id and
                m.foreign_rss_links = true and
                ts.topics_id = %(a)s and
                not ts.valid_foreign_rss_story
        """, {
            'a': topic['topics_id']
        }).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {
                'a': story['stories_id']
            }).hash()

        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception:
            pass

        db.begin()
        db.create(
            'topic_seed_urls', {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {
                'a': story['stories_id'],
                'b': topic['topics_id']
            })
        db.commit()
Code example #18
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Set the normalized_url field of any row in media for which it is null.  Take care to lock the process
    so that only one process is doing this work at a time.
    """
    # put a lock on this because the process of generating all media urls will take a couple hours, and we don't
    # want all workers to do the work
    locked = False
    while not locked:
        if not _normalized_urls_out_of_date(db):
            return

        db.begin()

        # poll instead of block so that we can release the transaction and see whether someone else has already
        # updated all of the media
        locked = get_session_lock(
            db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False)

        if not locked:
            db.commit()
            log.info("sleeping for media_normalized_urls lock...")
            time.sleep(1)

    log.warning("updating media_normalized_urls ...")

    media = db.query(
        "select * from media where normalized_url is null").hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" %
                 (i, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'],
                        {'normalized_url': normalized_url})

    db.commit()
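
If get_session_lock is unavailable, the same poll-then-sleep loop can be written directly against PostgreSQL session advisory locks; a sketch, assuming an arbitrary application-chosen lock key:

import time

MEDIA_NORMALIZED_URLS_LOCK = 123456  # arbitrary application-defined key (an assumption)

while not db.query("SELECT pg_try_advisory_lock(%(a)s)",
                   {'a': MEDIA_NORMALIZED_URLS_LOCK}).flat()[0]:
    log.info("sleeping for media_normalized_urls lock...")
    time.sleep(1)

# ... do the work, then release:
db.query("SELECT pg_advisory_unlock(%(a)s)", {'a': MEDIA_NORMALIZED_URLS_LOCK})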
Code example #19
def queue_all_stories(db: DatabaseHandler, stories_queue_table: str = 'solr_import_stories') -> None:
    stories_queue_table = decode_object_from_bytes_if_needed(stories_queue_table)

    db.begin()

    db.query(f"TRUNCATE TABLE {stories_queue_table}")

    # "SELECT FROM processed_stories" because only processed stories should get imported. "ORDER BY" so that the
    # import is more efficient when pulling blocks of stories out.
    db.query(f"""
        INSERT INTO {stories_queue_table}
            SELECT stories_id
            FROM processed_stories
            GROUP BY stories_id
            ORDER BY stories_id
    """)

    db.commit()
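
Unlike example #16, this variant splices the table name in with an f-string, which is only safe because callers control stories_queue_table. With plain psycopg2 the identifier could be quoted instead (a sketch; "cur" is an assumed psycopg2 cursor):

from psycopg2 import sql

stories_queue_table = 'solr_import_stories'  # as in the default above
truncate = sql.SQL("TRUNCATE TABLE {}").format(sql.Identifier(stories_queue_table))
# cur.execute(truncate)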
Code example #20
def extract_and_process_story(
    db: DatabaseHandler,
    story: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story()."""

    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
          AND state = 'success'
        ORDER BY downloads_id ASC
    """, {
            'stories_id': stories_id
        }).hashes()

    # MC_REWRITE_TO_PYTHON: Perlism
    if downloads is None:
        downloads = []

    for download in downloads:
        log.debug("Extracting download {} for story {}...".format(
            download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db,
                                         download=download,
                                         extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
Code example #21
def activate_user_via_token(db: DatabaseHandler, email: str,
                            activation_token: str) -> None:
    """Change password with a password token sent by email."""

    email = decode_object_from_bytes_if_needed(email)
    activation_token = decode_object_from_bytes_if_needed(activation_token)

    if not email:
        raise McAuthRegisterException("Email is empty.")
    if not activation_token:
        raise McAuthRegisterException('Activation token is empty.')

    # Validate the token once more (was pre-validated in controller)
    if not password_reset_token_is_valid(
            db=db, email=email, password_reset_token=activation_token):
        raise McAuthRegisterException('Activation token is invalid.')

    db.begin()

    # Mark the user as active
    db.query(
        """
        UPDATE auth_users
        SET active = TRUE
        WHERE email = %(email)s
    """, {'email': email})

    # Unset the password reset token
    db.query(
        """
        UPDATE auth_users
        SET password_reset_token_hash = NULL
        WHERE email = %(email)s
    """, {'email': email})

    user = user_info(db=db, email=email)

    message = AuthActivatedMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthRegisterException(
            "Unable to send email about an activated user.")

    db.commit()
Code example #22
File: map.py Project: vishalbelsare/mediacloud
def _store_map(db: DatabaseHandler,
               topics_id: int,
               timespans_id: int,
               content: bytes,
               graph_format: str,
               color_by: str) -> None:
    """Create a timespans_map row."""
    db.begin()

    options = {'color_by': color_by}
    options_json = encode_json(options)

    db.query(
        """
            DELETE FROM timespan_maps
            WHERE timespans_id = %(a)s
              AND format = %(b)s
              AND options = %(c)s
        """,
        {'a': timespans_id, 'b': graph_format, 'c': options_json}
    )

    timespan_map = {
        'topics_id': topics_id,
        'timespans_id': timespans_id,
        'options': options_json,
        'format': graph_format
    }
    timespan_map = db.create('timespan_maps', timespan_map)

    db.commit()

    content_types = {
        'svg': 'image/svg+xml',
        'gexf': 'xml/gexf'
    }
    content_type = content_types[graph_format]

    store_content(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'], content, content_type)

    url = get_content_url(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'])

    db.update_by_id('timespan_maps', timespan_map['timespan_maps_id'], {'url': url})
Code example #23
File: profile.py Project: berkmancenter/mediacloud
def all_users(db: DatabaseHandler) -> List[CurrentUser]:
    """Fetch and return a list of users and their roles."""

    # Start a transaction so that the list of users doesn't change while we run separate queries with user_info()
    db.begin()

    user_emails = db.query("""
        SELECT email
        FROM auth_users
        ORDER BY auth_users_id
    """).flat()

    users = []

    for email in user_emails:
        users.append(user_info(db=db, email=email))

    db.commit()

    return users
Code example #24
File: media.py Project: berkmancenter/mediacloud
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Set the normalized_url field of any row in media for which it is null.  Take care to lock the process
    so that only one process is doing this work at a time.
    """
    # put a lock on this because the process of generating all media urls will take a couple hours, and we don't
    # want all workers to do the work
    locked = False
    while not locked:
        if not _normalized_urls_out_of_date(db):
            return

        db.begin()

        # poll instead of block so that we can release the transaction and see whether someone else has already
        # updated all of the media
        locked = get_session_lock(db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False)

        if not locked:
            db.commit()
            log.info("sleeping for media_normalized_urls lock...")
            time.sleep(1)

    log.warning("updating media_normalized_urls ...")

    media = db.query("select * from media where normalized_url is null").hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" % (i, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'], {'normalized_url': normalized_url})

    db.commit()
Code example #25
File: stories.py Project: berkmancenter/mediacloud
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id and
                s.media_id = m.media_id and
                m.foreign_rss_links = true and
                ts.topics_id = %(a)s and
                not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception:
            pass

        db.begin()
        db.create('topic_seed_urls', {
            'url': story['url'],
            'topics_id': topic['topics_id'],
            'source': 'merge_foreign_rss_stories',
            'content': content
        })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
Code example #26
File: register.py Project: berkmancenter/mediacloud
def activate_user_via_token(db: DatabaseHandler, email: str, activation_token: str) -> None:
    """Change password with a password token sent by email."""

    email = decode_object_from_bytes_if_needed(email)
    activation_token = decode_object_from_bytes_if_needed(activation_token)

    if not email:
        raise McAuthRegisterException("Email is empty.")
    if not activation_token:
        raise McAuthRegisterException('Activation token is empty.')

    # Validate the token once more (was pre-validated in controller)
    if not password_reset_token_is_valid(db=db, email=email, password_reset_token=activation_token):
        raise McAuthRegisterException('Activation token is invalid.')

    db.begin()

    # Mark the user as active
    db.query("""
        UPDATE auth_users
        SET active = TRUE
        WHERE email = %(email)s
    """, {'email': email})

    # Unset the password reset token
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = NULL
        WHERE email = %(email)s
    """, {'email': email})

    user = user_info(db=db, email=email)

    message = AuthActivatedMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthRegisterException("Unable to send email about an activated user.")

    db.commit()
Code example #27
def run_provider(db: DatabaseHandler, daemon: bool = True) -> None:
    """Run the provider daemon to periodically add crawler_fetcher jobs by querying for pending downloads.

    Poll forever as a daemon.  Every QUEUE_INTERVAL seconds, check whether queued_downloads
    has less than MAX_QUEUE_SIZE jobs. If it does, call provide_download_ids and queue a
    fetcher job for each provided download_id.

    When run as a daemon, this function effectively throttles each host to no more than one download every
    QUEUE_INTERVAL seconds because provide_download_ids only provides one downloads_id for each host.
    """
    while True:
        queue_size = db.query(
            "select count(*) from ( select 1 from queued_downloads limit %(a)s ) q",
            {
                'a': MAX_QUEUE_SIZE * 10
            }).flat()[0]
        log.warning("queue_size: %d" % queue_size)

        if queue_size < MAX_QUEUE_SIZE:
            downloads_ids = provide_download_ids(db)
            log.warning("adding to downloads to queue: %d" %
                        len(downloads_ids))

            db.begin()
            for i in downloads_ids:
                db.query(
                    "insert into queued_downloads(downloads_id) values(%(a)s) on conflict (downloads_id) do nothing",
                    {'a': i})
            db.commit()

            if daemon:
                time.sleep(QUEUE_INTERVAL)

        elif daemon:
            time.sleep(QUEUE_INTERVAL * 10)

        if not daemon:
            break
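
The per-row insert loop holds a single transaction but still round-trips once per downloads_id. With plain psycopg2 the same queueing collapses into one statement (a sketch; "conn" is an assumed psycopg2 connection):

from psycopg2.extras import execute_values

with conn, conn.cursor() as cur:
    execute_values(
        cur,
        """
        INSERT INTO queued_downloads (downloads_id)
        VALUES %s
        ON CONFLICT (downloads_id) DO NOTHING
        """,
        [(downloads_id,) for downloads_id in downloads_ids],
    )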
Code example #28
def update_user(db: DatabaseHandler, user_updates: ModifyUser) -> None:
    """Update an existing user."""

    if not user_updates:
        raise McAuthProfileException("Existing user is undefined.")

    # Check if user exists
    try:
        user = user_info(db=db, email=user_updates.email())
    except Exception as _:
        raise McAuthProfileException(
            'User with email address "%s" does not exist.' %
            user_updates.email())

    db.begin()

    if user_updates.full_name() is not None:
        db.query(
            """
            UPDATE auth_users
            SET full_name = %(full_name)s
            WHERE email = %(email)s
        """, {
                'full_name': user_updates.full_name(),
                'email': user_updates.email(),
            })

    if user_updates.notes() is not None:
        db.query(
            """
            UPDATE auth_users
            SET notes = %(notes)s
            WHERE email = %(email)s
        """, {
                'notes': user_updates.notes(),
                'email': user_updates.email(),
            })

    if user_updates.active() is not None:
        db.query(
            """
            UPDATE auth_users
            SET active = %(active)s
            WHERE email = %(email)s
        """, {
                'active': bool(int(user_updates.active())),
                'email': user_updates.email(),
            })

    if user_updates.password() is not None:
        try:
            change_password(
                db=db,
                email=user_updates.email(),
                new_password=user_updates.password(),
                new_password_repeat=user_updates.password_repeat(),
                do_not_inform_via_email=True,
            )
        except Exception as ex:
            db.rollback()
            raise McAuthProfileException("Unable to change password: %s" %
                                         str(ex))

    if user_updates.weekly_requests_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
                'weekly_requests_limit': user_updates.weekly_requests_limit(),
                'auth_users_id': user.user_id(),
            })

    if user_updates.weekly_requested_items_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
                'weekly_requested_items_limit':
                user_updates.weekly_requested_items_limit(),
                'auth_users_id':
                user.user_id(),
            })

    if user_updates.role_ids() is not None:
        db.query(
            """
            DELETE FROM auth_users_roles_map
            WHERE auth_users_id = %(auth_users_id)s
        """, {'auth_users_id': user.user_id()})

        for auth_roles_id in user_updates.role_ids():
            db.insert(table='auth_users_roles_map',
                      insert_hash={
                          'auth_users_id': user.user_id(),
                          'auth_roles_id': auth_roles_id,
                      })

    db.commit()
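
Each optional field above gets its own UPDATE. The same effect could come from one statement built from only the fields that were set; a sketch (column names are taken from the code above, and the f-string is safe because the keys are hard-coded):

auth_users_updates = {
    'full_name': user_updates.full_name(),
    'notes': user_updates.notes(),
}
set_fields = {k: v for k, v in auth_users_updates.items() if v is not None}
if set_fields:
    assignments = ', '.join(f"{column} = %({column})s" for column in set_fields)
    db.query(
        f"UPDATE auth_users SET {assignments} WHERE email = %(email)s",
        {**set_fields, 'email': user_updates.email()},
    )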
Code example #29
File: stories.py Project: vishalbelsare/mediacloud
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query("""
        WITH topic_stories_from_topic AS (
            SELECT stories_id
            FROM topic_stories
            WHERE
                topics_id = %(topics_id)s AND
                (NOT valid_foreign_rss_story)
        )

        SELECT stories.*
        FROM stories
            INNER JOIN media ON
                stories.media_id = media.media_id AND
                media.foreign_rss_links
        WHERE stories.stories_id IN (
            SELECT stories_id
            FROM topic_stories_from_topic
        )
    """, {
        'topics_id': topic['topics_id'],
    }).hashes()

    for story in stories:
        download = db.query(
            """
                SELECT *
                FROM downloads
                WHERE stories_id = %(stories_id)s
                ORDER BY downloads_id
                LIMIT 1
            """,
            {
                'stories_id': story['stories_id'],
            }
        ).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            log.warning(f"Unable to fetch content for download {download['downloads_id']}: {ex}")

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        db.begin()
        db.create(
            'topic_seed_urls',
            {
                'topics_id': topic['topics_id'],
                'url': story['url'],
                'source': 'merge_foreign_rss_stories',
                'content': content,
            },
        )

        db.query(
            """
                UPDATE topic_links SET
                    ref_stories_id = NULL,
                    link_spidered = 'f'
                WHERE
                    topics_id = %(topics_id)s AND
                    ref_stories_id = %(ref_stories_id)s
            """,
            {
                'ref_stories_id': story['stories_id'],
                'topics_id': topic['topics_id'],
            },
        )

        db.query(
            """
            DELETE FROM topic_stories
            WHERE
                stories_id = %(stories_id)s AND
                topics_id = %(topics_id)s
            """,
            {
                'stories_id': story['stories_id'],
                'topics_id': topic['topics_id'],
            },
        )

        db.commit()
Code example #30
File: tagger.py Project: robpotter89/backend
    def update_tags_for_story(self, db: DatabaseHandler,
                              stories_id: int) -> None:
        """Add version, country and story tags for story."""

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        annotation = self.__annotation_store.fetch_annotation_for_story(
            db=db, stories_id=stories_id)
        if annotation is None:
            raise McJSONAnnotationTaggerException(
                "Unable to fetch annotation for story %d" % stories_id)

        tags = None
        try:
            tags = self._tags_for_annotation(annotation)
        except Exception as ex:
            # Programming error (should at least return an empty list)
            fatal_error("Unable to fetch tags for story %d: %s" % (
                stories_id,
                str(ex),
            ))

        if tags is None:
            raise McJSONAnnotationTaggerException(
                "Returned tags is None for story %d." % stories_id)

        log.debug("Tags for story %d: %s" % (
            stories_id,
            str(tags),
        ))

        db.begin()

        unique_tag_sets_names = set()
        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(
                tag.tag_sets_name)
            unique_tag_sets_names.add(tag_sets_name)

        # Delete old tags the story might have under a given tag set
        db.query(
            """
            DELETE FROM stories_tags_map
            WHERE stories_id = %(stories_id)s
              AND tags_id IN (
                SELECT tags_id
                FROM tags
                WHERE tag_sets_id IN (
                  SELECT tag_sets_id
                  FROM tag_sets
                  WHERE name = ANY(%(tag_sets_names)s)
                )
              )
        """, {
                'stories_id': stories_id,
                'tag_sets_names': list(unique_tag_sets_names)
            })

        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(
                tag.tag_sets_name)
            tags_name = self.__strip_linebreaks_and_whitespace(tag.tags_name)

            # Not using find_or_create() because tag set / tag might already exist
            # with slightly different label / description

            # Find or create a tag set
            db_tag_set = db.select(table='tag_sets',
                                   what_to_select='*',
                                   condition_hash={
                                       'name': tag_sets_name
                                   }).hash()
            if db_tag_set is None:
                db.query(
                    """
                    INSERT INTO tag_sets (name, label, description)
                    VALUES (%(name)s, %(label)s, %(description)s)
                    ON CONFLICT (name) DO NOTHING
                """, {
                        'name': tag_sets_name,
                        'label': tag.tag_sets_label,
                        'description': tag.tag_sets_description
                    })
                db_tag_set = db.select(table='tag_sets',
                                       what_to_select='*',
                                       condition_hash={
                                           'name': tag_sets_name
                                       }).hash()
            tag_sets_id = int(db_tag_set['tag_sets_id'])

            # Find or create tag
            db_tag = db.select(table='tags',
                               what_to_select='*',
                               condition_hash={
                                   'tag_sets_id': tag_sets_id,
                                   'tag': tags_name,
                               }).hash()
            if db_tag is None:
                db.query(
                    """
                    INSERT INTO tags (tag_sets_id, tag, label, description)
                    VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                    ON CONFLICT (tag, tag_sets_id) DO NOTHING
                """, {
                        'tag_sets_id': tag_sets_id,
                        'tag': tags_name,
                        'label': tag.tags_label,
                        'description': tag.tags_description,
                    })
                db_tag = db.select(table='tags',
                                   what_to_select='*',
                                   condition_hash={
                                       'tag_sets_id': tag_sets_id,
                                       'tag': tags_name,
                                   }).hash()
            tags_id = int(db_tag['tags_id'])

            # Assign story to tag (if no such mapping exists yet)
            #
            # (partitioned table's INSERT trigger will take care of conflicts)
            #
            # Not using db.create() because it tests last_inserted_id, and on duplicates there would be no such
            # "last_inserted_id" set.
            db.query(
                """
                INSERT INTO stories_tags_map (stories_id, tags_id)
                VALUES (%(stories_id)s, %(tags_id)s)
            """, {
                    'stories_id': stories_id,
                    'tags_id': tags_id,
                })

        db.commit()
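
The INSERT ... ON CONFLICT DO NOTHING followed by a re-SELECT above is deliberate: DO NOTHING suppresses RETURNING when the row already exists, so the id has to be fetched separately. A common alternative that always returns the row, at the cost of a no-op update (a sketch using the same names as above):

db_tag_set = db.query(
    """
    INSERT INTO tag_sets (name, label, description)
    VALUES (%(name)s, %(label)s, %(description)s)
    ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
    RETURNING *
    """,
    {
        'name': tag_sets_name,
        'label': tag.tag_sets_label,
        'description': tag.tag_sets_description,
    },
).hash()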
Code example #31
File: stories.py Project: zhanglipku/mediacloud
def add_story(db: DatabaseHandler,
              story: dict,
              feeds_id: int,
              skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(
            skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException(
            "add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".
                format(story['url'], story['guid']))
            return None

        else:
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story)))

    db.find_or_create(table='feeds_stories_map',
                      insert_hash={
                          'stories_id': story['stories_id'],
                          'feeds_id': feeds_id,
                      })

    db.commit()

    return story
Code example #32
File: profile.py Project: berkmancenter/mediacloud
def update_user(db: DatabaseHandler, user_updates: ModifyUser) -> None:
    """Update an existing user."""

    if not user_updates:
        raise McAuthProfileException("Existing user is undefined.")

    # Check if user exists
    try:
        user = user_info(db=db, email=user_updates.email())
    except Exception:
        raise McAuthProfileException('User with email address "%s" does not exist.' % user_updates.email())

    db.begin()

    if user_updates.full_name() is not None:
        db.query("""
            UPDATE auth_users
            SET full_name = %(full_name)s
            WHERE email = %(email)s
        """, {
            'full_name': user_updates.full_name(),
            'email': user_updates.email(),
        })

    if user_updates.notes() is not None:
        db.query("""
            UPDATE auth_users
            SET notes = %(notes)s
            WHERE email = %(email)s
        """, {
            'notes': user_updates.notes(),
            'email': user_updates.email(),
        })

    if user_updates.active() is not None:
        db.query("""
            UPDATE auth_users
            SET active = %(active)s
            WHERE email = %(email)s
        """, {
            'active': bool(int(user_updates.active())),
            'email': user_updates.email(),
        })

    if user_updates.password() is not None:
        try:
            change_password(
                db=db,
                email=user_updates.email(),
                new_password=user_updates.password(),
                new_password_repeat=user_updates.password_repeat(),
                do_not_inform_via_email=True,
            )
        except Exception as ex:
            db.rollback()
            raise McAuthProfileException("Unable to change password: %s" % str(ex))

    if user_updates.weekly_requests_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'weekly_requests_limit': user_updates.weekly_requests_limit(),
            'auth_users_id': user.user_id(),
        })

    if user_updates.weekly_requested_items_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'weekly_requested_items_limit': user_updates.weekly_requested_items_limit(),
            'auth_users_id': user.user_id(),
        })

    if user_updates.role_ids() is not None:
        db.query("""
            DELETE FROM auth_users_roles_map
            WHERE auth_users_id = %(auth_users_id)s
        """, {'auth_users_id': user.user_id()})

        for auth_roles_id in user_updates.role_ids():
            db.insert(table='auth_users_roles_map', insert_hash={
                'auth_users_id': user.user_id(),
                'auth_roles_id': auth_roles_id,
            })

    db.commit()
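
A hedged usage sketch: ModifyUser's constructor is assumed here to accept the same fields its accessors expose, and connect_to_db() is assumed to be the usual mediawords.db handle factory; neither signature is shown in the example above.

# Hypothetical usage; the ModifyUser keyword arguments below are assumptions.
from mediawords.db import connect_to_db

db = connect_to_db()

user_updates = ModifyUser(
    email='reporter@example.com',        # identifies the auth_users row to update
    full_name='Jane Reporter',           # each non-None field issues one UPDATE above
    notes='Moved to the research team',
    weekly_requests_limit=10000,
)

update_user(db=db, user_updates=user_updates)   # all changes commit in one transaction
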
Code Example #34
def add_user(db: DatabaseHandler, new_user: NewUser) -> None:
    """Add new user."""

    if not new_user:
        raise McAuthRegisterException("New user is undefined.")

    # Check if user already exists
    user_exists = db.query(
        """
        SELECT auth_users_id
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {
            'email': new_user.email()
        }).hash()

    if user_exists is not None and 'auth_users_id' in user_exists:
        raise McAuthRegisterException("User with email '%s' already exists." %
                                      new_user.email())

    # Hash + validate the password
    try:
        password_hash = generate_secure_hash(password=new_user.password())
        if not password_hash:
            raise McAuthRegisterException("Password hash is empty.")
    except Exception as ex:
        log.error("Unable to hash a new password: {}".format(ex))
        raise McAuthRegisterException('Unable to hash a new password.')

    db.begin()

    # Create the user
    db.create(table='auth_users',
              insert_hash={
                  'email': new_user.email(),
                  'password_hash': password_hash,
                  'full_name': new_user.full_name(),
                  'notes': new_user.notes(),
                  'active': bool(int(new_user.active())),
              })

    # Fetch the user's ID
    try:
        user = user_info(db=db, email=new_user.email())
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException(
            "I've attempted to create the user but it doesn't exist: %s" %
            str(ex))

    # Create roles
    try:
        for auth_roles_id in new_user.role_ids():
            db.create(table='auth_users_roles_map',
                      insert_hash={
                          'auth_users_id': user.user_id(),
                          'auth_roles_id': auth_roles_id,
                      })
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException("Unable to create roles: %s" % str(ex))

    # Update limits (if they're defined)
    if new_user.weekly_requests_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
                'auth_users_id': user.user_id(),
                'weekly_requests_limit': new_user.weekly_requests_limit(),
            })

    if new_user.weekly_requested_items_limit() is not None:
        db.query(
            """
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
                'auth_users_id': user.user_id(),
                'weekly_requested_items_limit': new_user.weekly_requested_items_limit(),
            })

    # Subscribe to newsletter
    if new_user.subscribe_to_newsletter():
        db.create(table='auth_users_subscribe_to_newsletter',
                  insert_hash={'auth_users_id': user.user_id()})

    if not new_user.active():
        send_user_activation_token(
            db=db,
            email=new_user.email(),
            activation_link=new_user.activation_url(),
            subscribe_to_newsletter=new_user.subscribe_to_newsletter(),
        )

    db.commit()
Code Example #35
File: __init__.py Project: nzufelt/mediacloud
    def update_tags_for_story(self, db: DatabaseHandler,
                              stories_id: int) -> None:
        """Add version, country and story tags for story."""

        if not self.annotator_is_enabled():
            fatal_error("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        annotation = self.fetch_annotation_for_story(db=db,
                                                     stories_id=stories_id)
        if annotation is None:
            raise McJSONAnnotatorException(
                "Unable to fetch annotation for story %d" % stories_id)

        tags = None
        try:
            tags = self._tags_for_annotation(annotation)
        except Exception as ex:
            # Programming error (should at least return an empty list)
            fatal_error("Unable to fetch tags for story %d: %s" % (
                stories_id,
                str(ex),
            ))

        if tags is None:
            raise McJSONAnnotatorException(
                "Returned tags is None for story %d." % stories_id)

        log.debug("Tags for story %d: %s" % (
            stories_id,
            str(tags),
        ))

        db.begin()

        # Delete old tags the story might have under a given tag set
        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(
                tag.tag_sets_name)
            db.query(
                """
                DELETE FROM stories_tags_map
                    USING tags, tag_sets
                WHERE stories_tags_map.tags_id = tags.tags_id
                  AND tags.tag_sets_id = tag_sets.tag_sets_id
                  AND stories_tags_map.stories_id = %(stories_id)s
                  AND tag_sets.name = %(tag_sets_name)s
            """, {
                    'stories_id': stories_id,
                    'tag_sets_name': tag_sets_name
                })

        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(
                tag.tag_sets_name)
            tags_name = self.__strip_linebreaks_and_whitespace(tag.tags_name)

            # Not using find_or_create() because tag set / tag might already exist
            # with slightly different label / description

            # Create tag set
            db_tag_set = db.select(table='tag_sets',
                                   what_to_select='*',
                                   condition_hash={
                                       'name': tag_sets_name
                                   }).hash()
            if db_tag_set is None:
                db.query(
                    """
                    INSERT INTO tag_sets (name, label, description)
                    VALUES (%(name)s, %(label)s, %(description)s)
                    ON CONFLICT (name) DO NOTHING
                """, {
                        'name': tag_sets_name,
                        'label': tag.tag_sets_label,
                        'description': tag.tag_sets_description
                    })
                db_tag_set = db.select(table='tag_sets',
                                       what_to_select='*',
                                       condition_hash={
                                           'name': tag_sets_name
                                       }).hash()
            tag_sets_id = int(db_tag_set['tag_sets_id'])

            # Create tag
            db_tag = db.select(table='tags',
                               what_to_select='*',
                               condition_hash={
                                   'tag_sets_id': tag_sets_id,
                                   'tag': tags_name,
                               }).hash()
            if db_tag is None:
                db.query(
                    """
                    INSERT INTO tags (tag_sets_id, tag, label, description)
                    VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                    ON CONFLICT (tag, tag_sets_id) DO NOTHING
                """, {
                        'tag_sets_id': tag_sets_id,
                        'tag': tags_name,
                        'label': tag.tags_label,
                        'description': tag.tags_description,
                    })
                db_tag = db.select(table='tags',
                                   what_to_select='*',
                                   condition_hash={
                                       'tag_sets_id': tag_sets_id,
                                       'tag': tags_name,
                                   }).hash()
            tags_id = int(db_tag['tags_id'])

            # Assign story to tag (if no such mapping exists yet)
            db.query(
                """
                INSERT INTO stories_tags_map (stories_id, tags_id)
                VALUES (%(stories_id)s, %(tags_id)s)
                ON CONFLICT (stories_id, tags_id) DO NOTHING
            """, {
                    'stories_id': stories_id,
                    'tags_id': tags_id,
                })

        db.commit()
Code Example #36
File: register.py Project: berkmancenter/mediacloud
def add_user(db: DatabaseHandler, new_user: NewUser) -> None:
    """Add new user."""

    if not new_user:
        raise McAuthRegisterException("New user is undefined.")

    # Check if user already exists
    user_exists = db.query("""
        SELECT auth_users_id
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': new_user.email()}).hash()

    if user_exists is not None and 'auth_users_id' in user_exists:
        raise McAuthRegisterException("User with email '%s' already exists." % new_user.email())

    # Hash + validate the password
    try:
        password_hash = generate_secure_hash(password=new_user.password())
        if not password_hash:
            raise McAuthRegisterException("Password hash is empty.")
    except Exception as ex:
        log.error("Unable to hash a new password: {}".format(ex))
        raise McAuthRegisterException('Unable to hash a new password.')

    db.begin()

    # Create the user
    db.create(
        table='auth_users',
        insert_hash={
            'email': new_user.email(),
            'password_hash': password_hash,
            'full_name': new_user.full_name(),
            'notes': new_user.notes(),
            'active': bool(int(new_user.active())),
        }
    )

    # Fetch the user's ID
    try:
        user = user_info(db=db, email=new_user.email())
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException("I've attempted to create the user but it doesn't exist: %s" % str(ex))

    # Create roles
    try:
        for auth_roles_id in new_user.role_ids():
            db.create(table='auth_users_roles_map', insert_hash={
                'auth_users_id': user.user_id(),
                'auth_roles_id': auth_roles_id,
            })
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException("Unable to create roles: %s" % str(ex))

    # Update limits (if they're defined)
    if new_user.weekly_requests_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'auth_users_id': user.user_id(),
            'weekly_requests_limit': new_user.weekly_requests_limit(),
        })

    if new_user.weekly_requested_items_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'auth_users_id': user.user_id(),
            'weekly_requested_items_limit': new_user.weekly_requested_items_limit(),
        })

    # Subscribe to newsletter
    if new_user.subscribe_to_newsletter():
        db.create(table='auth_users_subscribe_to_newsletter', insert_hash={'auth_users_id': user.user_id()})

    if not new_user.active():
        send_user_activation_token(
            db=db,
            email=new_user.email(),
            activation_link=new_user.activation_url(),
            subscribe_to_newsletter=new_user.subscribe_to_newsletter(),
        )

    db.commit()
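
For the happy path, a hedged sketch follows; the NewUser constructor keywords mirror the accessors add_user() calls above and are an assumption rather than a documented signature.

# Hypothetical usage; NewUser's constructor arguments are inferred, not documented.
from mediawords.db import connect_to_db

db = connect_to_db()

try:
    add_user(db=db, new_user=NewUser(
        email='newcomer@example.com',
        full_name='New Comer',
        notes='',
        role_ids=[1],                                # auth_roles rows to grant
        active=False,                                # triggers the activation email path
        password='correct horse battery staple',
        password_repeat='correct horse battery staple',
        activation_url='https://example.com/activate',
        subscribe_to_newsletter=False,
        weekly_requests_limit=1000,
        weekly_requested_items_limit=20000,
    ))
except McAuthRegisterException as ex:
    log.error("Could not register user: %s" % str(ex))
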
Code Example #37
def add_story(db: DatabaseHandler, story: dict,
              feeds_id: int) -> Optional[dict]:
    """Return an existing dup story if it matches the url, guid, or title; otherwise, add a new story and return it.

    Returns the found or created story; the returned dict has is_new = True if the story was created by this call.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)

    if db.in_transaction():
        raise McAddStoryException(
            "add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    db_story = find_dup_story(db, story)
    if db_story:
        log.debug("found existing dup story: %s [%s]" %
                  (story['title'], story['url']))
        db.commit()
        return db_story

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".
                format(story['url'], story['guid']))
            return None

        else:
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story)))

    story['is_new'] = True

    for url in (story['url'], story['guid']):
        insert_story_urls(db, story, url)

    # ON CONFLICT does not work with the partitioned feeds_stories_map, so guard against duplicates manually
    db.query(
        """
        insert into feeds_stories_map_p ( feeds_id, stories_id )
            select %(a)s, %(b)s where not exists (
                select 1 from feeds_stories_map where feeds_id = %(a)s and stories_id = %(b)s )
        """, {
            'a': feeds_id,
            'b': story['stories_id']
        })

    db.commit()

    log.debug("added story: %s" % story['url'])

    return story
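
A hedged sketch of a call site: the story dict keys below are inferred from the code above and from the usual stories schema (publish_date and collect_date are assumptions), and db is an open DatabaseHandler.

# Hypothetical call; some dict keys are schema assumptions.
story = add_story(
    db=db,
    story={
        'media_id': 1,
        'url': 'https://example.com/article',
        'guid': 'https://example.com/article',
        'title': 'Example article',
        'description': 'Example description',
        'publish_date': '2020-01-01 00:00:00',   # assumed required by the stories schema
        'collect_date': '2020-01-01 00:05:00',   # assumed required by the stories schema
    },
    feeds_id=42,
)

if story is None:
    log.info("lost a GUID race; a concurrent worker inserted the story first")
elif story.get('is_new'):
    log.info("created story %d" % story['stories_id'])
else:
    log.info("matched existing dup story %d" % story['stories_id'])
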
Code Example #38
File: __init__.py Project: zhanglipku/mediacloud
    def update_tags_for_story(self, db: DatabaseHandler, stories_id: int) -> None:
        """Add version, country and story tags for story."""

        if not self.annotator_is_enabled():
            fatal_error("Annotator is not enabled in the configuration.")

        # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
        if isinstance(stories_id, bytes):
            stories_id = decode_object_from_bytes_if_needed(stories_id)

        stories_id = int(stories_id)

        annotation = self.fetch_annotation_for_story(db=db, stories_id=stories_id)
        if annotation is None:
            raise McJSONAnnotatorException("Unable to fetch annotation for story %d" % stories_id)

        tags = None
        try:
            tags = self._tags_for_annotation(annotation)
        except Exception as ex:
            # Programming error (should at least return an empty list)
            fatal_error("Unable to fetch tags for story %d: %s" % (stories_id, str(ex),))

        if tags is None:
            raise McJSONAnnotatorException("Returned tags is None for story %d." % stories_id)

        log.debug("Tags for story %d: %s" % (stories_id, str(tags),))

        db.begin()

        unique_tag_sets_names = set()
        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
            unique_tag_sets_names.add(tag_sets_name)

        # Delete old tags the story might have under a given tag set
        db.query("""
            DELETE FROM stories_tags_map
            WHERE stories_id = %(stories_id)s
              AND tags_id IN (
                SELECT tags_id
                FROM tags
                WHERE tag_sets_id IN (
                  SELECT tag_sets_id
                  FROM tag_sets
                  WHERE name = ANY(%(tag_sets_names)s)
                )
              )
        """, {'stories_id': stories_id, 'tag_sets_names': list(unique_tag_sets_names)})

        for tag in tags:
            tag_sets_name = self.__strip_linebreaks_and_whitespace(tag.tag_sets_name)
            tags_name = self.__strip_linebreaks_and_whitespace(tag.tags_name)

            # Not using find_or_create() because tag set / tag might already exist
            # with slightly different label / description

            # Find or create a tag set
            db_tag_set = db.select(table='tag_sets', what_to_select='*', condition_hash={'name': tag_sets_name}).hash()
            if db_tag_set is None:
                db.query("""
                    INSERT INTO tag_sets (name, label, description)
                    VALUES (%(name)s, %(label)s, %(description)s)
                    ON CONFLICT (name) DO NOTHING
                """, {
                    'name': tag_sets_name,
                    'label': tag.tag_sets_label,
                    'description': tag.tag_sets_description
                })
                db_tag_set = db.select(table='tag_sets',
                                       what_to_select='*',
                                       condition_hash={'name': tag_sets_name}).hash()
            tag_sets_id = int(db_tag_set['tag_sets_id'])

            # Find or create tag
            db_tag = db.select(table='tags', what_to_select='*', condition_hash={
                'tag_sets_id': tag_sets_id,
                'tag': tags_name,
            }).hash()
            if db_tag is None:
                db.query("""
                    INSERT INTO tags (tag_sets_id, tag, label, description)
                    VALUES (%(tag_sets_id)s, %(tag)s, %(label)s, %(description)s)
                    ON CONFLICT (tag, tag_sets_id) DO NOTHING
                """, {
                    'tag_sets_id': tag_sets_id,
                    'tag': tags_name,
                    'label': tag.tags_label,
                    'description': tag.tags_description,
                })
                db_tag = db.select(table='tags', what_to_select='*', condition_hash={
                    'tag_sets_id': tag_sets_id,
                    'tag': tags_name,
                }).hash()
            tags_id = int(db_tag['tags_id'])

            # Assign story to tag (if no such mapping exists yet)
            # (partitioned table's INSERT trigger will take care of conflicts)
            #
            # db.create() can't be used here because:
            #
            # 1) Master table for partitioned table might not have a primary key itself, only the partitions do --
            #    FIXME maybe master tables should have primary keys? Or let's wait for when we move to PostgreSQL 10+.
            #
            # 2) Partitioned table's INSERT trigger doesn't return last_inserted_id which db.create() requires
            #    FIXME there might be a way for it to return the inserted row
            #
            db.query("""
                INSERT INTO stories_tags_map (stories_id, tags_id)
                VALUES (%(stories_id)s, %(tags_id)s)
            """, {
                'stories_id': stories_id,
                'tags_id': tags_id,
            })

        db.commit()
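
A hedged driver loop for the method above; NYTLabelsAnnotator is assumed to be a concrete subclass that implements fetch_annotation_for_story() and _tags_for_annotation().

# Hypothetical driver; the concrete annotator class name is an assumption.
annotator = NYTLabelsAnnotator()

stories_ids = db.query("""
    SELECT stories_id
    FROM stories
    ORDER BY stories_id
    LIMIT 100
""").flat()

for stories_id in stories_ids:
    annotator.update_tags_for_story(db=db, stories_id=stories_id)
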
Code Example #39
def update_story_sentences_and_language(
    db: DatabaseHandler,
    story: dict,
    extractor_args: PyExtractorArguments = PyExtractorArguments()
) -> None:
    """Update story vectors for the given story, updating "story_sentences".

    If extractor_args.no_delete() is True, do not try to delete existing entries in the above table before creating new
    ones (useful for optimization if you are very sure no story vectors exist for this story).

    If extractor_args.no_dedup_sentences() is True, do not perform sentence deduplication (useful if you are
    reprocessing a small set of stories).
    """

    story = decode_object_from_bytes_if_needed(story)

    use_transaction = not db.in_transaction()

    if use_transaction:
        db.begin()

    stories_id = story['stories_id']

    if not extractor_args.no_delete():
        _delete_story_sentences(db=db, story=story)

    story_text = story.get('story_text', None)
    if not story_text:
        story_text = get_text_for_word_counts(db=db, story=story)
        if not story_text:
            story_text = ''

    story_lang = language_code_for_text(text=story_text)

    sentences = _get_sentences_from_story_text(story_text=story_text,
                                               story_lang=story_lang)

    if not story.get('language', None) or story.get('language', None) != story_lang:
        db.query(
            """
            UPDATE stories
            SET language = %(story_lang)s
            WHERE stories_id = %(stories_id)s
        """, {
                'stories_id': stories_id,
                'story_lang': story_lang
            })
        story['language'] = story_lang

    if sentences is None:
        raise McUpdateStorySentencesAndLanguageException(
            "Sentences for story {} are undefined.".format(stories_id))

    if len(sentences) == 0:
        log.debug("Story {} doesn't have any sentences.".format(stories_id))
        # Commit before the early return so the transaction opened above isn't left dangling
        if use_transaction:
            db.commit()
        return

    sentences = _clean_sentences(sentences)

    _insert_story_sentences(
        db=db,
        story=story,
        sentences=sentences,
        no_dedup_sentences=extractor_args.no_dedup_sentences(),
    )

    story['ap_syndicated'] = _update_ap_syndicated(
        db=db,
        stories_id=stories_id,
        story_title=story['title'],
        story_text=story_text,
        story_language=story_lang,
    )

    if use_transaction:
        db.commit()
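
A minimal reprocessing sketch, assuming PyExtractorArguments takes constructor keywords matching the accessors used above (no_delete, no_dedup_sentences):

# Hypothetical call; the PyExtractorArguments keywords are assumptions.
story = db.find_by_id(table='stories', object_id=12345)

update_story_sentences_and_language(
    db=db,
    story=story,
    extractor_args=PyExtractorArguments(no_dedup_sentences=True),  # skip dedup when reprocessing a few stories
)
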
Code Example #40
File: media.py Project: ibrahimhaleemkhan/mediacloud
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep media_normalized_urls table up to date.

    This function compares the media and versions in media_normalized_urls against the version returned
    by mediawords.util.url.normalize_url_lossy_version() and updates or inserts rows for any media that do not
    have up-to-date versions.
    """
    if not _normalized_urls_out_of_date(db):
        return

    # put a lock on this because the process of generating all media urls will take around 30 seconds, and we don't
    # want all workers to do the work
    db.begin()
    db.query("lock media_normalized_urls in access exclusive mode")

    if not _normalized_urls_out_of_date(db):
        db.commit()
        return

    log.warning("updating media_normalized_urls ...")

    version = mediawords.util.url.normalize_url_lossy_version()

    media = db.query(
        """
        select m.*
            from media m
                left join media_normalized_urls u on
                    ( m.media_id = u.media_id and u.normalize_url_lossy_version = %(a)s)
            where
                u.normalized_url is null or
                u.db_row_last_updated < m.db_row_last_updated
        """, {
            'a': version
        }).hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" %
                 (i, total, medium['name'], normalized_url))

        db.query(
            "delete from media_normalized_urls where media_id = %(a)s and normalize_url_lossy_version = %(b)s",
            {
                'a': medium['media_id'],
                'b': version
            })
        db.create(
            'media_normalized_urls', {
                'media_id': medium['media_id'],
                'normalized_url': normalized_url,
                'normalize_url_lossy_version': version
            })

    db.commit()
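
The check-lock-recheck pattern above deserves a note: the first _normalized_urls_out_of_date() call is a cheap fast path taken without any lock, and the second call, made after acquiring the exclusive table lock, guarantees that only one worker performs the rebuild while the others see fresh data and return immediately. A compact, illustrative restatement of the same pattern:

# Generic check-lock-recheck sketch (illustrative only, not a mediacloud API):
def rebuild_if_stale(db, is_stale, rebuild):
    if not is_stale(db):        # fast path, no lock taken
        return
    db.begin()
    db.query("LOCK TABLE media_normalized_urls IN ACCESS EXCLUSIVE MODE")
    if not is_stale(db):        # another worker rebuilt while we waited for the lock
        db.commit()
        return
    rebuild(db)                 # only one worker at a time ever reaches this line
    db.commit()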