Example #1
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None

        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
def create(db: DatabaseHandler, download: dict, extract: dict) -> dict:
    """Create a download_text hash and insert it into the database. Delete any existing download_text row for the
    download."""

    # FIXME don't pass freeform "extract" dict, we need just the "extracted_text"

    download = decode_object_from_bytes_if_needed(download)
    extract = decode_object_from_bytes_if_needed(extract)

    db.query("""
        DELETE FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']})

    download_text = db.query("""
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
        RETURNING *
    """, {
        'downloads_id': download['downloads_id'],
        'download_text': extract['extracted_text'],
    }).hash()

    db.query("""
        UPDATE downloads
        SET extracted = 't'
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']})

    return download_text
def update_extractor_version_tag(db: DatabaseHandler, story: dict) -> None:
    """Add extractor version tag to the story."""
    # FIXME no caching because unit tests run in the same process so a cached tag set / tag will not be recreated.
    # Purging such a cache manually is very error-prone.

    story = decode_object_from_bytes_if_needed(story)

    tag_set = db.find_or_create(table='tag_sets', insert_hash={'name': extractor_version_tag_sets_name()})

    db.query("""
        DELETE FROM stories_tags_map AS stm
            USING tags AS t
                JOIN tag_sets AS ts
                    ON ts.tag_sets_id = t.tag_sets_id
        WHERE t.tags_id = stm.tags_id
          AND ts.tag_sets_id = %(tag_sets_id)s
          AND stm.stories_id = %(stories_id)s
    """, {
        'tag_sets_id': tag_set['tag_sets_id'],
        'stories_id': story['stories_id'],
    })

    extractor_version = extractor_name()
    tag = db.find_or_create(table='tags', insert_hash={'tag': extractor_version, 'tag_sets_id': tag_set['tag_sets_id']})
    tags_id = tag['tags_id']

    db.query("""
        INSERT INTO stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
    """, {'stories_id': story['stories_id'], 'tags_id': tags_id})
def get_links_from_story_text(db: DatabaseHandler, story: dict) -> typing.List[str]:
    """Get all urls that appear in the text or description of the story using a simple regex."""
    download_ids = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
        """, {'stories_id': story['stories_id']}
    ).flat()

    download_texts = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = ANY(%(download_ids)s)
        ORDER BY download_texts_id
        """, {'download_ids': download_ids}
    ).hashes()

    story_text = ' '.join([dt['download_text'] for dt in download_texts])

    story_text = story_text + ' ' + str(story['title']) if story['title'] is not None else story_text
    story_text = story_text + ' ' + str(story['description']) if story['description'] is not None else story_text

    links = []
    for url in re.findall(r'https?://[^\s\")]+', story_text):
        url = re.sub(r'\W+$', '', url)
        links.append(url)

    return links
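
# A minimal standalone sketch of the link-extraction regex used above; no database is needed and the
# sample text is made up.
import re

sample_text = 'See https://example.com/a-story), details at http://example.org/path?id=1.'

sample_links = []
for sample_url in re.findall(r'https?://[^\s\")]+', sample_text):
    # strip trailing punctuation (')' or '.') picked up from the surrounding prose
    sample_links.append(re.sub(r'\W+$', '', sample_url))

assert sample_links == ['https://example.com/a-story', 'http://example.org/path?id=1']
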
def _fetch_tweets_for_day(
        db: DatabaseHandler,
        twitter_class: typing.Type[AbstractTwitter],
        topic: dict,
        topic_tweet_day: dict,
        max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets from CH and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    twitter_class - AbstractTwitter class
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if topic_tweet_day['tweets_fetched']:
        return

    ch_posts_data = topic_tweet_day['ch_posts']

    ch_posts = ch_posts_data['posts']

    if (max_tweets is not None):
        ch_posts = ch_posts[0:max_tweets]

    log.info("adding %d tweets for topic %s, day %s" % (len(ch_posts), topic['topics_id'], topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(ch_posts), 100):
        _add_tweets_to_ch_posts(twitter_class, ch_posts[i:i + 100])

    ch_posts = list(filter(lambda p: _post_matches_pattern(topic, p), ch_posts))

    log.info("%d tweets remaining after match" % (len(ch_posts)))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    [_store_tweet_and_urls(db, topic_tweet_day, ch_post) for ch_post in ch_posts]

    topic_tweet_day['num_ch_tweets'] = len(ch_posts)

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_ch_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {'a': topic_tweet_day['num_ch_tweets'], 'b': topic_tweet_day['topic_tweet_days_id']})

    db.commit()

    log.debug("done inserting into topic_tweets")
Example #6
def lookup_medium(db: DatabaseHandler, url: str, name: str) -> typing.Optional[dict]:
    """Lookup a media source by normalized url and then name.

    Uses mediawords.util.url.normalize_url_lossy to normalize urls.  Returns the parent media for duplicate media
    sources and returns no media that are marked foreign_rss_links.

    This function queries the media.normalized_url field to find the matching urls.  Because the normalization
    function is in python, we have to keep that normalized_url field current from within python.  This function
    is responsible for keeping the table up to date by filling the field for any media for which it is null.

    Arguments:
    db - db handle
    url - url to lookup
    name - name to lookup

    Returns:
    a media source dict or None

    """
    _update_media_normalized_urls(db)

    nu = _normalize_url(url)

    lookup_query = \
        """
        select m.*
            from media m
            where
                m.normalized_url = %(a)s and
                foreign_rss_links = 'f'
            order by dup_media_id asc nulls last, media_id asc
        """

    medium = db.query(lookup_query, {'a': nu}).hash()

    if medium is None:
        medium = db.query(
            "select m.* from media m where lower(m.name) = lower(%(a)s) and m.foreign_rss_links = false",
            {'a': name}).hash()

    if medium is None:
        return None

    if medium['dup_media_id'] is not None:

        media_cycle_lookup = dict()  # type: dict
        while medium['dup_media_id'] is not None:
            if medium['media_id'] in media_cycle_lookup:
                raise McTopicMediaException('Cycle found in duplicate media path: ' + str(media_cycle_lookup.keys()))
            media_cycle_lookup[medium['media_id']] = True

            medium = db.query("select * from media where media_id = %(a)s", {'a': medium['dup_media_id']}).hash()

    if medium['foreign_rss_links']:
        raise McTopicMediaException('Parent duplicate media source %d has foreign_rss_links' % medium['media_id'])

    return medium
def _insert_tweet_urls(db: DatabaseHandler, topic_tweet: dict, urls: typing.List) -> typing.List:
    """Insert list of urls into topic_tweet_urls."""
    for url in urls:
        db.query(
            """
            insert into topic_tweet_urls( topic_tweets_id, url )
                values( %(a)s, %(b)s )
                on conflict do nothing
            """,
            {'a': topic_tweet['topic_tweets_id'], 'b': url})
def change_password(db: DatabaseHandler,
                    email: str,
                    new_password: str,
                    new_password_repeat: str,
                    do_not_inform_via_email: bool = False) -> None:
    """Change user's password."""

    email = decode_object_from_bytes_if_needed(email)
    new_password = decode_object_from_bytes_if_needed(new_password)
    new_password_repeat = decode_object_from_bytes_if_needed(new_password_repeat)

    if isinstance(do_not_inform_via_email, bytes):
        do_not_inform_via_email = decode_object_from_bytes_if_needed(do_not_inform_via_email)

    do_not_inform_via_email = bool(int(do_not_inform_via_email))

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthChangePasswordException('User with email address "%s" does not exist.' % email)

    password_validation_message = validate_new_password(email=email,
                                                        password=new_password,
                                                        password_repeat=new_password_repeat)
    if password_validation_message:
        raise McAuthChangePasswordException("Unable to change password: %s" % password_validation_message)

    # Hash + validate the password
    try:
        password_new_hash = generate_secure_hash(password=new_password)
    except Exception as ex:
        raise McAuthChangePasswordException("Unable to hash a new password: %s" % str(ex))

    if not password_new_hash:
        raise McAuthChangePasswordException("Generated password hash is empty.")

    # Set the password hash
    db.query("""
        UPDATE auth_users
        SET password_hash = %(password_hash)s,
            active = TRUE
        WHERE email = %(email)s
    """, {
        'email': email,
        'password_hash': password_new_hash,
    })

    if not do_not_inform_via_email:

        message = AuthPasswordChangedMessage(to=email, full_name=user.full_name())
        if not send_email(message):
            raise McAuthChangePasswordException(
                'The password has been changed, but I was unable to send an email notifying you about the change.'
            )
    def __remove_object_from_cache(self, db: DatabaseHandler, object_id: int) -> None:
        """Attempt to remove object from cache.

        Raise if removal fails because after removal we'd expect the object to be gone for good."""

        object_id = self._prepare_object_id(object_id)

        sql = "DELETE FROM %s " % self.__cache_table  # interpolated by Python
        sql += "WHERE object_id = %(object_id)s"  # interpolated by psycopg2

        db.query(sql, {'object_id': object_id})
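
# Standalone illustration of the two-step interpolation pattern used in the cache store methods above:
# the trusted, code-supplied table name is interpolated by Python, while the value placeholder is left
# for the database driver (psycopg2) to bind.  "cache_table" is a hypothetical table name.
cache_table = 'cache_table'
sample_sql = "DELETE FROM %s " % cache_table      # interpolated by Python
sample_sql += "WHERE object_id = %(object_id)s"   # interpolated by psycopg2 at query time

assert sample_sql == "DELETE FROM cache_table WHERE object_id = %(object_id)s"
# db.query(sample_sql, {'object_id': 42}) would then bind the value safely.
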
Example #10
def __get_topic_url_variants(db: DatabaseHandler, urls: List[str]) -> List[str]:
    """Get any alternative urls for the given url from topic_merged_stories or topic_links."""

    urls = decode_object_from_bytes_if_needed(urls)

    # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples
    stories_ids_sql = "SELECT stories_id "
    stories_ids_sql += "FROM stories "
    stories_ids_sql += "WHERE url = ANY(?)"
    stories_ids = db.query(stories_ids_sql, urls).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if stories_ids is None:
        stories_ids = []
    elif isinstance(stories_ids, int):
        stories_ids = [stories_ids]

    stories_ids = [int(x) for x in stories_ids]

    all_stories_ids = __get_merged_stories_ids(db=db, stories_ids=stories_ids)
    if len(all_stories_ids) == 0:
        return urls

    all_urls = db.query("""
        SELECT DISTINCT url
        FROM (
            SELECT redirect_url AS url
            FROM topic_links
            WHERE ref_stories_id = ANY(?)

            UNION

            SELECT url
            FROM topic_links
            WHERE ref_stories_id = ANY(?)

            UNION

            SELECT url
            FROM stories
            WHERE stories_id = ANY(?)
        ) AS q
        WHERE q IS NOT NULL
    """, all_stories_ids, all_stories_ids, all_stories_ids).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if all_urls is None:
        all_urls = []
    elif isinstance(all_urls, str):
        all_urls = [all_urls]

    return all_urls
Example #11
def create_password_reset_token(db: DatabaseHandler, email: str) -> Optional[str]:
    """Generate password reset token used for both activating newly registered users and resetting passwords.

    Returns non-hashed password reset token or None if user was not found.
    """

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if the email address exists in the user table; if not, pretend that we sent the activation link with a
    # "success" message. That way the adversary would not be able to find out which email addresses are active users.
    #
    # (Possible improvement: make the script work for the exact same amount of time in both cases to avoid timing
    # attacks)
    user_exists = db.query("""
        SELECT auth_users_id,
               email
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()
    if user_exists is None or len(user_exists) == 0:
        # User was not found, so set the email address to an empty string, but don't return just yet; continue with
        # the rather slow process of generating an activation token (in order to reduce the risk of timing attacks)
        email = ''

    # Generate the activation token
    password_reset_token = random_string(length=64)
    if len(password_reset_token) == 0:
        raise McAuthProfileException('Unable to generate an activation token.')

    # Hash + validate the activation token
    password_reset_token_hash = generate_secure_hash(password=password_reset_token)
    if not password_reset_token_hash:
        raise McAuthProfileException("Unable to hash an activation token.")

    # Set the activation token hash in the database (if the email address doesn't exist, this query will do nothing)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = %(password_reset_token_hash)s
        WHERE email = %(email)s
          AND email != ''
    """, {
        'email': email,
        'password_reset_token_hash': password_reset_token_hash,
    })

    return password_reset_token
Example #12
def get_consistent_color(db: DatabaseHandler, item_set: str, item_id: str) -> str:
    """Return the same hex color (e.g. "ff0000" for the same set / ID combination every time this function is called."""
    item_set = decode_object_from_bytes_if_needed(item_set)
    item_id = decode_object_from_bytes_if_needed(item_id)

    # Always return grey for null or not typed values
    if item_id.lower() in {'null', 'not typed'}:
        return '999999'

    color = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s AND id = %(item_id)s""", {
        'item_set': item_set,
        'item_id': item_id,
    }).flat()
    if color is not None and len(color):
        if isinstance(color, list):
            color = color[0]
        return color

    set_colors = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s""", {
        'item_set': item_set,
    }).flat()
    if set_colors is not None:
        if not isinstance(set_colors, list):
            set_colors = [set_colors]

    existing_colors = set()

    if set_colors is not None:
        for color in set_colors:
            existing_colors.add(color)

    # Use the hard coded palette of 25 colors if possible
    new_color = None
    for color in __MC_COLORS:
        if color not in existing_colors:
            new_color = color
            break

    # Otherwise, just generate a random color
    if new_color is None:
        colors = analogous_color(color='0000ff', return_slices=256, split_slices=255)
        new_color = random.choice(colors)

    db.create(table='color_sets', insert_hash={
        'color_set': item_set,
        'id': item_id,
        'color': new_color,
    })

    return new_color
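
# A rough in-memory sketch of the color-picking fallback above.  SAMPLE_PALETTE is a hypothetical
# stand-in for the hard-coded __MC_COLORS palette, and the plain random hex color stands in for
# analogous_color(); the real function also queries and updates the color_sets table.
import random

SAMPLE_PALETTE = ['ff0000', '00ff00', '0000ff']

def pick_new_color(existing_colors: set) -> str:
    # prefer the first palette color that is not taken yet
    for palette_color in SAMPLE_PALETTE:
        if palette_color not in existing_colors:
            return palette_color
    # otherwise fall back to a random color
    return '%06x' % random.randint(0, 0xffffff)

assert pick_new_color({'ff0000'}) == '00ff00'
assert len(pick_new_color(set(SAMPLE_PALETTE))) == 6
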
Example #13
def regenerate_api_key(db: DatabaseHandler, email: str) -> None:
    """Regenerate API key -- creates new non-IP limited API key, removes all IP-limited API keys."""

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user = user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    db.begin()

    # Purge all IP-limited API keys
    db.query("""
        DELETE FROM auth_user_api_keys
        WHERE ip_address IS NOT NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    # Regenerate non-IP limited API key
    db.query("""
        UPDATE auth_user_api_keys

        -- DEFAULT points to a generation function
        SET api_key = DEFAULT

        WHERE ip_address IS NULL
          AND auth_users_id = (
            SELECT auth_users_id
            FROM auth_users
            WHERE email = %(email)s
          )
    """, {'email': email})

    message = AuthAPIKeyResetMessage(to=email, full_name=user.full_name())
    if not send_email(message):
        db.rollback()
        raise McAuthProfileException("Unable to send email about reset API key.")

    db.commit()
Example #14
def _get_ap_dup_sentence_lengths(db: DatabaseHandler, story_text: str) -> List[int]:
    story_text = decode_object_from_bytes_if_needed(story_text)

    ap_media_id = _get_ap_media_id(db=db)

    if ap_media_id is None:
        return []

    sentences = _get_sentences_from_content(story_text=story_text)

    md5s = []
    for sentence in sentences:
        md5_hash = hashlib.md5(sentence.encode('utf-8')).hexdigest()
        md5s.append(md5_hash)

    sentence_lengths = db.query("""
        SELECT length(sentence) AS len
        FROM story_sentences
        WHERE media_id = %(ap_media_id)s

          -- FIXME this probably never worked because the index is half_md5(), not md5()
          AND md5(sentence) = ANY(%(md5s)s)
    """, {
        'ap_media_id': ap_media_id,
        'md5s': md5s,
    }).flat()

    # MC_REWRITE_TO_PYTHON: Perlism
    if sentence_lengths is None:
        sentence_lengths = []

    return sentence_lengths
Example #15
def process_download_for_extractor(db: DatabaseHandler,
                                   download: dict,
                                   extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract the download and create the resulting download_text entry. If there are no remaining downloads to be
    extracted for the story, call process_extracted_story() on the parent story."""

    download = decode_object_from_bytes_if_needed(download)

    stories_id = download['stories_id']

    log.debug("extract: {} {} {}".format(download['downloads_id'], stories_id, download['url']))

    extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    has_remaining_download = db.query("""
        SELECT downloads_id
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND extracted = 'f'
          AND type = 'content'
    """, {'stories_id': stories_id}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if has_remaining_download is None:
        has_remaining_download = {}

    if len(has_remaining_download) > 0:
        log.info("Pending more downloads...")

    else:
        story = db.find_by_id(table='stories', object_id=stories_id)
        process_extracted_story(db=db, story=story, extractor_args=extractor_args)
Example #16
def _get_story_with_most_sentences(db: DatabaseHandler, stories: list) -> dict:
    """Given a list of stories, return the story with the most sentences."""
    assert len(stories) > 0

    if len(stories) == 1:
        return stories[0]

    story = db.query(
        """
        select s.*
            from stories s
            where stories_id in (
                select stories_id
                    from story_sentences
                    where stories_id = any (%(a)s)
                    group by stories_id
                    order by count(*) desc
                    limit 1
            )
        """,
        {'a': [s['stories_id'] for s in stories]}).hash()

    if story is not None:
        return story
    else:
        return stories[0]
def _get_seeded_content(db: DatabaseHandler, topic_fetch_url: dict) -> typing.Optional[FetchLinkResponse]:
    """Return content for this url and topic in topic_seed_urls.

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_url dict from db

    Returns:
    dummy response object

    """
    r = db.query(
        "select content from topic_seed_urls where topics_id = %(a)s and url = %(b)s and content is not null",
        {'a': topic_fetch_url['topics_id'], 'b': topic_fetch_url['url']}).flat()

    if len(r) == 0:
        return None

    return FetchLinkResponse(
        url=topic_fetch_url['url'],
        is_success=True,
        code=HTTPStatus.OK.value,
        message=HTTPStatus.OK.phrase,
        content=r[0],
        last_requested_url=topic_fetch_url['url'],
    )
Example #18
def password_reset_token_is_valid(db: DatabaseHandler, email: str, password_reset_token: str) -> bool:
    """Validate password reset token (used for both user activation and password reset)."""
    email = decode_object_from_bytes_if_needed(email)
    password_reset_token = decode_object_from_bytes_if_needed(password_reset_token)

    if not (email and password_reset_token):
        log.error("Email and / or password reset token is empty.")
        return False

    # Fetch readonly information about the user
    password_reset_token_hash = db.query("""
        SELECT auth_users_id,
               email,
               password_reset_token_hash
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': email}).hash()
    if password_reset_token_hash is None or 'auth_users_id' not in password_reset_token_hash:
        log.error("Unable to find user %s in the database." % email)
        return False

    password_reset_token_hash = password_reset_token_hash['password_reset_token_hash']

    if password_hash_is_valid(password_hash=password_reset_token_hash, password=password_reset_token):
        return True
    else:
        return False
Example #19
def extract_and_process_story(db: DatabaseHandler,
                              story: dict,
                              extractor_args: PyExtractorArguments = PyExtractorArguments()) -> None:
    """Extract all of the downloads for the given story and then call process_extracted_story()."""

    story = decode_object_from_bytes_if_needed(story)

    stories_id = story['stories_id']

    use_transaction = not db.in_transaction()
    if use_transaction:
        db.begin()

    log.debug("Fetching downloads for story {}...".format(stories_id))
    downloads = db.query("""
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
          AND type = 'content'
        ORDER BY downloads_id ASC
    """, {'stories_id': stories_id}).hashes()

    # MC_REWRITE_TO_PYTHON: Perlism
    if downloads is None:
        downloads = []

    for download in downloads:
        log.debug("Extracting download {} for story {}...".format(download['downloads_id'], stories_id))
        extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)

    log.debug("Processing extracted story {}...".format(stories_id))
    process_extracted_story(db=db, story=story, extractor_args=extractor_args)

    if use_transaction:
        db.commit()
Example #20
def lookup_tag(db: DatabaseHandler, tag_name: str) -> Optional[Dict[str, Any]]:
    """Lookup the tag given the tag_set:tag format."""
    tag_name = decode_object_from_bytes_if_needed(tag_name)

    if not tag_name:
        log.warning("Tag name is empty.")
        return None

    if not re.match(pattern='^([^:]*):([^:]*)$', string=tag_name):
        log.warning("Unable to parse tag name '{}'.".format(tag_name))
        return None

    tag_set_name, tag = tag_name.split(':')

    found_tag = db.query("""
        SELECT t.*
        FROM tags AS t,
             tag_sets AS ts
        WHERE t.tag_sets_id = ts.tag_sets_id
          AND t.tag = %(tag)s
          AND ts.name = %(tag_set_name)s
    """, {'tag': tag, 'tag_set_name': tag_set_name}).hash()

    # MC_REWRITE_TO_PYTHON: Perlism
    if found_tag is None:
        found_tag = {}

    return found_tag
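
# Standalone illustration of the 'tag_set:tag' parsing done in lookup_tag() above; the tag name is made up.
import re

sample_tag_name = 'date_guess_method:guess_by_url'

if re.match(pattern='^([^:]*):([^:]*)$', string=sample_tag_name):
    sample_tag_set_name, sample_tag = sample_tag_name.split(':')
    assert (sample_tag_set_name, sample_tag) == ('date_guess_method', 'guess_by_url')
else:
    # e.g. 'no-colon-here' or 'too:many:colons' would be rejected
    raise ValueError("Unable to parse tag name '{}'.".format(sample_tag_name))
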
Example #21
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay download of content by this many hours. This is useful for sources that are likely to significantly change
        # content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
def extract_links_for_topic_story(db: DatabaseHandler, story: dict, topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.spidered to true for that story.  Calls get_links_from_story
    on each story.

    Almost all errors are caught by this function and saved in topic_stories.link_mine_error.  In the case of an error,
    topic_stories.link_mined is also set to true.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None

    """
    try:
        log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))
        links = get_links_from_story(db, story)

        for link in links:
            if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], link):
                log.info("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            mediawords.tm.domains.increment_domain_links(db, topic_link)

        link_mine_error = ''
    except Exception:
        link_mine_error = traceback.format_exc()

    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': topic['topics_id'], 'c': link_mine_error})
Example #23
def assign_date_guess_tag(
        db: DatabaseHandler,
        story: dict,
        date_guess: GuessDateResult,
        fallback_date: typing.Optional[str]) -> None:
    """Assign a guess method tag to the story based on the date_guess result.

    If date_guess found a result, assign a date_guess_method:guess_by_url, guess_by_tag_*, or guess_by_unknown tag.
    Otherwise, if there is a fallback_date, assign date_guess_method:fallback_date.  Else assign
    date_invalid:date_invalid.

    Arguments:
    db - db handle
    story - story dict from db
    date_guess - GuessDateResult from guess_date() call

    Returns:
    None

    """
    if date_guess.found:
        tag_set = mediawords.tm.guess_date.GUESS_METHOD_TAG_SET
        guess_method = date_guess.guess_method
        if guess_method.startswith('Extracted from url'):
            tag = 'guess_by_url'
        elif guess_method.startswith('Extracted from tag'):
            match = re2.search(r'\<(\w+)', guess_method)
            html_tag = match.group(1) if match is not None else 'unknown'
            tag = 'guess_by_tag_' + str(html_tag)
        else:
            tag = 'guess_by_unknown'
    elif fallback_date is not None:
        tag_set = mediawords.tm.guess_date.GUESS_METHOD_TAG_SET
        tag = 'fallback_date'
    else:
        tag_set = mediawords.tm.guess_date.INVALID_TAG_SET
        tag = mediawords.tm.guess_date.INVALID_TAG

    ts = db.find_or_create('tag_sets', {'name': tag_set})
    t = db.find_or_create('tags', {'tag': tag, 'tag_sets_id': ts['tag_sets_id']})

    db.query("delete from stories_tags_map where stories_id = %(a)s", {'a': story['stories_id']})
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': t['tags_id']})
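
# A standalone sketch of the guess-method-to-tag mapping used in assign_date_guess_tag() above, using the
# standard re module instead of re2 and a made-up guess_method string.
import re

def tag_for_guess_method(guess_method: str) -> str:
    if guess_method.startswith('Extracted from url'):
        return 'guess_by_url'
    elif guess_method.startswith('Extracted from tag'):
        match = re.search(r'\<(\w+)', guess_method)
        html_tag = match.group(1) if match is not None else 'unknown'
        return 'guess_by_tag_' + str(html_tag)
    else:
        return 'guess_by_unknown'

assert tag_for_guess_method('Extracted from tag: <meta name="date">') == 'guess_by_tag_meta'
assert tag_for_guess_method('Extracted from url') == 'guess_by_url'
assert tag_for_guess_method('manual') == 'guess_by_unknown'
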
Example #24
def get_media_id(db: DatabaseHandler, download: dict) -> int:
    """Convenience method to get the media_id for the download."""
    download = decode_object_from_bytes_if_needed(download)

    return db.query("""
        SELECT media_id
        FROM feeds
        WHERE feeds_id = %(feeds_id)s
    """, {'feeds_id': download['feeds_id']}).hash()['media_id']
Example #25
def get_unique_medium_name(db: DatabaseHandler, names: list) -> str:
    """Return the first name in the names list that does not yet exist for a media source, or None."""
    for name in names:
        name = name[0:MAX_NAME_LENGTH]
        name_exists = db.query("select 1 from media where lower(name) = lower(%(a)s)", {'a': name}).hash()
        if name_exists is None:
            return name

    raise McTopicMediaUniqueException("Unable to find unique name among names: " + str(names))
Example #26
def is_new(db: DatabaseHandler, story: dict) -> bool:
    """Return true if this story should be considered new for the given media source.

    This is used to determine whether to add a new story for a feed item URL.

    A story is new if no story with the same URL or GUID exists in the same media source and if no story exists with the
    same title in the same media source on the same calendar day.
    """

    story = decode_object_from_bytes_if_needed(story)

    if story['title'] == '(no title)':
        return False

    db_story = db.query("""
        SELECT *
        FROM stories
        WHERE guid = %(guid)s
          AND media_id = %(media_id)s
    """, {
        'guid': story['guid'],
        'media_id': story['media_id'],
    }).hash()
    if db_story:
        return False

    db_story = db.query("""
        SELECT 1
        FROM stories
        WHERE md5(title) = md5(%(title)s)
          AND media_id = %(media_id)s

          -- We do the goofy " + interval '1 second'" to force postgres to use the stories_title_hash index
          AND date_trunc('day', publish_date)  + interval '1 second'
            = date_trunc('day', %(publish_date)s::date) + interval '1 second'
    """, {
        'title': story['title'],
        'media_id': story['media_id'],
        'publish_date': story['publish_date'],
    }).hash()
    if db_story:
        return False

    return True
Example #27
def __get_merged_stories_ids(db: DatabaseHandler, stories_ids: List[int], n: int = 0) -> List[int]:
    """Gor a given set of stories, get all the stories that are source or target merged stories in
    topic_merged_stories_map. Repeat recursively up to 10 times, or until no new stories are found."""

    stories_ids = decode_object_from_bytes_if_needed(stories_ids)

    # "The crazy load was from a query to our topic_merged_stories_ids to get
    # url variants.  It looks like we have some case of many, many merged story
    # pairs that are causing that query to make postgres sit on a cpu for a
    # super long time.  There's no good reason to query for ridiculous numbers
    # of merged stories, so I just arbitrarily capped the number of merged story
    # pairs to 20 to prevent this query from running away in the future."
    max_stories = 20

    # MC_REWRITE_TO_PYTHON: cast strings to ints
    # noinspection PyTypeChecker
    stories_ids = [int(x) for x in stories_ids]

    if len(stories_ids) == 0:
        return []

    if len(stories_ids) >= max_stories:
        return stories_ids[0:max_stories - 1]

    # MC_REWRITE_TO_PYTHON: change to tuple parameter because Perl database handler proxy can't handle tuples
    stories_ids_list = ', '.join(str(x) for x in stories_ids)

    merged_stories_ids = db.query("""
        SELECT DISTINCT
            target_stories_id,
            source_stories_id
        FROM topic_merged_stories_map
        WHERE target_stories_id IN (%(stories_ids_list)s)
          OR source_stories_id IN (%(stories_ids_list)s)
        LIMIT %(max_stories)s
    """ % {
        'stories_ids_list': stories_ids_list,
        'max_stories': int(max_stories),
    }).flat()

    # MC_REWRITE_TO_PYTHON: Perl database handler proxy (the dreaded "wantarray" part) returns None on empty result
    # sets, a scalar on a single item and arrayref on many items
    if merged_stories_ids is None:
        merged_stories_ids = []
    elif isinstance(merged_stories_ids, int):
        merged_stories_ids = [merged_stories_ids]

    merged_stories_ids = [int(x) for x in merged_stories_ids]

    all_stories_ids = list(set(stories_ids + merged_stories_ids))

    if n > 10 or len(stories_ids) == len(all_stories_ids) or len(stories_ids) >= max_stories:
        return all_stories_ids

    else:
        return __get_merged_stories_ids(db=db, stories_ids=all_stories_ids, n=n + 1)
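
# A rough pure-Python sketch of the recursive merged-story expansion above, with an in-memory list of
# (source_stories_id, target_stories_id) pairs standing in for the topic_merged_stories_map table.
from typing import List

SAMPLE_MERGE_PAIRS = [(1, 2), (2, 3), (4, 5)]  # hypothetical merge rows

def sample_merged_stories_ids(stories_ids: List[int], n: int = 0) -> List[int]:
    if not stories_ids:
        return []

    # collect every story that appears as source or target of a merge involving the input set
    found = set(stories_ids)
    for source_id, target_id in SAMPLE_MERGE_PAIRS:
        if source_id in stories_ids or target_id in stories_ids:
            found.update((source_id, target_id))

    all_stories_ids = sorted(found)

    # stop after 10 rounds or when no new stories turn up (the real code also caps the list at 20 stories)
    if n > 10 or len(all_stories_ids) == len(stories_ids):
        return all_stories_ids

    return sample_merged_stories_ids(all_stories_ids, n + 1)

assert sample_merged_stories_ids([1]) == [1, 2, 3]
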
Example #28
def add_to_topic_stories(
        db: DatabaseHandler,
        story: dict,
        topic: dict,
        link_mined: bool = False,
        valid_foreign_rss_story: bool = False,
        iteration: int = None) -> None:
    """Add story to topic_stories table.

    Query topic_stories and topic_links to find the linking story with the smallest iteration and use
    that iteration + 1 for the new topic_stories row.
    """
    if iteration is None:
        source_story = db.query(
            """
            select ts.*
                from topic_stories ts
                    join topic_links tl on ( ts.stories_id = tl.stories_id and ts.topics_id = tl.topics_id )
                where
                    tl.ref_stories_id = %(a)s and
                    tl.topics_id = %(b)s
                order by ts.iteration asc
                limit 1
            """,
            {'a': story['stories_id'], 'b': topic['topics_id']}).hash()

        iteration = (source_story['iteration'] + 1) if source_story else 0

    db.query(
        """
        insert into topic_stories
            ( topics_id, stories_id, iteration, redirect_url, link_mined, valid_foreign_rss_story )
            values ( %(a)s, %(b)s, %(c)s, %(d)s, %(e)s, %(f)s )
            on conflict do nothing
        """,
        {
            'a': topic['topics_id'],
            'b': story['stories_id'],
            'c': iteration,
            'd': story['url'],
            'e': link_mined,
            'f': valid_foreign_rss_story
        })
Example #29
def delete_user(db: DatabaseHandler, email: str) -> None:
    """Delete user."""

    email = decode_object_from_bytes_if_needed(email)

    if not email:
        raise McAuthProfileException('Email address is empty.')

    # Check if user exists
    try:
        user_info(db=db, email=email)
    except Exception:
        raise McAuthProfileException("User with email address '%s' does not exist." % email)

    # Delete the user (PostgreSQL's relation will take care of 'auth_users_roles_map')
    db.query("""
        DELETE FROM auth_users
        WHERE email = %(email)s
    """, {'email': email})
def _is_not_topic_story(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Return True if the story is not in topic_stories for the given topic."""
    if 'stories_id' not in topic_fetch_url:
        return True

    ts = db.query(
        "select * from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
        {'a': topic_fetch_url['stories_id'], 'b': topic_fetch_url['topics_id']}).hash()

    return ts is None
def __count_null_title_stories(db: DatabaseHandler, topic: dict) -> int:
    """Count the stories in the topic with a null normalized_title_hash."""
    null_count = db.query(
        """
        select count(*)
            from stories s
                join topic_stories ts using ( stories_id )
            where
                ts.topics_id = %(a)s and
                s.normalized_title_hash is null
        """, {
            'a': topic['topics_id']
        }).flat()[0]

    return null_count
    def __try_storing_object_in_cache(self, db: DatabaseHandler, object_id: int, content: bytes) -> None:
        """Attempt to store object to cache, don't worry too much if it fails."""

        object_id = self._prepare_object_id(object_id)

        if content is None:
            raise McCachedAmazonS3StoreException("Content to store is None for object ID %d." % object_id)

        if isinstance(content, str):
            content = content.encode('utf-8')

        try:
            content = self._compress_data_for_method(data=content, compression_method=self.__cache_compression_method)

            sql = "INSERT INTO %s " % self.__cache_table  # interpolated by Python
            sql += "(object_id, raw_data) "
            sql += "VALUES (%(object_id)s, %(raw_data)s) "  # interpolated by psycopg2
            sql += "ON CONFLICT (object_id) DO UPDATE "
            sql += "    SET raw_data = EXCLUDED.raw_data"

            db.query(sql, {'object_id': object_id, 'raw_data': content})

        except Exception as ex:
            log.warning("Unable to cache object ID %d: %s" % (object_id, str(ex),))
Example #33
def _id_exists_in_db(db: DatabaseHandler, guid: str) -> bool:
    """Internal method to check if item exists in the database."""
    guid_exists = db.query(
        "select 1 from stories s join media m using (media_id) where m.name = %(b)s and s.guid = %(a)s",
        {
            'a': guid,
            'b': AP_MEDIUM_NAME
        }).hash()

    if guid_exists:
        log.debug(
            'Story with guid: {} is already in the database -- skipping story.'.format(guid)
        )
        return True
    return False
Example #34
    def content_exists(self, db: DatabaseHandler, object_id: int, object_path: str = None) -> bool:
        """Test if object exists in PostgreSQL table."""

        object_id = self._prepare_object_id(object_id)

        sql = "SELECT 1"
        sql += "FROM %s " % self.__table  # interpolated by Python
        sql += "WHERE object_id = %(object_id)s"  # interpolated by psycopg2

        object_exists = db.query(sql, {'object_id': object_id}).hash()

        if object_exists is not None and len(object_exists) > 0:
            return True
        else:
            return False
Example #35
def default_weekly_requested_items_limit(db: DatabaseHandler) -> int:
    """Get default weekly requested items limit."""

    limit = db.query("""
        SELECT column_default AS default_weekly_requested_items_limit
        FROM information_schema.columns
        WHERE (table_schema, table_name) = ('public', 'auth_user_limits')
          AND column_name = 'weekly_requested_items_limit'
    """).flat()

    if not limit:
        raise McAuthLimitsException(
            "Unable to fetch default weekly requested items limit.")

    return limit[0]
Example #36
def _delete_story_sentences(db: DatabaseHandler, story: dict) -> None:
    """Delete any existing stories for the given story and also update media_stats to adjust for the deletion."""
    story = decode_object_from_bytes_if_needed(story)

    num_deleted = db.query(
        """
        DELETE FROM story_sentences
        WHERE stories_id = %(stories_id)s
    """, {
            'stories_id': story['stories_id']
        }).rows()

    if num_deleted > 0:
        db.query(
            """
            UPDATE media_stats
            SET num_sentences = num_sentences - %(num_deleted)s
            WHERE media_id = %(media_id)s
              AND stat_date = %(publish_date)s::date
        """, {
                'num_deleted': num_deleted,
                'media_id': story['media_id'],
                'publish_date': story['publish_date'],
            })
def _validate_topic_posts(db: DatabaseHandler, topic: dict, mock_posts: list) -> None:
    """Validate that the topic_posts match the mock_posts."""
    got_posts = db.query(
        """
        select *
            from topic_posts tp
                join topic_post_days tpd using ( topic_post_days_id )
            where topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).hashes()

    assert len(got_posts) == len(mock_posts)

    mock_posts = sorted(mock_posts, key=lambda x: x['post_id'])

    for i, mock_post in enumerate(mock_posts):
        got_post = db.query(
            "select * from topic_posts where post_id = %(a)s::text",
            {'a': mock_post['post_id']}).hash()

        assert got_post

        for field in POST_FIELDS:
            assert str(got_post.get(field, None)) == str(mock_post.get(field, None))
Example #38
def default_max_topic_stories_limit(db: DatabaseHandler) -> int:
    """Get default max. topic stories limit."""

    # noinspection SqlResolve
    limit = db.query("""
        SELECT column_default AS default_weekly_requested_items_limit
        FROM information_schema.columns
        WHERE (table_schema, table_name) = ('public', 'auth_user_limits')
          AND column_name = 'max_topic_stories'
    """).flat()

    if not limit:
        raise McAuthLimitsException("Unable to fetch default max. topic stories limit.")

    return limit[0]
Example #39
def _get_extracted_text(db: DatabaseHandler, story: dict) -> str:
    """Return the concatenated download_texts associated with the story."""

    story = decode_object_from_bytes_if_needed(story)

    download_texts = db.query("""
        SELECT dt.download_text
        FROM downloads AS d,
             download_texts AS dt
        WHERE dt.downloads_id = d.downloads_id
          AND d.stories_id = %(stories_id)s
        ORDER BY d.downloads_id
    """, {'stories_id': story['stories_id']}).flat()

    return ".\n\n".join(download_texts)
Example #40
def _add_topic_tweet_single_day(
        db: DatabaseHandler,
        topic: dict,
        day: datetime.datetime,
        ch_class: typing.Type[AbstractCrimsonHexagon]) -> dict:
    """
    Add a row to topic_tweet_days if it does not already exist.  Fetch data for the new row from CH.

    Arguments:
    db - database handle
    topic - topic dict
    day - date to fetch eg '2017-12-30'
    ch_class - AbstractCrimsonHexagon class

    Return:
    the topic_tweet_day dict
    """
    # the perl-python layer was segfaulting until I added the str() around day below -hal
    topic_tweet_day = db.query(
        "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
        {'a': topic['topics_id'], 'b': str(day)}).hash()

    if topic_tweet_day is not None and topic_tweet_day['tweets_fetched']:
        raise McFetchTopicTweetDateFetchedException("tweets already fetched for day " + str(day))

    # if we have a ttd but had not finished fetching tweets, delete it and start over
    if topic_tweet_day is not None:
        db.delete_by_id('topic_tweet_days', topic_tweet_day['topic_tweet_days_id'])

    ch_posts = ch_class.fetch_posts(topic['ch_monitor_id'], day)

    tweet_count = ch_posts['totalPostsAvailable']

    num_ch_tweets = len(ch_posts['posts'])

    topic_tweet_day = db.create(
        'topic_tweet_days',
        {
            'topics_id': topic['topics_id'],
            'day': day,
            'tweet_count': tweet_count,
            'num_ch_tweets': num_ch_tweets,
            'tweets_fetched': False
        })

    topic_tweet_day['ch_posts'] = ch_posts

    return topic_tweet_day
Example #41
def _store_tweet_and_urls(db: DatabaseHandler, topic_tweet_day: dict,
                          meta_tweet: dict) -> None:
    """
    Store the tweet in topic_tweets and its urls in topic_tweet_urls, using the data in meta_tweet.

    Arguments:
    db - database handler
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    meta_tweet - meta_tweet dict

    Return:
    None
    """
    log.debug("remove nulls")
    _remove_json_tree_nulls(meta_tweet)

    log.debug("encode json")
    data_json = mediawords.util.parse_json.encode_json(meta_tweet)

    # null characters are not legal in json but for some reason get stuck in these tweets
    # data_json = data_json.replace('\x00', '')

    topic_tweet = {
        'topic_tweet_days_id': topic_tweet_day['topic_tweet_days_id'],
        'data': data_json,
        'content': meta_tweet['tweet']['text'],
        'tweet_id': meta_tweet['tweet_id'],
        'publish_date': meta_tweet['tweet']['created_at'],
        'twitter_user': meta_tweet['tweet']['user']['screen_name']
    }

    log.debug("insert topic tweet")
    topic_tweet = db.query(
        """
        insert into topic_tweets
            ( topic_tweet_days_id, data, content, tweet_id, publish_date, twitter_user )
            values
            ( %(topic_tweet_days_id)s, %(data)s, %(content)s, %(tweet_id)s, %(publish_date)s, %(twitter_user)s )
            returning *
        """, topic_tweet).hash()

    log.debug("get tweet urls")
    urls = mediawords.util.twitter.get_tweet_urls(meta_tweet['tweet'])
    log.debug("insert tweet urls")
    _insert_tweet_urls(db, topic_tweet, urls)

    log.debug("done")
Example #42
def get_extracted_html(db: DatabaseHandler, story: dict) -> str:
    """Get the extracted html for the story.

    We don't store the extracted html of a story, so we have to get the first download associated with the story
    and run the extractor on it.

    """
    download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': story['stories_id']
        }).hash()

    extractor_results = mediawords.dbi.downloads.extract(
        db, download, PyExtractorArguments(use_cache=True))
    return extractor_results['extracted_html']
Example #43
def create_test_story_stack_for_indexing(db: DatabaseHandler,
                                         data: dict) -> dict:
    data = decode_object_from_bytes_if_needed(data)

    story_stack = create_test_story_stack(db=db, data=data)

    media = add_content_to_test_story_stack(db=db, story_stack=story_stack)

    test_stories = db.query(
        "SELECT * FROM stories ORDER BY md5(stories_id::text)").hashes()

    # Add ancillary data so that it can be queried in Solr
    _add_story_tags_to_stories(db=db, stories=test_stories)
    _add_timespans_to_stories(db=db, stories=test_stories)

    return media
Example #44
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Find or create the 'Spider Feed' feed for the media source."""

    feed = db.query(
        "select * from feeds where media_id = %(a)s and name = %(b)s",
        {'a': medium['media_id'], 'b': SPIDER_FEED_NAME}).hash()

    if feed is not None:
        return feed

    return db.find_or_create('feeds', {
        'media_id': medium['media_id'],
        'url': medium['url'] + '#spiderfeed',
        'name': SPIDER_FEED_NAME,
        'feed_status': 'inactive'
    })
def _get_youtube_embed_links(db: DatabaseHandler, story: dict) -> List[str]:
    """Parse youtube embedded video urls out of the full html of the story.

    This function looks for youtube embed links anywhere in the html of the story content, rather than just in the
    extracted html.  It aims to return a superset of all youtube embed links by returning every iframe src= attribute
    that includes the string 'youtube'.

    Arguments:
    db - db handle
    story - story dict from db

    Returns:
    list of string urls

    """
    download = db.query(
        """
        SELECT *
        FROM downloads
        WHERE stories_id = %(stories_id)s
        ORDER BY stories_id
        LIMIT 1
    """, {
            'stories_id': story['stories_id'],
        }).hash()

    html = fetch_content(db, download)

    soup = BeautifulSoup(html, 'lxml')

    links = []
    for tag in soup.find_all('iframe', src=True):
        url = tag['src']

        if 'youtube' not in url:
            continue

        if not url.lower().startswith('http'):
            url = 'http:' + url

        url = url.strip()

        url = url.replace('youtube-embed', 'youtube')

        links.append(url)

    return links
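
# Standalone illustration of the iframe filtering above on a made-up HTML snippet.  It uses the built-in
# 'html.parser' instead of 'lxml' so that only BeautifulSoup itself is required.
from bs4 import BeautifulSoup

sample_html = """
<p>Story text</p>
<iframe src="//www.youtube-embed.com/embed/abc123"></iframe>
<iframe src="https://player.vimeo.com/video/1"></iframe>
"""

sample_links = []
for sample_tag in BeautifulSoup(sample_html, 'html.parser').find_all('iframe', src=True):
    sample_url = sample_tag['src']

    if 'youtube' not in sample_url:
        continue

    if not sample_url.lower().startswith('http'):
        sample_url = 'http:' + sample_url  # protocol-relative src

    sample_links.append(sample_url.strip().replace('youtube-embed', 'youtube'))

assert sample_links == ['http://www.youtube.com/embed/abc123']
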
Example #46
def _add_tag_to_snapshot_media(
        db: DatabaseHandler,
        topics_id: int,
        timespans_id: int,
        media: List[Dict[str, Any]],
        tag_set_name: str,
        field_name: str) -> None:
    """Add field to media based on tag association from the given tag set.
    
    Use snapshotted data from snap.media_tags_map.
    """
    partisan_tags = db.query(
        """
            WITH timespan_tags AS (
                SELECT
                    snap.media_tags_map.*
                FROM snap.media_tags_map
                    INNER JOIN timespans ON
                        snap.media_tags_map.topics_id = timespans.topics_id AND
                        snap.media_tags_map.snapshots_id = timespans.snapshots_id
                WHERE 
                    timespans.topics_id = %(topics_id)s AND
                    timespans.timespans_id = %(timespans_id)s
            )

            SELECT
                timespan_tags.*,
                tags.tag
            FROM timespan_tags
                INNER JOIN tags ON
                    timespan_tags.tags_id = tags.tags_id
                INNER JOIN tag_sets ON
                    tag_sets.tag_sets_id = tags.tag_sets_id
            WHERE 
                tag_sets.name = %(tag_set_name)s
        """,
        {
            'topics_id': topics_id,
            'timespans_id': timespans_id,
            'tag_set_name': tag_set_name,
        }
    ).hashes()

    partisan_map = {pt['media_id']: pt['tag'] for pt in partisan_tags}

    for medium in media:
        medium[field_name] = partisan_map.get(medium['media_id'], 'null')
Example #47
def _store_post_and_urls(db: DatabaseHandler, topic_post_day: dict,
                         post: dict) -> None:
    """
    Store the tweet in topic_posts and its urls in topic_post_urls, using the data in post.

    Arguments:
    db - database handler
    topic - topic dict
    topic_post_day - topic_post_day dict
    post - post dict

    Return:
    None
    """
    log.debug("remove nulls")
    _remove_json_tree_nulls(post)

    log.debug("encode json")
    data_json = encode_json(post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    # data_json = data_json.replace('\x00', '')

    data = {}

    for field in POST_FIELDS:
        data[field] = post.get(field, None)

    data['topic_post_days_id'] = topic_post_day['topic_post_days_id']
    data['data'] = data_json

    topic_post = db.query(
        "select * from topic_posts where topic_post_days_id = %(a)s and post_id = %(b)s::text",
        {
            'a': topic_post_day['topic_post_days_id'],
            'b': data['post_id']
        }).hash()

    if not topic_post:
        log.debug("insert topic post")
        topic_post = db.create('topic_posts', data)

    log.debug("insert tweet urls")
    _insert_post_urls(db, topic_post, post['urls'])

    log.debug("done")
Example #48
def default_role_ids(db: DatabaseHandler) -> List[int]:
    """List of role IDs to apply to new users."""
    default_roles = db.query("""
        SELECT auth_roles_id
        FROM auth_roles
        WHERE role = %(role)s
    """, {'role': UserRoles.tm_readonly()}).flat()
    if (not default_roles) or (not len(default_roles)):
        raise McRoleIDForRoleException('Unable to find default role IDs.')
    if default_roles is None:
        default_roles = []

    # MC_REWRITE_TO_PYTHON: if only a single item is to be returned, Perl doesn't bother to make it into a list
    if isinstance(default_roles, int):
        default_roles = [default_roles]

    return default_roles
Example #49
def extract_and_create_download_text(
        db: DatabaseHandler, download: dict,
        extractor_args: PyExtractorArguments) -> dict:
    """Extract the download and create a download_text from the extracted download."""
    download = decode_object_from_bytes_if_needed(download)

    downloads_id = download['downloads_id']

    log.debug("Extracting download {}...".format(downloads_id))
    extraction_result = extract(db=db,
                                download=download,
                                extractor_args=extractor_args)
    log.debug("Done extracting download {}.".format(downloads_id))

    download_text = None
    if extractor_args.use_existing():
        log.debug(
            "Fetching download text for download {}...".format(downloads_id))
        download_text = db.query(
            """
            SELECT *
            FROM download_texts
            WHERE downloads_id = %(downloads_id)s
        """, {
                'downloads_id': downloads_id
            }).hash()

    if download_text is None:
        log.debug(
            "Creating download text for download {}...".format(downloads_id))
        download_text = create(db=db,
                               download=download,
                               extract=extraction_result)

    assert 'stories_id' in download, "It is expected to have 'stories_id' set for a download at this point."

    if not extractor_args.no_tag_extractor_version():
        log.debug("Updating extractor version tag for story {}...".format(
            download['stories_id']))
        update_extractor_version_tag(
            db=db,
            stories_id=download['stories_id'],
            extractor_version=extraction_result['extractor_version'],
        )

    return download_text
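A hedged usage sketch: extract one download end to end. The pre-existing `db` handle, the `downloads_id`, and the PyExtractorArguments constructor flag shown are assumptions for illustration.

# Hypothetical usage sketch; `db`, `downloads_id` and the PyExtractorArguments flag are assumptions.
download = db.require_by_id('downloads', downloads_id)
extractor_args = PyExtractorArguments(use_existing=True)  # constructor signature assumed for illustration
download_text = extract_and_create_download_text(db=db, download=download, extractor_args=extractor_args)
log.info("Stored download text of length %d." % download_text['download_text_length'])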
Exemple #50
0
def _get_merged_iteration(db: DatabaseHandler, topic: dict, delete_story: dict, keep_story: dict) -> int:
    """Get the smaller iteration of two stories"""
    iterations = db.query(
        """
        select iteration
            from topic_stories
            where
                topics_id = %(a)s and
                stories_id in (%(b)s, %(c)s) and
                iteration is not null
        """,
        {'a': topic['topics_id'], 'b': delete_story['stories_id'], 'c': keep_story['stories_id']}).flat()

    if len(iterations) > 0:
        return min(iterations)
    else:
        return 0
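To make the rule concrete, a small hypothetical illustration of the "smaller iteration wins" behavior:

# Illustrative sketch (hypothetical data): with delete_story at iteration 3 and keep_story at
# iteration 1, the merged story keeps iteration 1; if neither row has a non-null iteration
# in topic_stories, the function falls back to 0.
merged_iteration = _get_merged_iteration(db, topic, delete_story, keep_story)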
Exemple #51
0
def _get_topic_stories_by_medium(db: DatabaseHandler, topic: dict) -> dict:
    """Return hash of { $media_id: stories } for the topic."""

    stories = db.query(
        """
        select s.stories_id, s.media_id, s.title, s.url, s.publish_date
            from snap.live_stories s
            where s.topics_id = %(a)s
        """,
        {'a': topic['topics_id']}).hashes()

    media_lookup = {}
    for s in stories:
        media_lookup.setdefault(s['media_id'], [])
        media_lookup[s['media_id']].append(s)

    return media_lookup
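A hypothetical usage sketch, assuming an existing `db` handle and `topic` dict: report how many live stories each medium contributes to the topic.

# Hypothetical usage sketch; `db` and `topic` are assumed to exist.
for media_id, medium_stories in _get_topic_stories_by_medium(db, topic).items():
    log.info("medium %d has %d topic stories" % (media_id, len(medium_stories)))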
Exemple #52
0
    def __try_retrieving_object_from_cache(self, db: DatabaseHandler, object_id: int) -> Union[bytes, None]:
        """Attempt to retrieve object from cache, don't worry too much if it fails."""

        object_id = self._prepare_object_id(object_id)

        try:
            sql = "SELECT raw_data "
            sql += "FROM %s " % self.__cache_table  # interpolated by Python
            sql += "WHERE object_id = %(object_id)s"  # interpolated by psycopg2

            content = db.query(sql, {'object_id': object_id}).hash()

            if content is None or len(content) == 0:
                raise McCachedAmazonS3StoreException("Object with ID %d was not found." % object_id)

            content = content['raw_data']

            # MC_REWRITE_TO_PYTHON: Perl database handler returns value as array of bytes
            if isinstance(content, list):
                content = b''.join(content)

            if isinstance(content, memoryview):
                content = content.tobytes()

            if not isinstance(content, bytes):
                raise McCachedAmazonS3StoreException("Content is not bytes for object %d." % object_id)

            try:
                content = self._uncompress_data_for_method(data=content,
                                                           compression_method=self.__cache_compression_method)
            except Exception as ex:
                raise McCachedAmazonS3StoreException(
                    "Unable to uncompress data for object ID %d: %s" % (object_id, str(ex),))

            if content is None:
                raise McCachedAmazonS3StoreException("Content is None after uncompression for object ID %d" % object_id)
            if not isinstance(content, bytes):
                raise McCachedAmazonS3StoreException(
                    "Content is not bytes after uncompression for object ID %d" % object_id)

        except Exception as ex:
            log.debug("Unable to retrieve object ID %d from cache: %s" % (object_id, str(ex),))
            return None

        else:
            return content
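A hypothetical sketch of a sibling method in the same store class: try the cache first and fall back to the slower backing store on a miss. `_fetch_from_backing_store` is a made-up helper name used only for illustration.

    # Hypothetical sketch of a sibling method in the same class; _fetch_from_backing_store is a
    # made-up helper name used only for illustration.
    def _fetch_object_with_cache_sketch(self, db: DatabaseHandler, object_id: int) -> bytes:
        content = self.__try_retrieving_object_from_cache(db=db, object_id=object_id)
        if content is None:
            content = self._fetch_from_backing_store(db=db, object_id=object_id)  # hypothetical helper
        return content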
Exemple #53
0
def _get_cached_extractor_results(db: DatabaseHandler,
                                  download: dict) -> typing.Optional[dict]:
    """Get extractor results from cache.

    Return:
    None if there is a miss or a dict in the form of extract_content() if there is a hit.
    """
    r = db.query(
        "select extracted_html, extracted_text from cached_extractor_results where downloads_id = %(a)s",
        {
            'a': download['downloads_id']
        }).hash()

    log.debug(
        "EXTRACTOR CACHE HIT" if r is not None else "EXTRACTOR CACHE MISS")

    return r
Exemple #54
0
def _get_extractor_results_cache(db: DatabaseHandler, download: dict) -> Optional[dict]:
    """Get extractor results from cache.

    Return:
    None if there is a miss or a dict in the form of extract_content() if there is a hit.
    """
    download = decode_object_from_bytes_if_needed(download)

    r = db.query("""
        SELECT extracted_html, extracted_text
        FROM cache.extractor_results_cache
        WHERE downloads_id = %(a)s
    """, {'a': download['downloads_id']}).hash()

    log.debug("EXTRACTOR CACHE HIT" if r is not None else "EXTRACTOR CACHE MISS")

    return r
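A hedged sketch of the intended cache-then-extract flow; `extract_content(...)` is the function the docstring refers to, but its exact signature and the `raw_html` input are assumptions here.

# Hypothetical sketch; extract_content()'s signature and raw_html are assumptions for illustration.
results = _get_extractor_results_cache(db, download)
if results is None:
    results = extract_content(raw_html)  # fall back to a full extraction on a cache miss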
Exemple #55
0
def _get_dup_story_groups(db: DatabaseHandler, topic: dict) -> list:
    """Return a list of duplicate story groups.

    Find all stories within a topic that have duplicate normalized titles within a given day and media_id.  Return a
    list of story lists.  Each story list is a list of stories that are duplicates of each other.
    """
    story_pairs = db.query(
        """
            SELECT
                a.stories_id AS stories_id_a,
                b.stories_id AS stories_id_b
            FROM
                snap.live_stories AS a,
                snap.live_stories AS b
            WHERE
                a.topics_id = %(topics_id)s AND
                a.topics_id = b.topics_id AND
                a.stories_id < b.stories_id AND
                a.media_id = b.media_id AND
                a.normalized_title_hash = b.normalized_title_hash AND
                date_trunc('day', a.publish_date) = date_trunc('day', b.publish_date)
            ORDER BY
                stories_id_a,
                stories_id_b
        """,
        {
            'topics_id': topic['topics_id'],
        }
    ).hashes()

    story_groups = {}
    ignore_stories = {}
    for story_pair in story_pairs:
        if story_pair['stories_id_b'] in ignore_stories:
            continue

        story_a = db.require_by_id('stories', story_pair['stories_id_a'])
        story_b = db.require_by_id('stories', story_pair['stories_id_b'])

        story_groups.setdefault(story_a['stories_id'], [story_a])
        story_groups[story_a['stories_id']].append(story_b)

        ignore_stories[story_b['stories_id']] = True

    return list(story_groups.values())
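A hypothetical usage sketch: iterate the duplicate groups and treat the first story of each group as the one to keep.

# Hypothetical usage sketch; `db` and `topic` are assumed to exist.
for dup_group in _get_dup_story_groups(db, topic):
    keep_story = dup_group[0]
    for delete_story in dup_group[1:]:
        log.info("story %d duplicates story %d" % (delete_story['stories_id'], keep_story['stories_id']))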
Exemple #56
0
def _update_media_normalized_urls(db: DatabaseHandler) -> None:
    """Keep normalized_url field in media table up to date.

    Set the normalized_url field of any row in media for which it is null.  Take care to lock the process
    so that only one process is doing this work at a time.
    """
    # put a lock on this because the process of generating all media urls will take a couple hours, and we don't
    # want all workers to do the work
    locked = False
    while not locked:
        if not _normalized_urls_out_of_date(db):
            return

        db.begin()

        # poll instead of block so that we can release the transaction and see whether someone else has already
        # updated all of the media
        locked = get_session_lock(
            db, 'MediaWords::TM::Media::media_normalized_urls', 1, wait=False)

        if not locked:
            db.commit()
            log.info("sleeping for media_normalized_urls lock...")
            time.sleep(1)

    log.warning("updating media_normalized_urls ...")

    media = db.query(
        "select * from media where normalized_url is null").hashes()

    i = 0
    total = len(media)
    for medium in media:
        i += 1
        normalized_url = mediawords.util.url.normalize_url_lossy(medium['url'])
        if normalized_url is None:
            normalized_url = medium['url']

        log.info("[%d/%d] adding %s (%s)" %
                 (i, total, medium['name'], normalized_url))

        db.update_by_id('media', medium['media_id'],
                        {'normalized_url': normalized_url})

    db.commit()
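A hypothetical usage sketch, assuming a `candidate_url` string: make sure normalized_url is populated, then look a medium up by its lossily normalized URL.

# Hypothetical usage sketch; `db` and candidate_url are assumed to exist.
_update_media_normalized_urls(db)
medium = db.query(
    "select * from media where normalized_url = %(a)s",
    {'a': mediawords.util.url.normalize_url_lossy(candidate_url)}).hash()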
Exemple #57
0
def all_user_roles(db: DatabaseHandler) -> List[dict]:
    """Fetch a list of available user roles."""
    # MC_REWRITE_TO_PYTHON: could return a set after Python rewrite
    roles = db.query("""
        SELECT auth_roles_id,
               role,
               description
        FROM auth_roles
        ORDER BY auth_roles_id
    """).hashes()
    if roles is None:
        roles = []

    # MC_REWRITE_TO_PYTHON: if only a single item is to be returned, Perl doesn't bother to make it into a list
    if isinstance(roles, dict):
        roles = [roles]

    return roles
Exemple #58
0
def get_unique_medium_url(db: DatabaseHandler, urls: list) -> str:
    """Return the first url in the list that does not yet exist for a media source, or None.

    If no unique urls are found, trying appending '#spider' to each of the urls.
    """
    spidered_urls = [u + URL_SPIDERED_SUFFIX for u in urls]
    urls = urls + spidered_urls

    for url in urls:
        url = url[0:MAX_URL_LENGTH]
        url_exists = db.query("select 1 from media where url = %(a)s", {
            'a': url
        }).hash()
        if url_exists is None:
            return url

    raise McTopicMediaUniqueException(
        "Unable to find unique url among urls: " + str(urls))
Exemple #59
0
def __story_extractor_tags(db: DatabaseHandler,
                           stories_id: int) -> List[Dict[str, Any]]:
    """Return the extractor version tags applied to the given story."""
    return db.query(
        """
        SELECT stories_tags_map.stories_id,
               tags.tag AS tags_name,
               tag_sets.name AS tag_sets_name
        FROM stories_tags_map
            INNER JOIN tags
                ON stories_tags_map.tags_id = tags.tags_id
            INNER JOIN tag_sets
                ON tags.tag_sets_id = tag_sets.tag_sets_id
        WHERE stories_tags_map.stories_id = %(stories_id)s
          AND tag_sets.name = %(tag_sets_name)s
    """, {
            'stories_id': stories_id,
            'tag_sets_name': extractor_version_tag_sets_name()
        }).hashes()
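A hypothetical sketch of a caller in the same module: check whether the story already carries an extractor version tag before re-tagging it.

# Hypothetical sketch; `db` and stories_id are assumed to exist.
existing_tags = __story_extractor_tags(db=db, stories_id=stories_id)
story_already_tagged = len(existing_tags) > 0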
Exemple #60
0
def ignore_redirect(db: DatabaseHandler, url: str,
                    redirect_url: Optional[str]) -> bool:
    """Return true if we should ignore redirects to the target media source.

    This is usually to avoid redirects to domain resellers for previously valid and important but now dead links."""
    if redirect_url is None or url == redirect_url:
        return False

    medium_url = generate_medium_url_and_name_from_url(redirect_url)[0]

    u = normalize_url_lossy(medium_url)

    match = db.query("select 1 from topic_ignore_redirects where url = %(a)s",
                     {
                         'a': u
                     }).hash()

    return match is not None
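A hypothetical usage sketch: decide which URL to keep for a fetched link, ignoring redirects into reseller domains.

# Hypothetical usage sketch; url and redirect_url are assumed to come from a fetch step.
final_url = url if ignore_redirect(db, url, redirect_url) else (redirect_url or url)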