Ejemplo n.º 1
0
def assign_date_guess_tag(
        db: DatabaseHandler,
        story: dict,
        date_guess: GuessDateResult,
        fallback_date: Optional[str]) -> None:
    """Tag the story with the way its date was (or could not be) guessed.

    The tag is picked as follows:

    * date_guess.found is true: a GUESS_METHOD_TAG_SET tag of 'guess_by_url', 'guess_by_tag_<html tag>' or
      'guess_by_unknown', depending on how date_guess.guess_method starts;
    * otherwise, if fallback_date is not None: the GUESS_METHOD_TAG_SET tag 'fallback_date';
    * otherwise: the INVALID_TAG tag of INVALID_TAG_SET.

    All existing stories_tags_map rows of the story are deleted before the picked tag is inserted.

    Arguments:
    db - db handle
    story - story dict from db; only story['stories_id'] is read
    date_guess - GuessDateResult from guess_date() call
    fallback_date - fallback date, or None; only compared against None here

    Returns:
    None

    """
    if not date_guess.found:
        if fallback_date is None:
            tag_set_name, tag_name = INVALID_TAG_SET, INVALID_TAG
        else:
            tag_set_name, tag_name = GUESS_METHOD_TAG_SET, 'fallback_date'
    else:
        tag_set_name = GUESS_METHOD_TAG_SET
        method = date_guess.guess_method
        if method.startswith('Extracted from url'):
            tag_name = 'guess_by_url'
        elif not method.startswith('Extracted from tag'):
            tag_name = 'guess_by_unknown'
        else:
            # The HTML tag name is the first run of word characters that follows a '<' in the method description
            found = re2.search(r'\<(\w+)', method)
            html_tag = 'unknown' if found is None else found.group(1)
            tag_name = 'guess_by_tag_' + str(html_tag)

    tag_set_row = db.find_or_create('tag_sets', {'name': tag_set_name})
    tag_row = db.find_or_create('tags', {'tag': tag_name, 'tag_sets_id': tag_set_row['tag_sets_id']})

    story_key = story['stories_id']
    tag_key = tag_row['tags_id']

    # NOTE(review): this removes every tag mapping of the story, not only date guess tags -- confirm that is intended
    db.query(
        """
        DELETE FROM stories_tags_map
        WHERE stories_id = %(stories_id)s
        """,
        dict(stories_id=story_key)
    )

    db.query("""
        INSERT INTO stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """, dict(stories_id=story_key, tags_id=tag_key))
Ejemplo n.º 2
0
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    The work is done in a transaction that is opened (and committed or rolled back) here, which is why the function
    refuses to run when the handle is already in a transaction.

    Arguments:
    db - db handle; must not be in a transaction
    story - story dict to insert; 'media_id' is read, and 'url' / 'guid' are used in log messages. If 'full_text_rss'
        is missing or None, it is filled in from the medium, and forced to False when the story has no description.
    feeds_id - ID of the feed that the created story gets mapped to
    skip_checking_if_new - if true, the is_new() check is skipped and the insert is always attempted

    Returns:
    Created story, or None if the story wasn't created (it is not new, or the insert violated the "stories_guid"
    unique constraint).

    Raises:
    McAddStoryException - if called from within a transaction, or if the insert fails for any other reason.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        # A description that is missing, None or empty all mean "no text in the feed item"; testing truthiness
        # instead of len() avoids a TypeError when the key is present but set to None.
        if not story.get('description'):
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None

        else:
            # Chain the original exception so that its traceback is not lost
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story))) from ex

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
Ejemplo n.º 3
0
def update_extractor_version_tag(db: DatabaseHandler, stories_id: int, extractor_version: str) -> None:
    """Replace the story's extractor version tag with the given extractor version.

    Existing mappings of the story to tags of the extractor version tag set are deleted first, then the story is
    mapped to the tag named after extractor_version in that tag set.
    """
    # FIXME no caching because unit tests run in the same process so a cached tag set / tag will not be recreated.
    # Purging such a cache manually is very error-prone.

    stories_id = int(decode_object_from_bytes_if_needed(stories_id) if isinstance(stories_id, bytes) else stories_id)
    extractor_version = decode_object_from_bytes_if_needed(extractor_version)

    version_tag_set = db.find_or_create(table='tag_sets', insert_hash={'name': extractor_version_tag_sets_name()})
    version_tag_sets_id = version_tag_set['tag_sets_id']

    # Remove whatever extractor version the story was tagged with before
    delete_params = dict(tag_sets_id=version_tag_sets_id, stories_id=stories_id)
    db.query("""
        DELETE FROM stories_tags_map AS stm
            USING tags AS t
                JOIN tag_sets AS ts
                    ON ts.tag_sets_id = t.tag_sets_id
        WHERE t.tags_id = stm.tags_id
          AND ts.tag_sets_id = %(tag_sets_id)s
          AND stm.stories_id = %(stories_id)s
    """, delete_params)

    version_tag = db.find_or_create(
        table='tags',
        insert_hash={'tag': extractor_version, 'tag_sets_id': version_tag_set['tag_sets_id']},
    )

    insert_params = dict(stories_id=stories_id, tags_id=version_tag['tags_id'])
    db.query("""
        INSERT INTO stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
    """, insert_params)
Ejemplo n.º 4
0
def update_extractor_version_tag(db: DatabaseHandler, story: dict) -> None:
    """Replace the story's extractor version tag with the one named after extractor_name().

    Existing mappings of the story to tags of the extractor version tag set are deleted first, then the story is
    mapped to the current extractor's tag in that tag set. Only story['stories_id'] is read from the story.
    """
    # FIXME no caching because unit tests run in the same process so a cached tag set / tag will not be recreated.
    # Purging such a cache manually is very error-prone.

    story = decode_object_from_bytes_if_needed(story)

    version_tag_set = db.find_or_create(table='tag_sets', insert_hash={'name': extractor_version_tag_sets_name()})
    version_tag_sets_id = version_tag_set['tag_sets_id']
    stories_id = story['stories_id']

    # Remove whatever extractor version the story was tagged with before
    delete_params = dict(tag_sets_id=version_tag_sets_id, stories_id=stories_id)
    db.query("""
        DELETE FROM stories_tags_map AS stm
            USING tags AS t
                JOIN tag_sets AS ts
                    ON ts.tag_sets_id = t.tag_sets_id
        WHERE t.tags_id = stm.tags_id
          AND ts.tag_sets_id = %(tag_sets_id)s
          AND stm.stories_id = %(stories_id)s
    """, delete_params)

    version_tag = db.find_or_create(
        table='tags',
        insert_hash={'tag': extractor_name(), 'tag_sets_id': version_tag_set['tag_sets_id']},
    )

    insert_params = dict(stories_id=story['stories_id'], tags_id=version_tag['tags_id'])
    db.query("""
        INSERT INTO stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
    """, insert_params)
Ejemplo n.º 5
0
def guess_medium(db: DatabaseHandler, story_url: str) -> dict:
    """Guess the media source for a story with the given url.

    The guess is based on a normalized version of the host part of the url.  The guess takes into account the
    duplicate media relationships included in the postgres database through the media.dup_media_id fields.  If
    no appropriate media source exists, this function will create a new one and return it.

    Arguments:
    db - db handle
    story_url - url of the story to find or create a medium for

    Returns:
    The medium returned by lookup_medium(), or else a medium found or created here and tagged with the spidered tag.

    Raises:
    McTopicMediaUniqueException - if no medium could be found or created within _GUESS_MEDIUM_RETRIES attempts.

    """
    (medium_url,
     medium_name) = generate_medium_url_and_name_from_url(story_url)

    medium = lookup_medium(db, medium_url, medium_name)

    if medium is not None:
        return medium

    normalized_medium_url = _normalize_url(medium_url)
    normalized_story_url = _normalize_url(story_url)
    all_urls = [
        normalized_medium_url, medium_url, normalized_story_url, story_url
    ]

    # avoid conflicts with existing media names and urls that are missed
    # by the above query b/c of dups feeds or foreign_rss_links
    medium_name = get_unique_medium_name(db, [medium_name] + all_urls)
    medium_url = get_unique_medium_url(db, all_urls)

    # a race condition with another thread can cause this to fail sometimes, but after the medium in the
    # other process has been created, all should be fine
    for attempt in range(_GUESS_MEDIUM_RETRIES):
        medium_data = {
            'name': medium_name,
            'url': medium_url,
            'normalized_url': normalized_medium_url
        }
        medium = db.find_or_create('media', medium_data)

        if medium is not None:
            break

        # pause between attempts only; sleeping after the last attempt would just delay the exception below
        if attempt + 1 < _GUESS_MEDIUM_RETRIES:
            time.sleep(1)

    if medium is None:
        raise McTopicMediaUniqueException(
            "Unable to find or create medium for %s / %s" %
            (medium_name, medium_url))

    log.info("add medium: %s / %s / %d" %
             (medium_name, medium_url, medium['media_id']))

    spidered_tag = get_spidered_tag(db)

    db.find_or_create('media_tags_map', {
        'media_id': medium['media_id'],
        'tags_id': spidered_tag['tags_id']
    })

    return medium
Ejemplo n.º 6
0
def assign_date_guess_tag(db: DatabaseHandler, story: dict,
                          date_guess: GuessDateResult,
                          fallback_date: typing.Optional[str]) -> None:
    """Tag the story with the way its date was (or could not be) guessed.

    The tag is picked as follows:

    * date_guess.found is true: a GUESS_METHOD_TAG_SET tag of 'guess_by_url', 'guess_by_tag_<html tag>' or
      'guess_by_unknown', depending on how date_guess.guess_method starts;
    * otherwise, if fallback_date is not None: the GUESS_METHOD_TAG_SET tag 'fallback_date';
    * otherwise: the INVALID_TAG tag of INVALID_TAG_SET.

    All existing stories_tags_map rows of the story are deleted before the picked tag is inserted.

    Arguments:
    db - db handle
    story - story dict from db; only story['stories_id'] is read
    date_guess - GuessDateResult from guess_date() call
    fallback_date - fallback date, or None; only compared against None here

    Returns:
    None

    """
    # Every branch below reads its constants from this module
    guess_date_module = mediawords.tm.guess_date

    if not date_guess.found:
        if fallback_date is None:
            tag_set_name = guess_date_module.INVALID_TAG_SET
            tag_name = guess_date_module.INVALID_TAG
        else:
            tag_set_name = guess_date_module.GUESS_METHOD_TAG_SET
            tag_name = 'fallback_date'
    else:
        tag_set_name = guess_date_module.GUESS_METHOD_TAG_SET
        method = date_guess.guess_method
        if method.startswith('Extracted from url'):
            tag_name = 'guess_by_url'
        elif not method.startswith('Extracted from tag'):
            tag_name = 'guess_by_unknown'
        else:
            # The HTML tag name is the first run of word characters that follows a '<' in the method description
            found = re2.search(r'\<(\w+)', method)
            html_tag = 'unknown' if found is None else found.group(1)
            tag_name = 'guess_by_tag_' + str(html_tag)

    tag_set_row = db.find_or_create('tag_sets', {'name': tag_set_name})
    tag_row = db.find_or_create('tags', dict(tag=tag_name, tag_sets_id=tag_set_row['tag_sets_id']))

    # NOTE(review): this removes every tag mapping of the story, not only date guess tags -- confirm that is intended
    delete_params = {'a': story['stories_id']}
    db.query("delete from stories_tags_map where stories_id = %(a)s", delete_params)

    insert_params = {'a': story['stories_id'], 'b': tag_row['tags_id']}
    db.query("insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)", insert_params)
Ejemplo n.º 7
0
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Return the medium's feed named SPIDER_FEED_NAME, falling back to db.find_or_create() if there is none.

    The fallback feed is inactive and uses the medium's url with '#spiderfeed' appended as its url.
    """

    media_id = medium['media_id']

    lookup = db.query(
        """
            SELECT *
            FROM feeds
            WHERE
                media_id = %(media_id)s AND
                name = %(name)s
        """,
        dict(media_id=media_id, name=SPIDER_FEED_NAME)
    )
    existing_feed = lookup.hash()

    if existing_feed is None:
        new_feed = {
            'media_id': media_id,
            'url': medium['url'] + '#spiderfeed',
            'name': SPIDER_FEED_NAME,
            'active': False,
        }
        return db.find_or_create('feeds', new_feed)

    return existing_feed
Ejemplo n.º 8
0
def guess_medium(db: DatabaseHandler, story_url: str) -> dict:
    """Guess the media source for a story with the given url.

    The guess is based on a normalized version of the host part of the url.  The guess takes into account the
    duplicate media relationships included in the postgres database through the media.dup_media_id fields.  If
    no appropriate media source exists, this function will create a new one and return it.

    Arguments:
    db - db handle
    story_url - url of the story to find or create a medium for

    Returns:
    The medium returned by lookup_medium(), or else a medium found or created here and tagged with the spidered tag.

    Raises:
    McTopicMediaUniqueException - if no medium could be found or created within _GUESS_MEDIUM_RETRIES attempts.

    """
    (medium_url, medium_name) = generate_medium_url_and_name_from_url(story_url)

    medium = lookup_medium(db, medium_url, medium_name)

    if medium is not None:
        return medium

    normalized_medium_url = _normalize_url(medium_url)
    normalized_story_url = _normalize_url(story_url)
    all_urls = [normalized_medium_url, medium_url, normalized_story_url, story_url]

    # avoid conflicts with existing media names and urls that are missed
    # by the above query b/c of dups feeds or foreign_rss_links
    medium_name = get_unique_medium_name(db, [medium_name] + all_urls)
    medium_url = get_unique_medium_url(db, all_urls)

    # a race condition with another thread can cause this to fail sometimes, but after the medium in the
    # other process has been created, all should be fine
    for attempt in range(_GUESS_MEDIUM_RETRIES):
        medium_data = {'name': medium_name, 'url': medium_url, 'normalized_url': normalized_medium_url}
        medium = db.find_or_create('media', medium_data)

        if medium is not None:
            break

        # pause between attempts only; sleeping after the last attempt would just delay the exception below
        if attempt + 1 < _GUESS_MEDIUM_RETRIES:
            time.sleep(1)

    if medium is None:
        raise McTopicMediaUniqueException(
            "Unable to find or create medium for %s / %s" % (medium_name, medium_url))

    log.info("add medium: %s / %s / %d" % (medium_name, medium_url, medium['media_id']))

    spidered_tag = get_spidered_tag(db)

    db.find_or_create('media_tags_map', {'media_id': medium['media_id'], 'tags_id': spidered_tag['tags_id']})

    return medium
Ejemplo n.º 9
0
def assign_date_guess_tag(
        db: DatabaseHandler,
        story: dict,
        date_guess: GuessDateResult,
        fallback_date: typing.Optional[str]) -> None:
    """Record on the story, as a tag, which method produced its date.

    If date_guess found a result, the tag is date_guess_method:guess_by_url, guess_by_tag_* or guess_by_unknown,
    chosen from the prefix of date_guess.guess_method.  Otherwise, if there is a fallback_date, the tag is
    date_guess_method:fallback_date.  Otherwise the tag is date_invalid:date_invalid.

    The story's existing rows in stories_tags_map are all deleted before the new tag is inserted.

    Arguments:
    db - db handle
    story - story dict from db; only story['stories_id'] is read
    date_guess - GuessDateResult from guess_date() call
    fallback_date - fallback date, or None; only compared against None here

    Returns:
    None

    """
    if date_guess.found:
        tag_set = mediawords.tm.guess_date.GUESS_METHOD_TAG_SET
        description = date_guess.guess_method
        # The two recognized prefixes exclude each other, so the order in which they are tested does not matter
        if description.startswith('Extracted from tag'):
            tag_match = re2.search(r'\<(\w+)', description)
            if tag_match is None:
                html_tag = 'unknown'
            else:
                html_tag = tag_match.group(1)
            tag = 'guess_by_tag_' + str(html_tag)
        elif description.startswith('Extracted from url'):
            tag = 'guess_by_url'
        else:
            tag = 'guess_by_unknown'
    elif fallback_date is None:
        tag_set = mediawords.tm.guess_date.INVALID_TAG_SET
        tag = mediawords.tm.guess_date.INVALID_TAG
    else:
        tag_set = mediawords.tm.guess_date.GUESS_METHOD_TAG_SET
        tag = 'fallback_date'

    tag_sets_id = db.find_or_create('tag_sets', {'name': tag_set})['tag_sets_id']
    tag_row = db.find_or_create('tags', {'tag': tag, 'tag_sets_id': tag_sets_id})

    # NOTE(review): this removes every tag mapping of the story, not only date guess tags -- confirm that is intended
    db.query(
        "delete from stories_tags_map where stories_id = %(a)s",
        dict(a=story['stories_id']))

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        dict(a=story['stories_id'], b=tag_row['tags_id']))
Ejemplo n.º 10
0
def get_spidered_tag(db: DatabaseHandler) -> dict:
    """Return the tag named SPIDERED_TAG_TAG from the tag set named SPIDERED_TAG_SET.

    If the lookup query finds no such tag, the tag set and the tag are obtained through db.find_or_create().
    """
    lookup = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
            where
                t.tag = %(a)s and
                ts.name = %(b)s
        """,
        dict(a=SPIDERED_TAG_TAG, b=SPIDERED_TAG_SET))
    existing_tag = lookup.hash()

    if existing_tag is not None:
        return existing_tag

    tag_sets_id = db.find_or_create('tag_sets', {'name': SPIDERED_TAG_SET})['tag_sets_id']
    return db.find_or_create('tags', {'tag': SPIDERED_TAG_TAG, 'tag_sets_id': tag_sets_id})
Ejemplo n.º 11
0
def _import_ap_story(db: DatabaseHandler, ap_story: dict) -> None:
    """Add an AP story, as returned by get_new_stories(), to the database and queue it for extraction.

    The story is attached to the medium named AP_MEDIUM_NAME through that medium's 'API Feed' feed.  If add_story()
    does not return a story, nothing else is done; otherwise a download and its download text are stored for the
    story and an extraction job is queued.
    """
    medium_row = db.query(
        """
        SELECT *
        FROM media
        WHERE name = %(medium_name)s
    """, dict(medium_name=AP_MEDIUM_NAME)).hash()
    media_id = medium_row['media_id']

    feed_row = db.find_or_create('feeds', {
        'media_id': media_id,
        'name': 'API Feed',
        'active': False,
        'type': 'syndicated',
        'url': 'http://ap.com',
    })

    new_story = add_story(
        db,
        {
            'guid': ap_story['guid'],
            'url': ap_story['url'],
            'publish_date': ap_story['publish_date'],
            'title': ap_story['title'],
            'description': ap_story['description'],
            'media_id': media_id,
        },
        feed_row['feeds_id'],
    )

    # add_story() returned nothing, so there is no story to attach a download to
    if not new_story:
        return

    download_row = create_download_for_new_story(db, new_story, feed_row)

    # The AP story already carries its text, so store it directly as the download text
    text_params = {'downloads_id': download_row['downloads_id']}
    text_params['download_text'] = ap_story['text']
    text_params['download_text_length'] = len(ap_story['text'])

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, %(download_text_length)s)
        """,
        text_params,
    )

    # Send to the extractor for it to do vectorization, language detection, etc.
    extract_queue = JobBroker(queue_name='MediaWords::Job::ExtractAndVector')
    extract_queue.add_to_queue(stories_id=new_story['stories_id'], use_existing=True)
Ejemplo n.º 12
0
def get_spidered_tag(db: DatabaseHandler) -> dict:
    """Look up the SPIDERED_TAG_SET:SPIDERED_TAG_TAG tag and return it as a dict.

    When the lookup query returns nothing, the tag set and then the tag are obtained through db.find_or_create().
    """
    query_params = {}
    query_params['a'] = SPIDERED_TAG_TAG
    query_params['b'] = SPIDERED_TAG_SET

    tag_row = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
            where
                t.tag = %(a)s and
                ts.name = %(b)s
        """,
        query_params,
    ).hash()

    if tag_row is not None:
        return tag_row

    tag_set_row = db.find_or_create('tag_sets', dict(name=SPIDERED_TAG_SET))
    return db.find_or_create('tags', dict(tag=SPIDERED_TAG_TAG, tag_sets_id=tag_set_row['tag_sets_id']))
Ejemplo n.º 13
0
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Return the medium's feed named SPIDER_FEED_NAME, falling back to db.find_or_create() if there is none.

    The fallback feed has feed_status 'inactive' and uses the medium's url with '#spiderfeed' appended as its url.
    """

    media_id = medium['media_id']

    lookup = db.query(
        "select * from feeds where media_id = %(a)s and name = %(b)s",
        dict(a=media_id, b=SPIDER_FEED_NAME))
    existing_feed = lookup.hash()

    if existing_feed is None:
        new_feed = {
            'media_id': media_id,
            'url': medium['url'] + '#spiderfeed',
            'name': SPIDER_FEED_NAME,
            'feed_status': 'inactive',
        }
        return db.find_or_create('feeds', new_feed)

    return existing_feed
Ejemplo n.º 14
0
def get_spider_feed(db: DatabaseHandler, medium: dict) -> dict:
    """Fetch the feed named SPIDER_FEED_NAME that belongs to the medium, or have db.find_or_create() provide it.

    The feed passed to db.find_or_create() is not active and its url is the medium's url plus '#spiderfeed'.
    """

    lookup_params = {}
    lookup_params['a'] = medium['media_id']
    lookup_params['b'] = SPIDER_FEED_NAME

    spider_feed = db.query("select * from feeds where media_id = %(a)s and name = %(b)s", lookup_params).hash()

    if spider_feed is None:
        spider_feed = db.find_or_create(
            'feeds',
            dict(
                media_id=medium['media_id'],
                url=medium['url'] + '#spiderfeed',
                name=SPIDER_FEED_NAME,
                active=False,
            ),
        )

    return spider_feed
Ejemplo n.º 15
0
def add_story(db: DatabaseHandler,
              story: dict,
              feeds_id: int,
              skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    The work is done in a transaction that is opened (and committed or rolled back) here, which is why the function
    refuses to run when the handle is already in a transaction.

    Arguments:
    db - db handle; must not be in a transaction
    story - story dict to insert; 'media_id' is read, and 'url' / 'guid' are used in log messages. If 'full_text_rss'
        is missing or None, it is filled in from the medium, and forced to False when the story has no description.
    feeds_id - ID of the feed that the created story gets mapped to
    skip_checking_if_new - if true, the is_new() check is skipped and the insert is always attempted

    Returns:
    Created story, or None if the story wasn't created (it is not new, or the insert violated the "stories_guid"
    unique constraint).

    Raises:
    McAddStoryException - if called from within a transaction, or if the insert fails for any other reason.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(
            skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException(
            "add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        # A description that is missing, None or empty all mean "no text in the feed item"; testing truthiness
        # instead of len() avoids a TypeError when the key is present but set to None.
        if not story.get('description'):
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".
                format(story['url'], story['guid']))
            return None

        else:
            # Chain the original exception so that its traceback is not lost
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story))) from ex

    db.find_or_create(table='feeds_stories_map',
                      insert_hash={
                          'stories_id': story['stories_id'],
                          'feeds_id': feeds_id,
                      })

    db.commit()

    return story