Example 1
def test_get_spidered_tag():
    db = connect_to_db()

    tag = get_spidered_tag(db)

    assert tag['tag'] == SPIDERED_TAG_TAG

    tag_set = db.require_by_id('tag_sets', tag['tag_sets_id'])
    assert tag_set['name'] == SPIDERED_TAG_SET

    # a repeated call should return the same tag: the helper is get-or-create
    assert get_spidered_tag(db)['tags_id'] == tag['tags_id']
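
The assertions above pin down get-or-create semantics: get_spidered_tag() returns a tag named SPIDERED_TAG_TAG inside the SPIDERED_TAG_SET tag set, and repeated calls return the same row. A minimal sketch of a helper with that behavior, assuming the handler exposes a find_or_create(table, row) method (the sketch name and the exact call are assumptions, not the verified Media Cloud implementation):

def get_spidered_tag_sketch(db) -> dict:
    # Sketch only: find_or_create() is assumed to insert the row when it is
    # missing and return the existing row otherwise, making the call idempotent.
    tag_set = db.find_or_create('tag_sets', {'name': SPIDERED_TAG_SET})
    return db.find_or_create('tags', {
        'tag': SPIDERED_TAG_TAG,
        'tag_sets_id': tag_set['tag_sets_id'],
    })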
Example 2
def test_guess_medium() -> None:
    """Test guess_medium()."""
    db = connect_to_db()

    num_media = 5
    for i in range(num_media):
        create_test_medium(db, str(i))

    # the default test media do not have unique domains
    # noinspection SqlWithoutWhere
    db.query("update media set url = 'http://media-' || media_id ||'.com'")

    # dummy guess_medium call to assign normalized_urls
    guess_medium(db, 'foo')

    media = db.query("select * from media order by media_id").hashes()

    # basic lookup of existing media
    assert guess_medium(db, media[0]['url']) == media[0]
    assert guess_medium(db, media[1]['url'] + '/foo/bar/') == media[1]
    assert guess_medium(db, media[2]['url'] + URL_SPIDERED_SUFFIX) == media[2]

    # create a new medium
    new_medium_story_url = 'http://new-medium.com/with/path'
    new_medium = guess_medium(db, new_medium_story_url)
    assert new_medium['name'] == 'new-medium.com'
    assert new_medium['url'] == 'http://new-medium.com/'

    spidered_tag = get_spidered_tag(db)
    # the new medium should have been tagged as spidered
    spidered_mtm = db.query(
        "select * from media_tags_map where tags_id = %(a)s and media_id = %(b)s",
        {'a': spidered_tag['tags_id'], 'b': new_medium['media_id']}).hash()
    assert spidered_mtm is not None

    # look up the same medium via some url variants
    new_medium_url_variants = [
        'http://new-medium.com/with/another/path',
        'http://www.new-medium.com/',
        'http://new-medium.com/with/path#andanchor'
    ]

    for url in new_medium_url_variants:
        assert guess_medium(db, url)['media_id'] == new_medium['media_id']

    # set foreign_rss_links to true to make guess_medium create another new medium
    db.query("update media set foreign_rss_links = 't' where media_id = %(a)s", {'a': new_medium['media_id']})

    another_new_medium = guess_medium(db, new_medium_story_url)
    assert another_new_medium['media_id'] > new_medium['media_id']
    assert another_new_medium['url'] == new_medium_story_url
    assert another_new_medium['name'] == 'http://new-medium.com/'

    # now try finding a dup
    db.query(
        "update media set dup_media_id = %(a)s where media_id = %(b)s",
        {'a': media[0]['media_id'], 'b': media[1]['media_id']})

    assert guess_medium(db, media[1]['url'])['media_id'] == media[0]['media_id']
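
The variant lookups above succeed because guess_medium() matches on a normalized form of the URL rather than the raw string. A sketch of that normalization idea (the function name is illustrative, and the URL_SPIDERED_SUFFIX value is an assumption rather than the actual Media Cloud constant):

from urllib.parse import urlparse

URL_SPIDERED_SUFFIX = '#spider'  # assumed value, for illustration only

def normalize_medium_url_sketch(url: str) -> str:
    """Reduce a story URL to a canonical medium URL (illustrative only)."""
    # Dropping the anchor also removes the spider suffix appended to spidered URLs.
    url = url.split('#', 1)[0]
    parsed = urlparse(url)
    host = parsed.netloc.lower()
    # Strip a leading "www." so http://www.new-medium.com/ and
    # http://new-medium.com/with/path resolve to the same medium.
    if host.startswith('www.'):
        host = host[4:]
    return f'{parsed.scheme}://{host}/'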
Example 3
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: Optional[str] = None,
                   publish_date: Optional[str] = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - story title; parsed from the content if not given
    publish_date - story publish date; guessed from the url and content if not given
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    story = add_story(db, story, feed['feeds_id'])

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        _extract_story(story)

    return story
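
A typical call, assuming a connected handler as in the tests above; the URL, content, and fallback date are made-up values:

db = connect_to_db()

story = generate_story(
    db=db,
    url='http://new-medium.com/with/path',  # hypothetical story URL
    content='<html><head><title>Example story</title></head><body>...</body></html>',
    fallback_date='2023-01-01',  # used only if the date guesser finds nothing
)

print(story['stories_id'], story['media_id'], story['publish_date'])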
Example 4
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: Optional[str] = None,
                   publish_date: Optional[str] = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - story title; parsed from the content if not given
    publish_date - story publish date; guessed from the url and content if not given
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    log.debug(f"Generating story from URL {url}...")

    url = url[0:MAX_URL_LENGTH]

    log.debug(f"Guessing medium for URL {url}...")
    medium = guess_medium(db, url)
    log.debug(f"Done guessing medium for URL {url}: {medium}")

    log.debug(f"Getting spider feed for medium {medium}...")
    feed = get_spider_feed(db, medium)
    log.debug(f"Done getting spider feed for medium {medium}: {feed}")

    log.debug(f"Getting spidered tag...")
    spidered_tag = get_spidered_tag(db)
    log.debug(f"Done getting spidered tag: {spidered_tag}")

    if title is None:
        log.debug(f"Parsing HTML title...")
        title = html_title(content, url, MAX_TITLE_LENGTH)
        log.debug(f"Done parsing HTML title: {title}")

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        log.debug(f"Guessing date for URL {url}...")
        date_guess = guess_date(url, content)
        log.debug(f"Done guessing date for URL {url}: {date_guess}")

        story['publish_date'] = date_guess.date if date_guess.found else None
    else:
        story['publish_date'] = publish_date

    log.debug(f"Adding story {story}...")
    story = add_story(db, story, feed['feeds_id'])
    log.debug(f"Done adding story {story}")

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        log.debug(f"Assigning date guess tag...")
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        log.debug("Story is new, creating download...")
        download = create_download_for_new_story(db, story, feed)

        log.debug("Storing story content...")
        store_and_verify_content(db, download, content)

        log.debug("Extracting story...")
        _extract_story(db, story)
        log.debug("Done extracting story")

    else:
        log.debug("Story is not new, skipping download storage and extraction")

    log.debug(f"Done generating story from URL {url}")

    return story
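
A note on the stories_tags_map insert used by both versions above: the insert ... where not exists guard keeps the tagging idempotent when story generation is retried. If the table has a unique key on (stories_id, tags_id), PostgreSQL's on conflict do nothing would achieve the same effect more directly; a sketch:

db.query(
    """
    insert into stories_tags_map (stories_id, tags_id)
    values (%(a)s, %(b)s)
    on conflict (stories_id, tags_id) do nothing
    """,
    {'a': story['stories_id'], 'b': spidered_tag['tags_id']})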