def test_guess_date():
    """Test guess_date(): argument validation, a dateable page, and an undateable one."""

    # BUG FIX: the three invalid-argument calls previously shared a single
    # pytest.raises() block. pytest.raises() succeeds as soon as the first
    # statement inside it raises, so the remaining statements never ran and the
    # 2nd and 3rd cases were silently untested. Each case needs its own block.
    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url=None, html=None)

    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url="https://www.nytimes.com/2017/10/some_news.html",
                   html=None)

    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url=None, html="Something")

    # Found: a news-looking URL with an explicit published date in <meta />
    result = guess_date(url="https://www.nytimes.com/2017/10/some_news.html",
                        html="""
            <html><head>
            <meta property="article:published" itemprop="datePublished" content="2017-10-13T04:56:54-04:00" />
            </head></html>
        """)
    assert result.found is True
    assert result.guess_method.startswith('Extracted from')
    assert result.timestamp == 1507885014
    assert result.date == '2017-10-13T08:56:54'

    # Not found (undateable, even though the date is there in <meta />):
    # Wikipedia articles are treated as undateable regardless of markup.
    result = guess_date(url="https://en.wikipedia.org/wiki/Progressive_tax",
                        html="""
            <html><head>
            <meta property="article:published" itemprop="datePublished" content="2017-10-13T04:56:54-04:00" />
            </head></html>
        """)
    assert result.found is False
    assert result.guess_method is None
    assert result.timestamp is None
    assert result.date is None
# Esempio n. 2 ("Example no. 2" — scrape artifact from the source site; the
# stray "0" was its vote count. Commented out so the module parses.)
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Insert a new story, guessing any metadata not supplied by the caller.

    The medium, feed, title and publish date are all derived from the URL and
    the raw content. If inserting the story trips the unique constraint on
    (media_id, url), the already-existing story is returned instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - optional story title; parsed from the content when omitted
    publish_date - optional publish date; guessed from the content when omitted
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if not url:
        raise McTMStoriesException("url must not be an empty string")

    url = url[:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': '',
    }

    # postgres refuses to insert text values with the null character
    for key in ('url', 'guid', 'title'):
        story[key] = re2.sub('\x00', '', story[key])

    date_guess = None
    if publish_date is not None:
        story['publish_date'] = publish_date
    else:
        date_guess = guess_date(url, content)
        guessed = date_guess.date if date_guess.found else fallback_date
        if guessed is None:
            # Last resort: stamp with "now" so publish_date is never NULL.
            guessed = datetime.datetime.now().isoformat()
        story['publish_date'] = guessed

    story = add_story(db, story, feed['feeds_id'])

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        # NOTE(review): another generate_story variant in this codebase calls
        # _extract_story(db, story) — confirm which signature is current.
        _extract_story(story)

    return story
# Esempio n. 3 ("Example no. 3" — scrape artifact from the source site; the
# stray "0" was its vote count. Commented out so the module parses. Note that
# the generate_story() definition below redefines — and at import time would
# shadow — the one above.)
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: Optional[str] = None,
                   publish_date: Optional[str] = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    title - story title; parsed from the content when not provided
    publish_date - story publish date; guessed from the content when not provided
    fallback_date - passed to assign_date_guess_tag(); note that in this variant it is
        NOT substituted into story['publish_date'] when the guess fails
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    log.debug(f"Generating story from URL {url}...")

    # Truncate overly long URLs so they fit the database column.
    url = url[0:MAX_URL_LENGTH]

    log.debug(f"Guessing medium for URL {url}...")
    medium = guess_medium(db, url)
    log.debug(f"Done guessing medium for URL {url}: {medium}")

    log.debug(f"Getting spider feed for medium {medium}...")
    feed = get_spider_feed(db, medium)
    log.debug(f"Done getting spider feed for medium {medium}: {feed}")

    log.debug(f"Getting spidered tag...")
    spidered_tag = get_spidered_tag(db)
    log.debug(f"Done getting spidered tag: {spidered_tag}")

    if title is None:
        log.debug(f"Parsing HTML title...")
        title = html_title(content, url, MAX_TITLE_LENGTH)
        log.debug(f"Done parsing HTML title: {title}")

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        log.debug(f"Guessing date for URL {url}...")
        date_guess = guess_date(url, content)
        log.debug(f"Done guessing date for URL {url}: {date_guess}")

        # NOTE(review): an unguessable date is left as None here; fallback_date
        # is only consulted by assign_date_guess_tag() below — confirm intended.
        story['publish_date'] = date_guess.date if date_guess.found else None
    else:
        story['publish_date'] = publish_date

    log.debug(f"Adding story {story}...")
    story = add_story(db, story, feed['feeds_id'])
    log.debug(f"Done adding story {story}")

    # Tag the story as spidered; the WHERE NOT EXISTS guard makes the insert
    # a no-op if the (stories_id, tags_id) pair is already mapped.
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        log.debug(f"Assigning date guess tag...")
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    # Only a freshly inserted story ('is_new') gets its content downloaded,
    # stored and extracted; an existing story returned by add_story() does not.
    if story.get('is_new', False):
        log.debug("Story is new, creating download...")
        download = create_download_for_new_story(db, story, feed)

        log.debug("Storing story content...")
        store_and_verify_content(db, download, content)

        log.debug("Extracting story...")
        _extract_story(db, story)
        log.debug("Done extracting story")

    else:
        log.debug("Story is not new, skipping download storage and extraction")

    log.debug(f"Done generating story from URL {url}")

    return story