def test_guess_date():
    """Exercise guess_date() with invalid input, a dateable page and an undateable page."""
    # Every invalid call gets its own pytest.raises() block: the context manager
    # exits as soon as the first exception is raised, so further calls placed in
    # the same block would never be executed (and never verified).
    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url=None, html=None)

    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url="https://www.nytimes.com/2017/10/some_news.html",
                   html=None)

    with pytest.raises(McGuessDateException):
        # noinspection PyTypeChecker
        guess_date(url=None, html="Something")

    # Found
    result = guess_date(url="https://www.nytimes.com/2017/10/some_news.html",
                        html="""
            <html><head>
            <meta property="article:published" itemprop="datePublished" content="2017-10-13T04:56:54-04:00" />
            </head></html>
        """)
    assert result.found is True
    assert result.guess_method.startswith('Extracted from')
    assert result.timestamp == 1507885014
    assert result.date == '2017-10-13T08:56:54'

    # Not found (undateable, even though the date is there in <meta />)
    result = guess_date(url="https://en.wikipedia.org/wiki/Progressive_tax",
                        html="""
            <html><head>
            <meta property="article:published" itemprop="datePublished" content="2017-10-13T04:56:54-04:00" />
            </head></html>
        """)
    assert result.found is False
    assert result.guess_method is None
    assert result.timestamp is None
    assert result.date is None
def benchmark_date_guessing():
    """Benchmark Python date guessing code.

    Takes a directory path from the first command line argument, runs guess_date() on the
    contents of every file in it whose name ends with ".txt", and prints each file's name,
    content length and guessed date.

    Exits with a usage message when no directory argument was given.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <directory of html files>" % sys.argv[0])

    directory = sys.argv[1]

    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if not filename.endswith(".txt"):
            continue

        # Read inside a context manager so the handle is closed for every file,
        # even if reading fails; otherwise one descriptor leaks per iteration.
        with open(os.path.join(directory, filename)) as fh:
            content = fh.read()

        print(filename + ": " + str(len(content)))
        # The URL is a placeholder, so the guess is made with this fixed dummy URL and the content
        date_guess = guess_date(
            url='http://dont.know.the.date/some/path.html', html=content)
        print(date_guess.date)
# Example #3
# 0
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.
    After the story row is created it is tagged with the spidered tag and the date guess tag,
    mapped to the spider feed, and a download is created for it whose content is stored and
    then extracted.

    Arguments:
    db - db handle
    url - story url (truncated to _MAX_URL_LENGTH)
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date; if this is
        None as well, the current time is used

    Returns:
    the story as returned by db.create()

    Raises:
    McTMStoriesException - if the url is empty or creating the story fails for any reason other
        than a unique constraint violation
    McTMStoriesDuplicateException - if creating the story violates a unique constraint
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:_MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)
    title = mediawords.util.parse_html.html_title(content, url,
                                                  _MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = guess_date(url, content)
    story[
        'publish_date'] = date_guess.date if date_guess.found else fallback_date
    if story['publish_date'] is None:
        story['publish_date'] = datetime.datetime.now().isoformat()

    # Chain the original exception explicitly so the root cause stays attached
    # to the project-level exception seen by callers.
    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException as ex:
        raise McTMStoriesDuplicateException(
            "Attempt to insert duplicate story url %s" % url) from ex
    except Exception as ex:
        raise McTMStoriesException("Error adding story: %s" %
                                   traceback.format_exc()) from ex

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    db.create('feeds_stories_map', {
        'stories_id': story['stories_id'],
        'feeds_id': feed['feeds_id']
    })

    download = create_download_for_new_story(db, story, feed)

    mediawords.dbi.downloads.store_content(db, download, content)

    _extract_story(db, story)

    return story
# Example #4
# 0
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        title: str = None,
        publish_date: datetime.datetime = None,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Create a story for the given url and content, guessing whatever metadata was not supplied.

    The medium and feed are derived from the url. The title is parsed out of the content unless
    one is passed in, and the publish date is guessed from the url and content unless one is
    passed in.

    The story is stored through mediawords.dbi.stories.stories.add_story(), which is expected to
    hand back the existing story when one already exists for the same media_id and url (that
    logic is not visible here). The download is created, its content stored and the story
    extracted only when the returned story is flagged 'is_new'.

    Arguments:
    db - db handle
    url - story url (truncated to MAX_URL_LENGTH)
    content - story content
    title - story title; parsed from the content when None
    publish_date - story publish date; guessed from the url and content when None
    fallback_date - fallback to this date if the date guesser fails to find a date; the current
        time is used if this is None too

    Returns the story as returned by add_story().
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    stories_api = mediawords.dbi.stories.stories

    url = url[0:stories_api.MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)

    if title is None:
        title = mediawords.util.parse_html.html_title(content, url, stories_api.MAX_TITLE_LENGTH)

    story = dict(url=url, guid=url, media_id=medium['media_id'], title=title, description='')

    # postgres refuses to insert text values with the null character
    story.update({key: re2.sub('\x00', '', story[key]) for key in ('url', 'guid', 'title')})

    # date_guess stays None when the caller supplied the publish date; it is only
    # consumed below under the same "publish_date is None" condition.
    date_guess = None
    if publish_date is not None:
        story['publish_date'] = publish_date
    else:
        date_guess = guess_date(url, content)
        guessed = date_guess.date if date_guess.found else fallback_date
        story['publish_date'] = datetime.datetime.now().isoformat() if guessed is None else guessed

    story = stories_api.add_story(db, story, feed['feeds_id'])

    # Tag the story as spidered unless that mapping already exists
    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s where not exists (
                select 1 from stories_tags_map where stories_id = %(a)s and tags_id = %(b)s )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    logged_fields = tuple(story[key] for key in ('title', 'url', 'publish_date', 'stories_id'))
    log.debug("add story: %s; %s; %s; %d" % logged_fields)

    # Stories not flagged as new get no download, stored content or extraction
    if not story.get('is_new', False):
        return story

    download = create_download_for_new_story(db, story, feed)
    mediawords.dbi.downloads.store_content(db, download, content)
    _extract_story(db, story)

    return story
# Example #5
# 0
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        title: typing.Optional[str] = None,
        publish_date: typing.Optional[datetime.datetime] = None,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.
    A title or publish date passed in by the caller is used as is instead of being guessed.

    If inserting the story results in a unique constraint error, return the story found by
    mediawords.tm.stories.get_story_match() for the url instead; in that case no tags, feed
    mapping or download are added.

    Arguments:
    db - db handle
    url - story url (truncated to MAX_URL_LENGTH)
    content - story content
    title - story title; parsed from the content when None
    publish_date - story publish date; guessed from the url and content when None
    fallback_date - fallback to this date if the date guesser fails to find a date; if this is
        None as well, the current time is used

    Returns:
    the story as returned by db.create(), or the result of get_story_match() on a unique
    constraint error

    Raises:
    McTMStoriesException - if the url is empty or creating the story fails for any reason other
        than a unique constraint violation
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:mediawords.dbi.stories.stories.MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)

    if title is None:
        title = mediawords.util.parse_html.html_title(content, url, mediawords.dbi.stories.stories.MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException:
        # Duplicate story: hand back the already existing match rather than failing
        return mediawords.tm.stories.get_story_match(db=db, url=story['url'])
    except Exception as ex:
        # Chain explicitly so the root cause stays attached to the project-level exception
        raise McTMStoriesException("Error adding story: %s" % traceback.format_exc()) from ex

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    # date_guess only exists when the date was guessed, i.e. under the same condition
    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    db.create('feeds_stories_map', {'stories_id': story['stories_id'], 'feeds_id': feed['feeds_id']})

    download = create_download_for_new_story(db, story, feed)

    mediawords.dbi.downloads.store_content(db, download, content)

    _extract_story(db, story)

    return story