Example #1
def test_add_story_description_unset():
    """Test adding a story without a description being set."""

    db = connect_to_db()

    medium = create_test_medium(db=db, label='test')
    feed = create_test_feed(db=db, label='test', medium=medium)

    story = {
        'url': 'http://test',
        'guid': 'http://test',
        'media_id': medium['media_id'],
        'title': "test",

        # stories.description can be NULL so it's a valid value:
        'description': None,
        'publish_date': '2016-10-15 08:00:00',
        'collect_date': '2016-10-15 10:00:00',
    }

    add_story(db=db, story=story, feeds_id=feed['feeds_id'])

    assert len(db.select(table='stories', what_to_select='*').hashes()) == 1
    assert len(
        db.select(table='feeds_stories_map', what_to_select='*').hashes()) == 1
Example #2
    def test_add_story_full_text_rss(self):
        """Test add_story() with only parent media's full_text_rss set to True."""

        media_id = self.test_medium['media_id']
        feeds_id = self.test_feed['feeds_id']

        self.db().update_by_id(
            table='media',
            object_id=media_id,
            update_hash={'full_text_rss': True},
        )

        story = {
            'media_id': media_id,
            'url': 'http://add.story/',
            'guid': 'http://add.story/',
            'title': 'test add story',
            'description': 'test add story',
            'publish_date': '2016-10-15 08:00:00',
            'collect_date': '2016-10-15 10:00:00',
            # 'full_text_rss' to be inferred from parent "media" item
        }
        added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)
        assert added_story
        assert 'stories_id' in added_story
        assert story['url'] == added_story['url']
        assert added_story['full_text_rss'] is True
Example #3
    def test_add_story(self):
        """Test add_story()."""

        media_id = self.test_medium['media_id']
        feeds_id = self.test_feed['feeds_id']

        # Basic story
        story = {
            'media_id': media_id,
            'url': 'http://add.story/',
            'guid': 'http://add.story/',
            'title': 'test add story',
            'description': 'test add story',
            'publish_date': '2016-10-15 08:00:00',
            'collect_date': '2016-10-15 10:00:00',
            'full_text_rss': True,
        }
        added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)
        assert added_story
        assert 'stories_id' in added_story
        assert story['url'] == added_story['url']
        assert added_story['full_text_rss'] is True

        feeds_stories_tag_mapping = self.db().select(
            table='feeds_stories_map',
            what_to_select='*',
            condition_hash={
                'stories_id': added_story['stories_id'],
                'feeds_id': feeds_id,
            }).hashes()
        assert len(feeds_stories_tag_mapping) == 1

        story_urls = self.db().query(
            "select * from story_urls where stories_id = %(a)s", {
                'a': added_story['stories_id']
            }).hashes()
        assert len(story_urls) == 1
        assert story_urls[0]['url'] == added_story['url']

        # Try adding a duplicate story
        dup_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)
        assert dup_story is not None
        assert dup_story['stories_id'] == added_story['stories_id']
Example #4
    def test_add_story(self):
        """Test add_story()."""

        media_id = self.test_medium['media_id']
        feeds_id = self.test_feed['feeds_id']

        # Basic story
        story = {
            'media_id': media_id,
            'url': 'http://add.story/',
            'guid': 'http://add.story/',
            'title': 'test add story',
            'description': 'test add story',
            'publish_date': '2016-10-15 08:00:00',
            'collect_date': '2016-10-15 10:00:00',
            'full_text_rss': True,
        }
        added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)
        assert added_story
        assert 'stories_id' in added_story
        assert story['url'] == added_story['url']
        assert added_story['full_text_rss'] is True

        feeds_stories_tag_mapping = self.db().select(
            table='feeds_stories_map',
            what_to_select='*',
            condition_hash={
                'stories_id': added_story['stories_id'],
                'feeds_id': feeds_id,
            }).hashes()
        assert len(feeds_stories_tag_mapping) == 1

        # Try adding a duplicate story
        added_story = add_story(db=self.db(), story=story, feeds_id=feeds_id)
        assert added_story is None

        # Try adding a duplicate story with explicit "is new" testing disabled
        added_story = add_story(db=self.db(),
                                story=story,
                                feeds_id=feeds_id,
                                skip_checking_if_new=True)
        assert added_story is None
Example #5
def _import_ap_story(db: DatabaseHandler, ap_story: dict) -> None:
    """Given a ap story return by get_new_stories(), add it to the database."""
    ap_medium = db.query(
        """
        SELECT *
        FROM media
        WHERE name = %(medium_name)s
    """, {
            'medium_name': AP_MEDIUM_NAME,
        }).hash()
    ap_feed = {
        'media_id': ap_medium['media_id'],
        'name': 'API Feed',
        'active': False,
        'type': 'syndicated',
        'url': 'http://ap.com'
    }
    ap_feed = db.find_or_create('feeds', ap_feed)

    story = {
        'guid': ap_story['guid'],
        'url': ap_story['url'],
        'publish_date': ap_story['publish_date'],
        'title': ap_story['title'],
        'description': ap_story['description'],
        'media_id': ap_medium['media_id']
    }
    story = add_story(db, story, ap_feed['feeds_id'])

    if not story:
        return

    story_download = create_download_for_new_story(db, story, ap_feed)

    download_text = {
        'downloads_id': story_download['downloads_id'],
        'download_text': ap_story['text'],
        'download_text_length': len(ap_story['text'])
    }

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, %(download_text_length)s)
        """, download_text)

    # Send to the extractor for it to do vectorization, language detection, etc.
    JobBroker(queue_name='MediaWords::Job::ExtractAndVector').add_to_queue(
        stories_id=story['stories_id'],
        use_existing=True,
    )
Example #6
def add_story_and_content_download(db: DatabaseHandler, story: dict,
                                   parent_download: dict) -> Optional[dict]:
    """If the story is new, add it to the database and also add a pending download for the story content."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    story = add_story(db=db, story=story, feeds_id=parent_download['feeds_id'])

    if story:
        if story.get('is_new', False):
            _create_child_download_for_story(db=db,
                                             story=story,
                                             parent_download=parent_download)

    return story
Example #7
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Handle feeds of type 'web_page' by just creating a story to associate with the content.

        Web page feeds are feeds that consist of a web page that we download once a week and add as a story.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        feeds_id = download['feeds_id']

        feed = db.find_by_id(table='feeds', object_id=feeds_id)

        title = html_title(html=content, fallback='(no title)')
        title += '[' + sql_now() + ']'

        guid = f"{str(int(time.time()))}:{download['url']}"[0:1024]

        new_story = {
            'url': download['url'],
            'guid': guid,
            'media_id': feed['media_id'],
            'publish_date': sql_now(),
            'title': title,
        }

        story = add_story(db=db, story=new_story, feeds_id=feeds_id)
        if not story:
            raise McCrawlerFetcherSoftError(f"Failed to add story {new_story}")

        db.query(
            """
            UPDATE downloads
            SET stories_id = %(stories_id)s,
                type = 'content'
            WHERE downloads_id = %(downloads_id)s
        """, {
                'stories_id': story['stories_id'],
                'downloads_id': download['downloads_id'],
            })

        # A webpage that was just fetched is also a story
        story_ids = [
            story['stories_id'],
        ]

        return story_ids
Example #8
    def add_stories_from_feed(self, db: DatabaseHandler, download: dict,
                              content: str) -> List[int]:
        """
        Parse the feed content; create a story dict for each parsed story; check for a new URL since the last feed
        download; if there is a new URL, check whether each story is new, and if so add it to the database and add a
        pending download for it. Return new stories that were found in the feed.
        """
        download = decode_object_from_bytes_if_needed(download)
        content = decode_object_from_bytes_if_needed(content)

        media_id = get_media_id(db=db, download=download)
        download_time = download['download_time']

        try:
            stories = self._get_stories_from_syndicated_feed(
                content=content,
                media_id=media_id,
                download_time=download_time,
            )
        except Exception as ex:
            raise McCrawlerFetcherSoftError(
                f"Error processing feed for {download['url']}: {ex}")

        if stories_checksum_matches_feed(db=db,
                                         feeds_id=download['feeds_id'],
                                         stories=stories):
            return []

        new_story_ids = []
        for story in stories:

            # FIXME: none of the helpers accept keys they don't know about
            story_without_enclosures = story.copy()
            story_without_enclosures.pop('enclosures')

            if self._add_content_download_for_new_stories():
                added_story = add_story_and_content_download(
                    db=db,
                    story=story_without_enclosures,
                    parent_download=download,
                )
            else:
                added_story = add_story(
                    db=db,
                    story=story_without_enclosures,
                    feeds_id=download['feeds_id'],
                )

            # We might have received None due to a GUID conflict
            if added_story:

                stories_id = added_story['stories_id']
                story_is_new = added_story.get('is_new', False)

                if story_is_new:

                    # Add all of the enclosures
                    for enclosure in story['enclosures']:
                        # ...provided that the URL is set
                        if enclosure['url']:

                            db.query(
                                """
                                INSERT INTO story_enclosures (stories_id, url, mime_type, length)
                                VALUES (%(stories_id)s, %(url)s, %(mime_type)s, %(length)s)
                                
                                -- Some stories have multiple enclosures pointing to the same URL
                                ON CONFLICT (stories_id, url) DO NOTHING
                            """, {
                                    'stories_id': stories_id,
                                    'url': enclosure['url'],
                                    'mime_type': enclosure['mime_type'],
                                    'length': enclosure['length'],
                                })

                    # Append to the list of newly added stories
                    new_story_ids.append(stories_id)

        log.info(
            f"add_stories_from_feed: new stories: {len(new_story_ids)} / {len(stories)}"
        )

        return new_story_ids
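The examples above branch on what add_story() returns: None (e.g. on a GUID conflict), an existing duplicate story, or a freshly inserted row flagged with 'is_new'. Below is a minimal sketch of that calling pattern; it is not part of the original examples, and db, candidate_story and feeds_id are hypothetical placeholders assumed to be set up as in the surrounding tests.

# Minimal sketch (not from the original examples) of how the callers above
# interpret add_story()'s return value; `db`, `candidate_story` and `feeds_id`
# are assumed to be set up as in the surrounding examples.
added_story = add_story(db=db, story=candidate_story, feeds_id=feeds_id)

if added_story is None:
    # Nothing was added, e.g. because of a GUID conflict with an existing story.
    pass
elif added_story.get('is_new', False):
    # A new row was inserted; only in this case do the callers above create a
    # content download and queue the story for extraction.
    pass
else:
    # An existing story was returned instead of inserting a duplicate.
    pass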
Example #9
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:MAX_URL_LENGTH]

    medium = guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = get_spidered_tag(db)

    if title is None:
        title = html_title(content, url, MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    story = add_story(db, story, feed['feeds_id'])

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        download = create_download_for_new_story(db, story, feed)
        store_content(db, download, content)
        _extract_story(story)

    return story
Example #10
def generate_story(db: DatabaseHandler,
                   url: str,
                   content: str,
                   title: str = None,
                   publish_date: str = None,
                   fallback_date: Optional[str] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    log.debug(f"Generating story from URL {url}...")

    url = url[0:MAX_URL_LENGTH]

    log.debug(f"Guessing medium for URL {url}...")
    medium = guess_medium(db, url)
    log.debug(f"Done guessing medium for URL {url}: {medium}")

    log.debug(f"Getting spider feed for medium {medium}...")
    feed = get_spider_feed(db, medium)
    log.debug(f"Done getting spider feed for medium {medium}: {feed}")

    log.debug(f"Getting spidered tag...")
    spidered_tag = get_spidered_tag(db)
    log.debug(f"Done getting spidered tag: {spidered_tag}")

    if title is None:
        log.debug(f"Parsing HTML title...")
        title = html_title(content, url, MAX_TITLE_LENGTH)
        log.debug(f"Done parsing HTML title: {title}")

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = None
    if publish_date is None:
        log.debug(f"Guessing date for URL {url}...")
        date_guess = guess_date(url, content)
        log.debug(f"Done guessing date for URL {url}: {date_guess}")

        story['publish_date'] = date_guess.date if date_guess.found else None
    else:
        story['publish_date'] = publish_date

    log.debug(f"Adding story {story}...")
    story = add_story(db, story, feed['feeds_id'])
    log.debug(f"Done adding story {story}")

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, %(b)s
            where not exists (
                select 1
                from stories_tags_map
                where stories_id = %(a)s
                  and tags_id = %(b)s
            )
        """, {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    if publish_date is None:
        log.debug(f"Assigning date guess tag...")
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    if story.get('is_new', False):
        log.debug("Story is new, creating download...")
        download = create_download_for_new_story(db, story, feed)

        log.debug("Storing story content...")
        store_and_verify_content(db, download, content)

        log.debug("Extracting story...")
        _extract_story(db, story)
        log.debug("Done extracting story")

    else:
        log.debug("Story is not new, skipping download storage and extraction")

    log.debug(f"Done generating story from URL {url}")

    return story