Example #1
def _create_child_download_for_story(db: DatabaseHandler, story: dict, parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query("""
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {'media_id': story['media_id']}).flat()[0]
    if content_delay:
        # Delay download of content this many hours. This is useful for sources that are likely to significantly change
        # content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
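
A minimal standalone sketch of the content_delay arithmetic above; sql_date_from_epoch() here is a hypothetical stand-in for get_sql_date_from_epoch(), assumed to format an epoch as a SQL timestamp string:

import datetime


def sql_date_from_epoch(epoch: int) -> str:
    """Hypothetical stand-in for get_sql_date_from_epoch()."""
    return datetime.datetime.fromtimestamp(epoch, tz=datetime.timezone.utc).strftime('%Y-%m-%d %H:%M:%S')


content_delay = 3  # media.content_delay, in hours

now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
download_at_timestamp = now + (content_delay * 60 * 60)

# The pending download will not be picked up until roughly three hours from now
print(sql_date_from_epoch(download_at_timestamp))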
Example #2
def create_test_story(db: DatabaseHandler, label: str, feed: dict) -> dict:
    """Create test story with a simple label belonging to feed."""

    label = decode_object_from_bytes_if_needed(label)
    feed = decode_object_from_bytes_if_needed(feed)

    story = db.create(table='stories',
                      insert_hash={
                          'media_id': int(feed['media_id']),
                          'url': "http://story.test/%s" % label,
                          'guid': "guid://story.test/%s" % label,
                          'title': "story %s" % label,
                          'description': "description %s" % label,
                          'publish_date': '2016-10-15 08:00:00',
                          'collect_date': '2016-10-15 10:00:00',
                          'full_text_rss': True,
                      })

    db.create(table='feeds_stories_map',
              insert_hash={
                  'feeds_id': int(feed['feeds_id']),
                  'stories_id': int(story['stories_id']),
              })

    return story
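
A hedged usage sketch showing how the test fixtures in this listing chain together; connect_to_db() is an assumed helper for obtaining a live DatabaseHandler and is not part of the examples shown here:

# Usage sketch only: requires a running database.
# connect_to_db() is an assumed connection helper, not shown in this listing.
db = connect_to_db()

medium = create_test_medium(db, 'test medium')      # see the create_test_medium example
feed = create_test_feed(db, 'test feed', medium)    # see the create_test_feed example
story = create_test_story(db, 'test story', feed)

assert story['stories_id'] is not None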
Example #3
def validate_remote_integration(db: DatabaseHandler, source: str, query: str,
                                day: str) -> None:
    """Run sanity test on remote APIs."""

    topic = create_test_topic(db, "test_remote_integration")

    tsq = {
        'topics_id': topic['topics_id'],
        'platform': 'twitter',
        'source': source,
        'query': query
    }
    db.create('topic_seed_queries', tsq)

    topic['platform'] = 'twitter'
    topic['pattern'] = '.*'
    topic['start_date'] = day
    topic['end_date'] = day
    topic['mode'] = 'url_sharing'
    db.update_by_id('topics', topic['topics_id'], topic)

    fetch_topic_posts(db, topic['topics_id'])

    got_tts = db.query("select * from topic_posts").hashes()

    # for old ch monitors, lots of the posts may be deleted
    assert len(got_tts) > 20

    assert len(got_tts[0]['content']) > MIN_TEST_POST_LENGTH
    assert len(got_tts[0]['author']) > MIN_TEST_AUTHOR_LENGTH
Example #4
def _add_timespans_to_stories(db: DatabaseHandler,
                              stories: List[Dict[str, Any]]) -> None:
    """Add timespans to stories for solr indexing."""
    stories = decode_object_from_bytes_if_needed(stories)

    topic = create_test_topic(db=db, label="solr dump test")

    snapshot = db.create(table='snapshots',
                         insert_hash={
                             'topics_id': topic['topics_id'],
                             'snapshot_date': '2018-01-01',
                             'start_date': '2018-01-01',
                             'end_date': '2018-01-01',
                         })

    timespans = []
    for i in range(1, 5 + 1):
        timespan = db.create(table='timespans',
                             insert_hash={
                                 'topics_id': topic['topics_id'],
                                 'snapshots_id': snapshot['snapshots_id'],
                                 'start_date': '2018-01-01',
                                 'end_date': '2018-01-01',
                                 'story_count': 1,
                                 'story_link_count': 1,
                                 'medium_count': 1,
                                 'medium_link_count': 1,
                                 'post_count': 1,
                                 'period': 'overall',
                             })
        timespans.append(timespan)

    for story in stories:
        assert isinstance(story, dict)

        timespan = timespans.pop()
        timespans.insert(0, timespan)

        db.query(
            """
            INSERT INTO snap.story_link_counts (
                topics_id,
                timespans_id,
                stories_id,
                media_inlink_count,
                inlink_count,
                outlink_count
            ) VALUES (
                %(topics_id)s,
                %(timespans_id)s,
                %(stories_id)s,
                1,
                1,
                1
            )
        """, {
                'topics_id': timespan['topics_id'],
                'timespans_id': timespan['timespans_id'],
                'stories_id': story['stories_id'],
            })
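
The pop()/insert(0, ...) pair above rotates the timespan list so that stories are assigned to timespans round-robin; a standalone sketch of just that rotation:

timespans = ['t1', 't2', 't3']
stories = ['s1', 's2', 's3', 's4', 's5']

assignments = []
for story in stories:
    timespan = timespans.pop()      # take the last timespan...
    timespans.insert(0, timespan)   # ...and rotate it back to the front
    assignments.append((story, timespan))

print(assignments)
# [('s1', 't3'), ('s2', 't2'), ('s3', 't1'), ('s4', 't3'), ('s5', 't2')]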
Example #5
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
        from stories s, topic_stories ts, media m
        where s.stories_id = ts.stories_id
          and s.media_id = m.media_id
          and m.foreign_rss_links = true
          and ts.topics_id = %(a)s
          and not ts.valid_foreign_rss_story
        """, {
            'a': topic['topics_id']
        }).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {
                'a': story['stories_id']
            }).hash()

        content = ''
        try:
            content = fetch_content(db, download)
        except Exception as ex:
            log.warning(
                f"Unable to fetch content for download {download['downloads_id']}: {ex}"
            )

        # postgres will complain if the content has a null in it
        content = content.replace('\x00', '')

        db.begin()
        db.create(
            'topic_seed_urls', {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        db.query(
            """
            update topic_links set ref_stories_id = null, link_spidered = 'f'
                where topics_id = %(b)s and ref_stories_id = %(a)s
            """, {
                'a': story['stories_id'],
                'b': topic['topics_id']
            })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {
                'a': story['stories_id'],
                'b': topic['topics_id']
            })
        db.commit()
Example #6
def get_consistent_color(db: DatabaseHandler, item_set: str,
                         item_id: str) -> str:
    """Return the same hex color (e.g. "ff0000" for the same set / ID combination every time this function is called."""
    item_set = decode_object_from_bytes_if_needed(item_set)
    item_id = decode_object_from_bytes_if_needed(item_id)

    # Always return grey for null or not typed values
    if item_id.lower() in {'null', 'not typed'}:
        return '999999'

    color = db.query(
        """SELECT color FROM color_sets WHERE color_set = %(item_set)s AND id = %(item_id)s""",
        {
            'item_set': item_set,
            'item_id': item_id,
        }).flat()
    if color is not None and len(color):
        if isinstance(color, list):
            color = color[0]
        return color

    set_colors = db.query(
        """SELECT color FROM color_sets WHERE color_set = %(item_set)s""", {
            'item_set': item_set,
        }).flat()
    if set_colors is not None:
        if not isinstance(set_colors, list):
            set_colors = [set_colors]

    existing_colors = set()

    if set_colors is not None:
        for color in set_colors:
            existing_colors.add(color)

    # Use the hard coded palette of 25 colors if possible
    new_color = None
    for color in __MC_COLORS:
        if color not in existing_colors:
            new_color = color
            break

    # Otherwise, just generate a random color
    if new_color is None:
        colors = analogous_color(color='0000ff',
                                 return_slices=256,
                                 split_slices=255)
        new_color = random.choice(colors)

    db.create(table='color_sets',
              insert_hash={
                  'color_set': item_set,
                  'id': item_id,
                  'color': new_color,
              })

    return new_color
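
The selection logic above reduces to "first unused palette color, otherwise a random fallback"; a self-contained sketch of that step, with an illustrative three-color palette standing in for the module's __MC_COLORS list and a random hex color standing in for analogous_color():

import random

PALETTE = ['ff0000', '00ff00', '0000ff']  # illustrative; the real palette has 25 colors


def pick_new_color(existing_colors: set) -> str:
    # Prefer the first palette color not yet used within this color set
    for color in PALETTE:
        if color not in existing_colors:
            return color
    # Palette exhausted: fall back to a random hex color
    return '%06x' % random.randint(0, 0xffffff)


print(pick_new_color({'ff0000'}))                      # -> '00ff00'
print(pick_new_color({'ff0000', '00ff00', '0000ff'}))  # random six-digit hex string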
Example #7
def extract_links_for_topic_story(db: DatabaseHandler, story: dict,
                                  topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.link_mined to true for that story.  Calls get_links_from_story
    on each story.

    Almost all errors are caught by this function and saved in topic_stories.link_mine_error.  In the case of an
    error, topic_stories.link_mined is also set to true.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None

    """
    try:
        log.info("mining %s %s for topic %s .." %
                 (story['title'], story['url'], topic['name']))
        links = get_links_from_story(db, story)

        for link in links:
            if mediawords.tm.domains.skip_self_linked_domain_url(
                    db, topic['topics_id'], story['url'], link):
                log.info("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            mediawords.tm.domains.increment_domain_links(db, topic_link)

        link_mine_error = ''
    except Exception:
        link_mine_error = traceback.format_exc()

    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': topic['topics_id'],
            'c': link_mine_error
        })
Example #8
def get_consistent_color(db: DatabaseHandler, item_set: str, item_id: str) -> str:
    """Return the same hex color (e.g. "ff0000" for the same set / ID combination every time this function is called."""
    item_set = decode_object_from_bytes_if_needed(item_set)
    item_id = decode_object_from_bytes_if_needed(item_id)

    # Always return grey for null or not typed values
    if item_id.lower() in {'null', 'not typed'}:
        return '999999'

    color = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s AND id = %(item_id)s""", {
        'item_set': item_set,
        'item_id': item_id,
    }).flat()
    if color is not None and len(color):
        if isinstance(color, list):
            color = color[0]
        return color

    set_colors = db.query("""SELECT color FROM color_sets WHERE color_set = %(item_set)s""", {
        'item_set': item_set,
    }).flat()
    if set_colors is not None:
        if not isinstance(set_colors, list):
            set_colors = [set_colors]

    existing_colors = set()

    if set_colors is not None:
        for color in set_colors:
            existing_colors.add(color)

    # Use the hard coded palette of 25 colors if possible
    new_color = None
    for color in __MC_COLORS:
        if color not in existing_colors:
            new_color = color
            break

    # Otherwise, just generate a random color
    if new_color is None:
        colors = analogous_color(color='0000ff', return_slices=256, split_slices=255)
        new_color = random.choice(colors)

    db.create(table='color_sets', insert_hash={
        'color_set': item_set,
        'id': item_id,
        'color': new_color,
    })

    return new_color
Example #9
def add_story(db: DatabaseHandler, story: dict, feeds_id: int, skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException("add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".format(story['url'], story['guid'])
            )
            return None

        else:
            raise McAddStoryException("Error adding story: {}\nStory: {}".format(str(ex), str(story)))

    db.find_or_create(
        table='feeds_stories_map',
        insert_hash={
            'stories_id': story['stories_id'],
            'feeds_id': feeds_id,
        }
    )

    db.commit()

    return story
Example #10
def _store_tweet_and_urls(db: DatabaseHandler, topic_tweet_day: dict, ch_post: dict) -> None:
    """
    Store the tweet in topic_tweets and its urls in topic_tweet_urls, using the data in ch_post.

    Arguments:
    db - database handler
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    ch_post - ch_post dict

    Return:
    None
    """
    data_json = mediawords.util.parse_json.encode_json(ch_post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    data_json = data_json.replace('\x00', '')

    topic_tweet = {
        'topic_tweet_days_id': topic_tweet_day['topic_tweet_days_id'],
        'data': data_json,
        'content': ch_post['tweet']['text'],
        'tweet_id': ch_post['tweet_id'],
        'publish_date': ch_post['tweet']['created_at'],
        'twitter_user': ch_post['tweet']['user']['screen_name']
    }

    topic_tweet = db.create('topic_tweets', topic_tweet)

    urls = mediawords.util.twitter.get_tweet_urls(ch_post['tweet'])
    _insert_tweet_urls(db, topic_tweet, urls)
Example #11
def extract_links_for_topic_story(db: DatabaseHandler, story: dict, topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.link_mined to true for that story.  Calls get_links_from_story
    on each story.

    Almost all errors are caught by this function and saved in topic_stories.link_mine_error.  In the case of an
    error, topic_stories.link_mined is also set to true.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None

    """
    try:
        log.info("mining %s %s for topic %s .." % (story['title'], story['url'], topic['name']))
        links = get_links_from_story(db, story)

        for link in links:
            if mediawords.tm.domains.skip_self_linked_domain_url(db, topic['topics_id'], story['url'], link):
                log.info("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            mediawords.tm.domains.increment_domain_links(db, topic_link)

        link_mine_error = ''
    except Exception:
        link_mine_error = traceback.format_exc()

    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': topic['topics_id'], 'c': link_mine_error})
Example #12
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id and
                s.media_id = m.media_id and
                m.foreign_rss_links = true and
                ts.topics_id = %(a)s and
                not ts.valid_foreign_rss_story
        """, {
            'a': topic['topics_id']
        }).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {
                'a': story['stories_id']
            }).hash()

        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception:
            pass

        db.begin()
        db.create(
            'topic_seed_urls', {
                'url': story['url'],
                'topics_id': topic['topics_id'],
                'source': 'merge_foreign_rss_stories',
                'content': content
            })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {
                'a': story['stories_id'],
                'b': topic['topics_id']
            })
        db.commit()
Example #13
def import_feed_downloads(db: DatabaseHandler, csv_file: str) -> None:
    log.info(f"Importing downloads from {csv_file}...")

    db.begin()

    with open(csv_file, mode='r', encoding='utf-8') as f:

        # Guess dialect
        sample = f.read(1024)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        f.seek(0)

        input_csv = csv.DictReader(f, dialect=dialect)

        n = 1
        for download in input_csv:
            log.info(f"Importing download {n}...")
            n += 1

            raw_download_content = download.get('_raw_download_content', None)
            if raw_download_content:
                # Raw content is imported separately below, not as a "downloads" column
                del download['_raw_download_content']

                # Cast some columns
                download['feeds_id'] = int(
                    download['feeds_id']
                ) if 'feeds_id' in download else None  # NULL
                download['stories_id'] = int(
                    download['stories_id']
                ) if 'stories_id' in download else None  # NULL
                download['parent'] = int(
                    download['parent']
                ) if 'parent' in download else None  # NULL
                download['priority'] = int(
                    download['priority']
                ) if 'priority' in download else 0  # NOT NULL
                download['sequence'] = int(
                    download['sequence']
                ) if 'sequence' in download else 0  # NOT NULL
                download['extracted'] = 't' if download.get('extracted',
                                                            False) else 'f'

                # Will be rewritten by handle_download()
                download['path'] = ''

                download = db.create(table='downloads', insert_hash=download)

                # Create mock response to import it
                response = FakeResponse(content=raw_download_content)
                handler = handler_for_download(db=db, download=download)
                handler.store_response(db=db,
                                       download=download,
                                       response=response)

    log.info("Committing...")
    db.commit()

    log.info(f"Done importing downloads from {csv_file}")
Example #14
def _add_topic_post_single_day(db: DatabaseHandler, topic_seed_query: dict, num_posts: int, day: datetime) -> dict:
    """
    Add a row to topic_post_day if it does not already exist.

    Arguments:
    db - database handle
    topic_seed_query - topic_seed_query dict
    day - date to fetch eg '2017-12-30'
    num_posts - number of posts found for that day

    Return:
    the created topic_post_days row (dict)
    """
    # the perl-python layer was segfaulting until I added the str() around day below -hal
    topic_post_day = db.query(
        """
        SELECT *
        FROM topic_post_days
        WHERE
            topics_id = %(topics_id)s AND
            topic_seed_queries_id = %(topic_seed_queries_id)s AND
            day = %(day)s
        """, {
            'topics_id': topic_seed_query['topics_id'],
            'topic_seed_queries_id': topic_seed_query['topic_seed_queries_id'],
            'day': str(day),
        }
    ).hash()

    if topic_post_day is not None and topic_post_day['posts_fetched']:
        raise McFetchTopicPostsDataException("tweets already fetched for day " + str(day))

    # if we have a ttd but had not finished fetching tweets, delete it and start over
    if topic_post_day is not None:
        db.query(
            """
            DELETE FROM topic_post_days
            WHERE
                topics_id = %(topics_id)s AND
                topic_post_days_id = %(topic_post_days_id)s
            """, {
                'topics_id': topic_post_day['topics_id'],
                'topic_post_days_id': topic_post_day['topic_post_days_id'],
            }
        )

    topic_post_day = db.create(
        'topic_post_days',
        {
            'topics_id': topic_seed_query['topics_id'],
            'topic_seed_queries_id': topic_seed_query['topic_seed_queries_id'],
            'day': day,
            'num_posts_stored': num_posts,
            'num_posts_fetched': num_posts,
            'posts_fetched': False
        })

    return topic_post_day
Example #15
def _store_tweet_and_urls(db: DatabaseHandler, topic_tweet_day: dict, ch_post: dict) -> None:
    """
    Store the tweet in topic_tweets and its urls in topic_tweet_urls, using the data in ch_post.

    Arguments:
    db - database handler
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    ch_post - ch_post dict

    Return:
    None
    """
    data_json = mediawords.util.parse_json.encode_json(ch_post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    data_json = data_json.replace(u'\u0000', '')

    topic_tweet = {
        'topic_tweet_days_id': topic_tweet_day['topic_tweet_days_id'],
        'data': data_json,
        'content': ch_post['tweet']['text'],
        'tweet_id': ch_post['tweet_id'],
        'publish_date': ch_post['tweet']['created_at'],
        'twitter_user': ch_post['tweet']['user']['screen_name']
    }

    topic_tweet = db.create('topic_tweets', topic_tweet)

    urls_inserted = {}  # type:typing.Dict[str, bool]
    for url_data in ch_post['tweet']['entities']['urls']:

        url = url_data['expanded_url']

        # skip URLs already stored for this tweet
        if url in urls_inserted:
            continue

        urls_inserted[url] = True

        db.create(
            'topic_tweet_urls',
            {
                'topic_tweets_id': topic_tweet['topic_tweets_id'],
                'url': url[0:1024]
            })
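
The urls_inserted dict above is an order-preserving de-duplication of a tweet's expanded URLs; a standalone sketch of the same idea, including the 1024-character truncation applied to topic_tweet_urls.url:

expanded_urls = [
    'http://example.com/a',
    'http://example.com/b',
    'http://example.com/a',  # duplicate: stored only once
]

urls_inserted = {}
for url in expanded_urls:
    if url in urls_inserted:
        continue
    urls_inserted[url] = True
    print('storing', url[0:1024])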
Example #16
def _store_post_and_urls(db: DatabaseHandler, topic_post_day: dict, post: dict) -> None:
    """
    Store the post in topic_posts and its urls in topic_post_urls, using the data in post.

    Arguments:
    db - database handler
    topic - topic dict
    topic_post_day - topic_post_day dict
    post - post dict

    Return:
    None
    """
    log.debug("remove nulls")
    _remove_json_tree_nulls(post)

    log.debug("encode json")
    data_json = encode_json(post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    # data_json = data_json.replace('\x00', '')

    data = {}

    for field in POST_FIELDS:
        data[field] = post.get(field, None)

    data['topics_id'] = topic_post_day['topics_id']
    data['topic_post_days_id'] = topic_post_day['topic_post_days_id']
    data['data'] = data_json

    topic_post = db.query(
        """
        SELECT *
        FROM topic_posts
        WHERE
            topics_id = %(topics_id)s AND
            topic_post_days_id = %(topic_post_days_id)s AND
            post_id = %(post_id)s::TEXT
        """, {
            'topics_id': topic_post_day['topics_id'],
            'topic_post_days_id': topic_post_day['topic_post_days_id'],
            'post_id': data['post_id'],
        }
    ).hash()

    if not topic_post:
        log.debug("insert topic post")
        topic_post = db.create('topic_posts', data)

    log.debug("insert tweet urls")
    _insert_post_urls(db, topic_post, post['urls'])

    log.debug("done")
Example #17
def create_test_feed(db: DatabaseHandler, label: str, medium: dict) -> dict:
    """Create test feed with a simple label belonging to medium."""

    label = decode_object_from_bytes_if_needed(label)
    medium = decode_object_from_bytes_if_needed(medium)

    return db.create(table='feeds',
                     insert_hash={
                         'name': label,
                         'url': "http://feed.test/%s" % label,
                         'media_id': int(medium['media_id']),
                     })
Example #18
def merge_foreign_rss_stories(db: DatabaseHandler, topic: dict) -> None:
    """Move all topic stories with a foreign_rss_links medium from topic_stories back to topic_seed_urls."""
    topic = decode_object_from_bytes_if_needed(topic)

    stories = db.query(
        """
        select s.*
            from stories s, topic_stories ts, media m
            where
                s.stories_id = ts.stories_id and
                s.media_id = m.media_id and
                m.foreign_rss_links = true and
                ts.topics_id = %(a)s and
                not ts.valid_foreign_rss_story
        """,
        {'a': topic['topics_id']}).hashes()

    for story in stories:
        download = db.query(
            "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
            {'a': story['stories_id']}).hash()

        content = ''
        try:
            content = mediawords.dbi.downloads.fetch_content(db, download)
        except Exception:
            pass

        db.begin()
        db.create('topic_seed_urls', {
            'url': story['url'],
            'topics_id': topic['topics_id'],
            'source': 'merge_foreign_rss_stories',
            'content': content
        })

        db.query(
            "delete from topic_stories where stories_id = %(a)s and topics_id = %(b)s",
            {'a': story['stories_id'], 'b': topic['topics_id']})
        db.commit()
Example #19
def _add_topic_tweet_single_day(
        db: DatabaseHandler, topic: dict, day: datetime.datetime,
        ch_class: typing.Type[AbstractCrimsonHexagon]) -> dict:
    """
    Add a row to topic_tweet_day if it does not already exist.  fetch data for new row from CH.

    Arguments:
    db - database handle
    topic - topic dict
    day - date to fetch eg '2017-12-30'
    ch_class - AbstractCrimsonHexagon class

    Return:
    the created topic_tweet_days row with the fetched 'ch_posts' attached (dict)
    """
    # the perl-python layer was segfaulting until I added the str() around day below -hal
    topic_tweet_day = db.query(
        "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
        {
            'a': topic['topics_id'],
            'b': str(day)
        }).hash()

    if topic_tweet_day is not None and topic_tweet_day['tweets_fetched']:
        raise McFetchTopicTweetDateFetchedException(
            "tweets already fetched for day " + str(day))

    # if we have a ttd but had not finished fetching tweets, delete it and start over
    if topic_tweet_day is not None:
        db.delete_by_id('topic_tweet_days',
                        topic_tweet_day['topic_tweet_days_id'])

    ch_posts = ch_class.fetch_posts(topic['ch_monitor_id'], day)

    tweet_count = ch_posts['totalPostsAvailable']

    num_ch_tweets = len(ch_posts['posts'])

    topic_tweet_day = db.create(
        'topic_tweet_days', {
            'topics_id': topic['topics_id'],
            'day': day,
            'tweet_count': tweet_count,
            'num_ch_tweets': num_ch_tweets,
            'tweets_fetched': False
        })

    topic_tweet_day['ch_posts'] = ch_posts

    return topic_tweet_day
Example #20
def create_test_medium(db: DatabaseHandler, label: str) -> dict:
    """Create test medium with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(table='media',
                     insert_hash={
                         'name': label,
                         'url': "http://media.test/%s" % label,
                         'moderated': True,
                         'is_monitored': True,
                         'public_notes': "%s public notes" % label,
                         'editor_notes': "%s editor notes" % label,
                     })
Example #21
def extract_links_for_topic_story(db: DatabaseHandler, story: dict,
                                  topic: dict) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.link_mined to true for that story.  Calls get_links_from_story
    on each story.

    Arguments:
    db - db handle
    story - story dict from db
    topic - topic dict from db

    Returns:
    None

    """
    log.info("mining %s %s for topic %s .." %
             (story['title'], story['url'], topic['name']))

    links = get_links_from_story(db, story)

    for link in links:
        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': link
        }

        db.create('topic_links', topic_link)

    db.query(
        "update topic_stories set link_mined = 't' where stories_id = %(a)s and topics_id = %(b)s",
        {
            'a': story['stories_id'],
            'b': topic['topics_id']
        })
Example #22
def _create_child_download_for_story(db: DatabaseHandler, story: dict,
                                     parent_download: dict) -> None:
    """Create a pending download for the story's URL."""
    story = decode_object_from_bytes_if_needed(story)
    parent_download = decode_object_from_bytes_if_needed(parent_download)

    download = {
        'feeds_id': parent_download['feeds_id'],
        'stories_id': story['stories_id'],
        'parent': parent_download['downloads_id'],
        'url': story['url'],
        'host': get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'pending',
        'priority': parent_download['priority'],
        'extracted': False,
    }

    content_delay = db.query(
        """
        SELECT content_delay
        FROM media
        WHERE media_id = %(media_id)s
    """, {
            'media_id': story['media_id']
        }).flat()[0]
    if content_delay:
        # Delay download of content this many hours. This is useful for sources that are likely to significantly change
        # content in the hours after it is first published.
        now = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
        download_at_timestamp = now + (content_delay * 60 * 60)
        download['download_time'] = get_sql_date_from_epoch(
            download_at_timestamp)

    db.create(table='downloads', insert_hash=download)
Example #23
def _add_topic_tweet_single_day(
        db: DatabaseHandler,
        topic: dict,
        day: datetime.datetime,
        ch_class: typing.Type[AbstractCrimsonHexagon]) -> dict:
    """
    Add a row to topic_tweet_day if it does not already exist.  fetch data for new row from CH.

    Arguments:
    db - database handle
    topic - topic dict
    day - date to fetch eg '2017-12-30'
    ch_class - AbstractCrimsonHexagon class

    Return:
    the created topic_tweet_days row with the fetched 'ch_posts' attached (dict)
    """
    # the perl-python layer was segfaulting until I added the str() around day below -hal
    topic_tweet_day = db.query(
        "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
        {'a': topic['topics_id'], 'b': str(day)}).hash()

    if topic_tweet_day is not None and topic_tweet_day['tweets_fetched']:
        raise McFetchTopicTweetDateFetchedException("tweets already fetched for day " + str(day))

    # if we have a ttd but had not finished fetching tweets, delete it and start over
    if topic_tweet_day is not None:
        db.delete_by_id('topic_tweet_days', topic_tweet_day['topic_tweet_days_id'])

    ch_posts = ch_class.fetch_posts(topic['ch_monitor_id'], day)

    tweet_count = ch_posts['totalPostsAvailable']

    num_ch_tweets = len(ch_posts['posts'])

    topic_tweet_day = db.create(
        'topic_tweet_days',
        {
            'topics_id': topic['topics_id'],
            'day': day,
            'tweet_count': tweet_count,
            'num_ch_tweets': num_ch_tweets,
            'tweets_fetched': False
        })

    topic_tweet_day['ch_posts'] = ch_posts

    return topic_tweet_day
Example #24
def create_test_topic(db: DatabaseHandler, label: str) -> dict:
    """Create test topic with a simple label."""

    label = decode_object_from_bytes_if_needed(label)

    return db.create(table='topics',
                     insert_hash={
                         'name': label,
                         'description': label,
                         'pattern': label,
                         'solr_seed_query': label,
                         'solr_seed_query_run': True,
                         'start_date': '2016-01-01',
                         'end_date': '2016-03-01',
                         'job_queue': 'mc',
                         'max_stories': 100000,
                     })
Example #25
def _create_queued_job_state(db: DatabaseHandler, queue_name: str, args: Dict[str, Any]) -> Dict[str, Any]:
    """Create the initial entry in the "job_states" table with a state of 'queued' and return it."""
    queue_name = decode_object_from_bytes_if_needed(queue_name)
    args = decode_object_from_bytes_if_needed(args)

    args_json = encode_json(args)

    state = db.create(table='job_states', insert_hash={
        'state': STATE_QUEUED,
        'args': args_json,
        'priority': 'normal',
        'class': queue_name,
        'process_id': os.getpid(),
        'hostname': socket.gethostname(),
    })

    return state
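
A standalone sketch of the "queued" job-state row assembled above; json.dumps() stands in for encode_json() and the queue name is illustrative (both are assumptions):

import json
import os
import socket

args = {'stories_id': 123}

job_state = {
    'state': 'queued',            # STATE_QUEUED in the real module
    'args': json.dumps(args),     # stand-in for encode_json()
    'priority': 'normal',
    'class': 'some_job_queue',    # illustrative queue name
    'process_id': os.getpid(),
    'hostname': socket.gethostname(),
}

print(job_state)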
Example #26
def _store_map(db: DatabaseHandler,
        topics_id: int,
        timespans_id: int,
        content: bytes,
        graph_format: str,
        color_by: str) -> None:
    """Create a timespans_map row."""
    db.begin()

    options = {'color_by': color_by}
    options_json = encode_json(options)

    db.query(
        """
            DELETE FROM timespan_maps
            WHERE timespans_id = %(a)s
              AND format = %(b)s
              AND options = %(c)s
        """,
        {'a': timespans_id, 'b': graph_format, 'c': options_json}
    )

    timespan_map = {
        'topics_id': topics_id,
        'timespans_id': timespans_id,
        'options': options_json,
        'format': graph_format
    }
    timespan_map = db.create('timespan_maps', timespan_map)

    db.commit()

    content_types = {
        'svg': 'image/svg+xml',
        'gexf': 'xml/gexf'
    }
    content_type = content_types[graph_format]

    store_content(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'], content, content_type)

    url = get_content_url(db, TIMESPAN_MAPS_TYPE, timespan_map['timespan_maps_id'])

    db.update_by_id('timespan_maps', timespan_map['timespan_maps_id'], {'url': url})
Example #27
def create_download_for_new_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Create and return download object in database for the new story."""

    download = {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id'],
        'url': story['url'],
        'host': mediawords.util.url.get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'success',
        'path': 'content:pending',
        'priority': 1,
        'extracted': 'f'
    }

    download = db.create('downloads', download)

    return download
Example #28
def create_download_for_new_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Create and return download object in database for the new story."""

    download = {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id'],
        'url': story['url'],
        'host': mediawords.util.url.get_url_host(story['url']),
        'type': 'content',
        'sequence': 1,
        'state': 'success',
        'path': 'content:pending',
        'priority': 1,
        'extracted': 'f'
    }

    download = db.create('downloads', download)

    return download
Example #29
def _add_topic_tweet_single_day(db: DatabaseHandler, topic: dict,
                                num_tweets: int,
                                day: datetime.datetime) -> dict:
    """
    Add a row to topic_tweet_day if it does not already exist.

    Arguments:
    db - database handle
    topic - topic dict
    day - date to fetch eg '2017-12-30'
    num_tweets - number of tweets found for that day

    Return:
    the created topic_tweet_days row (dict)
    """
    # the perl-python layer was segfaulting until I added the str() around day below -hal
    topic_tweet_day = db.query(
        "select * from topic_tweet_days where topics_id = %(a)s and day = %(b)s",
        {
            'a': topic['topics_id'],
            'b': str(day)
        }).hash()

    if topic_tweet_day is not None and topic_tweet_day['tweets_fetched']:
        raise McFetchTopicTweetDateFetchedException(
            "tweets already fetched for day " + str(day))

    # if we have a ttd but had not finished fetching tweets, delete it and start over
    if topic_tweet_day is not None:
        db.delete_by_id('topic_tweet_days',
                        topic_tweet_day['topic_tweet_days_id'])

    topic_tweet_day = db.create(
        'topic_tweet_days', {
            'topics_id': topic['topics_id'],
            'day': day,
            'num_tweets': num_tweets,
            'tweets_fetched': False
        })

    return topic_tweet_day
Example #30
def _store_tweet_and_urls(db: DatabaseHandler, topic_tweet_day: dict,
                          ch_post: dict) -> None:
    """
    Store the tweet in topic_tweets and its urls in topic_tweet_urls, using the data in ch_post.

    Arguments:
    db - database handler
    topic - topic dict
    topic_tweet_day - topic_tweet_day dict
    ch_post - ch_post dict

    Return:
    None
    """
    data_json = mediawords.util.parse_json.encode_json(ch_post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    data_json = data_json.replace('\x00', '')

    topic_tweet = {
        'topic_tweet_days_id': topic_tweet_day['topic_tweet_days_id'],
        'data': data_json,
        'content': ch_post['tweet']['text'],
        'tweet_id': ch_post['tweet_id'],
        'publish_date': ch_post['tweet']['created_at'],
        'twitter_user': ch_post['tweet']['user']['screen_name']
    }

    topic_tweet = db.create('topic_tweets', topic_tweet)

    # for some reason I can't figure out, null characters still sneak through the data_json.replace()
    # above, so we have to tell postgres directly to get rid of them, or else querying the row later
    # will fail
    db.query(
        """
        update topic_tweets set data = regexp_replace(data::text, '\\u0000', '', 'g')::json
            where topic_tweets_id = %(a)s and data::text ~ '\\u0000'
        """, {'a': topic_tweet['topic_tweets_id']})

    urls = mediawords.util.twitter.get_tweet_urls(ch_post['tweet'])
    _insert_tweet_urls(db, topic_tweet, urls)
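
One plausible reason the Python-side replace('\x00', '') misses some nulls is that, once the tweet is JSON-encoded, the NUL byte survives as the six-character escape sequence \u0000 rather than as a raw byte; a small standalone sketch of that effect:

import json

tweet = {'text': 'hello\x00world'}

data_json = json.dumps(tweet)       # the NUL byte is serialized as the escape \u0000
print('\x00' in data_json)          # False: no raw NUL byte left to replace
print('\\u0000' in data_json)       # True: the escape sequence is still there

cleaned = data_json.replace('\\u0000', '')  # strip the escape itself
print(json.loads(cleaned))                  # {'text': 'helloworld'}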
Example #31
def create_download_for_feed(db: DatabaseHandler, feed: dict) -> dict:
    feed = decode_object_from_bytes_if_needed(feed)

    priority = 0
    if 'last_attempted_download_time' not in feed:
        priority = 10

    host = get_url_host(url=feed['url'])

    return db.create(table='downloads',
                     insert_hash={
                         'feeds_id': int(feed['feeds_id']),
                         'url': feed['url'],
                         'host': host,
                         'type': 'feed',
                         'sequence': 1,
                         'state': 'pending',
                         'priority': priority,
                         'download_time': 'NOW()',
                         'extracted': False,
                     })
Example #32
def _store_post_and_urls(db: DatabaseHandler, topic_post_day: dict,
                         post: dict) -> None:
    """
    Store the post in topic_posts and its urls in topic_post_urls, using the data in post.

    Arguments:
    db - database handler
    topic - topic dict
    topic_post_day - topic_post_day dict
    post - post dict

    Return:
    None
    """
    log.debug("remove nulls")
    _remove_json_tree_nulls(post)

    log.debug("encode json")
    data_json = encode_json(post)

    # null characters are not legal in json but for some reason get stuck in these tweets
    # data_json = data_json.replace('\x00', '')

    topic_post = {
        'topic_post_days_id': topic_post_day['topic_post_days_id'],
        'data': data_json
    }

    for field in POST_FIELDS:
        topic_post[field] = post.get(field, None)

    log.debug("insert topic post")
    topic_post = db.create('topic_posts', topic_post)

    urls = _get_post_urls(post)

    log.debug("insert tweet urls")
    _insert_post_urls(db, topic_post, urls)

    log.debug("done")
Example #33
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict,
                             new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db,
                         story=story,
                         topic=topic,
                         valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': old_story['stories_id']
        })

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {
        'feeds_id': feed['feeds_id'],
        'stories_id': story['stories_id']
    })

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {
            'a': old_story['stories_id']
        }).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = fetch_content(db, old_download)
            download = store_content(db, download, content)
        except (McDBIDownloadsException, McAmazonS3StoreException):
            download_update = dict([
                (f, old_download[f])
                for f in ['state', 'error_message', 'download_time']
            ])
            db.update_by_id('downloads', download['downloads_id'],
                            download_update)

        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(a)s
            """, {'a': download['downloads_id']})

    # noinspection SqlInsertValues
    db.query(
        f"""
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select {int(story['stories_id'])} as stories_id, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """, {'b': old_story['stories_id']})

    return story
Example #34
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        title: str = None,
        publish_date: datetime.datetime = None,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    If inserting the story results in a unique constraint error based on media_id and url, return
    the existing story instead.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:mediawords.dbi.stories.stories.MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)

    if title is None:
        title = mediawords.util.parse_html.html_title(content, url, mediawords.dbi.stories.stories.MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    if publish_date is None:
        date_guess = guess_date(url, content)
        story['publish_date'] = date_guess.date if date_guess.found else fallback_date
        if story['publish_date'] is None:
            story['publish_date'] = datetime.datetime.now().isoformat()
    else:
        story['publish_date'] = publish_date

    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException:
        return mediawords.tm.stories.get_story_match(db=db, url=story['url'])
    except Exception:
        raise McTMStoriesException("Error adding story: %s" % traceback.format_exc())

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': spidered_tag['tags_id']})

    if publish_date is None:
        assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" % (story['title'], story['url'], story['publish_date'], story['stories_id']))

    db.create('feeds_stories_map', {'stories_id': story['stories_id'], 'feeds_id': feed['feeds_id']})

    download = create_download_for_new_story(db, story, feed)

    mediawords.dbi.downloads.store_content(db, download, content)

    _extract_story(db, story)

    return story
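
The sanitization in generate_story() (truncating the URL and stripping NUL bytes before insert) can be sketched without the database; the standard re module stands in for re2, and the MAX_URL_LENGTH value here is illustrative:

import re

MAX_URL_LENGTH = 1024  # illustrative; the real constant lives in mediawords.dbi.stories.stories

url = ('http://example.com/story?id=' + 'x' * 2000)[0:MAX_URL_LENGTH]

story = {'url': url, 'guid': url, 'title': 'breaking\x00news', 'description': ''}

# postgres refuses to insert text values with the null character
for field in ('url', 'guid', 'title'):
    story[field] = re.sub('\x00', '', story[field])

print(len(story['url']), story['title'])  # 1024 breakingnews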
Example #35
def login_with_email_password(db: DatabaseHandler, email: str, password: str, ip_address: str = None) -> CurrentUser:
    """Log in with username and password; raise on unsuccessful login."""

    email = decode_object_from_bytes_if_needed(email)
    password = decode_object_from_bytes_if_needed(password)

    if not (email and password):
        raise McAuthLoginException("Email and password must be defined.")

    # Try-except block because we don't want to reveal the specific reason why the login has failed
    try:

        user = user_info(db=db, email=email)

        # Check if user has tried to log in unsuccessfully before and now is trying
        # again too fast
        if __user_is_trying_to_login_too_soon(db=db, email=email):
            raise McAuthLoginException(
                "User '%s' is trying to log in too soon after the last unsuccessful attempt." % email
            )

        if not password_hash_is_valid(password_hash=user.password_hash(), password=password):
            raise McAuthLoginException("Password for user '%s' is invalid." % email)

    except Exception as ex:
        log.info(
            "Login failed for %(email)s, will delay any successive login attempt for %(delay)d seconds: %(exc)s" % {
                'email': email,
                'delay': __POST_UNSUCCESSFUL_LOGIN_DELAY,
                'exc': str(ex),
            }
        )

        # Set the unsuccessful login timestamp
        # (TIMESTAMP 'now' returns "current transaction's start time", so using LOCALTIMESTAMP instead)
        db.query("""
            UPDATE auth_users
            SET last_unsuccessful_login_attempt = LOCALTIMESTAMP
            WHERE email = %(email)s
        """, {'email': email})

        # It might make sense to time.sleep() here for the duration of $POST_UNSUCCESSFUL_LOGIN_DELAY seconds to prevent
        # legitimate users from trying to log in too fast. However, when being actually brute-forced through multiple
        # HTTP connections, this approach might end up creating a lot of processes that would time.sleep() and take up
        # memory.
        #
        # So, let's return the error page ASAP and hope that a legitimate user won't be able to reenter his / her
        # password before the $POST_UNSUCCESSFUL_LOGIN_DELAY amount of seconds pass.

        # Don't give out a specific reason for the user to not be able to find
        # out which user emails are registered
        raise McAuthLoginException("User '%s' was not found or password is incorrect." % email)

    if not user.active():
        raise McAuthLoginException("User with email '%s' is not active." % email)

    # Reset password reset token (if any)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = NULL
        WHERE email = %(email)s
          AND password_reset_token_hash IS NOT NULL
    """, {'email': email})

    if ip_address:
        if not user.api_key_for_ip_address(ip_address):
            db.create(
                table='auth_user_api_keys',
                insert_hash={
                    'auth_users_id': user.user_id(),
                    'ip_address': ip_address,
                })

            # Fetch user again
            user = user_info(db=db, email=email)

            if not user.api_key_for_ip_address(ip_address):
                raise McAuthLoginException("Unable to create per-IP API key for IP %s" % ip_address)

    return user
Example #36
def extract_links_for_topic_story(
    db: DatabaseHandler,
    stories_id: int,
    topics_id: int,
    test_throw_exception: bool = False,
) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.link_mined to true for that story.  Calls _get_links_from_story()
    on each story.

    Almost all errors are caught by this function and saved in topic_stories.link_mine_error.  In the case of an
    error, topic_stories.link_mined is also set to true.

    Arguments:
    db - db handle
    stories_id - ID of the story to mine
    topics_id - ID of the topic to mine the story for
    test_throw_exception - if True, raise a test exception so that error logging can be exercised

    Returns:
    None

    """
    story = db.require_by_id(table='stories', object_id=stories_id)
    topic = db.require_by_id(table='topics', object_id=topics_id)

    try:
        if test_throw_exception:
            raise McExtractLinksForTopicStoryTestException(
                "Testing whether errors get logged.")

        log.info("mining %s %s for topic %s .." %
                 (story['title'], story['url'], topic['name']))
        links = _get_links_from_story(db, story)

        for link in links:
            if skip_self_linked_domain_url(db, topic['topics_id'],
                                           story['url'], link):
                log.debug("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            increment_domain_links(db, topic_link)

        link_mine_error = ''
    except Exception as ex:
        log.error(f"Link mining error: {ex}")
        link_mine_error = traceback.format_exc()

    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': topic['topics_id'],
            'c': link_mine_error
        })
Example #37
def add_story(db: DatabaseHandler,
              story: dict,
              feeds_id: int,
              skip_checking_if_new: bool = False) -> Optional[dict]:
    """If the story is new, add story to the database with the feed of the download as story feed.

    Returns created story or None if story wasn't created.
    """

    story = decode_object_from_bytes_if_needed(story)
    if isinstance(feeds_id, bytes):
        feeds_id = decode_object_from_bytes_if_needed(feeds_id)
    feeds_id = int(feeds_id)
    if isinstance(skip_checking_if_new, bytes):
        skip_checking_if_new = decode_object_from_bytes_if_needed(
            skip_checking_if_new)
    skip_checking_if_new = bool(int(skip_checking_if_new))

    if db.in_transaction():
        raise McAddStoryException(
            "add_story() can't be run from within transaction.")

    db.begin()

    db.query("LOCK TABLE stories IN ROW EXCLUSIVE MODE")

    if not skip_checking_if_new:
        if not is_new(db=db, story=story):
            log.debug("Story '{}' is not new.".format(story['url']))
            db.commit()
            return None

    medium = db.find_by_id(table='media', object_id=story['media_id'])

    if story.get('full_text_rss', None) is None:
        story['full_text_rss'] = medium.get('full_text_rss', False) or False
        if len(story.get('description', '')) == 0:
            story['full_text_rss'] = False

    try:
        story = db.create(table='stories', insert_hash=story)
    except Exception as ex:
        db.rollback()

        # FIXME get rid of this, replace with native upsert on "stories_guid" unique constraint
        if 'unique constraint \"stories_guid' in str(ex):
            log.warning(
                "Failed to add story for '{}' to GUID conflict (guid = '{}')".
                format(story['url'], story['guid']))
            return None

        else:
            raise McAddStoryException(
                "Error adding story: {}\nStory: {}".format(
                    str(ex), str(story)))

    db.find_or_create(table='feeds_stories_map',
                      insert_hash={
                          'stories_id': story['stories_id'],
                          'feeds_id': feeds_id,
                      })

    db.commit()

    return story
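The FIXME above suggests replacing the create-and-catch pattern with a native PostgreSQL upsert against the "stories_guid" unique constraint. A rough, untested sketch of that idea follows; the column list and constraint name are assumptions inferred from the error string checked above, not a confirmed schema.

def _insert_story_skipping_guid_duplicates(db: DatabaseHandler, story: dict) -> Optional[dict]:
    """Insert a story, returning None when one with the same GUID already exists (illustrative sketch)."""
    # Assumes the caller supplies every listed column; "on conflict ... do nothing"
    # makes PostgreSQL skip the insert instead of raising on a duplicate GUID.
    return db.query(
        """
        insert into stories (media_id, url, guid, title, description, publish_date, collect_date, full_text_rss)
            values (%(media_id)s, %(url)s, %(guid)s, %(title)s, %(description)s,
                    %(publish_date)s, %(collect_date)s, %(full_text_rss)s)
            on conflict on constraint stories_guid do nothing
            returning *
        """, story).hash()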
Example no. 38
0
def copy_story_to_new_medium(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy story to new medium.

    Copy the given story, assigning the new media_id and copying over the download, extracted text, and so on.
    Return the new story.
    """

    story = {
        'url': old_story['url'],
        'media_id': new_medium['media_id'],
        'guid': old_story['guid'],
        'publish_date': old_story['publish_date'],
        'collect_date': mediawords.util.sql.sql_now(),
        'description': old_story['description'],
        'title': old_story['title']
    }

    story = db.create('stories', story)
    add_to_topic_stories(db=db, story=story, topic=topic, valid_foreign_rss_story=True)

    db.query(
        """
        insert into stories_tags_map (stories_id, tags_id)
            select %(a)s, stm.tags_id from stories_tags_map stm where stm.stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    feed = get_spider_feed(db, new_medium)
    db.create('feeds_stories_map', {'feeds_id': feed['feeds_id'], 'stories_id': story['stories_id']})

    old_download = db.query(
        "select * from downloads where stories_id = %(a)s order by downloads_id limit 1",
        {'a': old_story['stories_id']}).hash()
    download = create_download_for_new_story(db, story, feed)

    if old_download is not None:
        try:
            content = mediawords.dbi.downloads.fetch_content(db, old_download)
            download = mediawords.dbi.downloads.store_content(db, download, content)
        except (mediawords.dbi.downloads.McDBIDownloadsException,
                mediawords.key_value_store.amazon_s3.McAmazonS3StoreException):
            download_update = dict([(f, old_download[f]) for f in ['state', 'error_message', 'download_time']])
            db.update_by_id('downloads', download['downloads_id'], download_update)

        # copy the extracted text from the old download to the newly created one
        db.query(
            """
            insert into download_texts (downloads_id, download_text, download_text_length)
                select %(a)s, dt.download_text, dt.download_text_length
                    from download_texts dt
                    where dt.downloads_id = %(b)s
            """,
            {'a': download['downloads_id'], 'b': old_download['downloads_id']})

    db.query(
        """
        insert into story_sentences (stories_id, sentence_number, sentence, media_id, publish_date, language)
            select %(a)s, sentence_number, sentence, media_id, publish_date, language
                from story_sentences
                where stories_id = %(b)s
        """,
        {'a': story['stories_id'], 'b': old_story['stories_id']})

    return story
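A hypothetical call site, e.g. when a topic story has to be re-homed because its original medium was merged into another one; the wrapper and variable names are illustrative only.

def _rehome_topic_story(db: DatabaseHandler, topic: dict, old_story: dict, new_medium: dict) -> dict:
    """Copy a story into a replacement medium and log the mapping (illustrative wrapper)."""
    new_story = copy_story_to_new_medium(db, topic=topic, old_story=old_story, new_medium=new_medium)
    log.info("copied story %d into story %d in medium %d" %
             (old_story['stories_id'], new_story['stories_id'], new_medium['media_id']))
    return new_story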
Example no. 39
0
def login_with_email_password(db: DatabaseHandler, email: str, password: str, ip_address: str = None) -> CurrentUser:
    """Log in with username and password; raise on unsuccessful login."""

    email = decode_object_from_bytes_if_needed(email)
    password = decode_object_from_bytes_if_needed(password)

    if not (email and password):
        raise McAuthLoginException("Email and password must be defined.")

    # Try-except block because we don't want to reveal the specific reason why the login has failed
    try:

        user = user_info(db=db, email=email)

        # Check if user has tried to log in unsuccessfully before and now is trying
        # again too fast
        if __user_is_trying_to_login_too_soon(db=db, email=email):
            raise McAuthLoginException(
                "User '%s' is trying to log in too soon after the last unsuccessful attempt." % email
            )

        if not password_hash_is_valid(password_hash=user.password_hash(), password=password):
            raise McAuthLoginException("Password for user '%s' is invalid." % email)

    except Exception as ex:
        log.info(
            "Login failed for %(email)s, will delay any successive login attempt for %(delay)d seconds: %(exc)s" % {
                'email': email,
                'delay': __POST_UNSUCCESSFUL_LOGIN_DELAY,
                'exc': str(ex),
            }
        )

        # Set the unsuccessful login timestamp
        # (TIMESTAMP 'now' returns "current transaction's start time", so using LOCALTIMESTAMP instead)
        db.query("""
            UPDATE auth_users
            SET last_unsuccessful_login_attempt = LOCALTIMESTAMP
            WHERE email = %(email)s
        """, {'email': email})

        # It might make sense to time.sleep() here for __POST_UNSUCCESSFUL_LOGIN_DELAY seconds to keep legitimate
        # users from retrying too fast. However, under an actual brute-force attack over many HTTP connections, that
        # approach would pile up processes sitting in time.sleep() and taking up memory.
        #
        # So return the error page ASAP and rely on the fact that a legitimate user is unlikely to re-enter their
        # password before __POST_UNSUCCESSFUL_LOGIN_DELAY seconds have passed.

        # Don't give out a specific reason, so that callers can't probe which
        # user emails are registered
        raise McAuthLoginException("User '%s' was not found or password is incorrect." % email)

    if not user.active():
        raise McAuthLoginException("User with email '%s' is not active." % email)

    # Reset password reset token (if any)
    db.query("""
        UPDATE auth_users
        SET password_reset_token_hash = NULL
        WHERE email = %(email)s
          AND password_reset_token_hash IS NOT NULL
    """, {'email': email})

    if ip_address:
        if not user.api_key_for_ip_address(ip_address):
            db.create(
                table='auth_user_api_keys',
                insert_hash={
                    'auth_users_id': user.user_id(),
                    'ip_address': ip_address,
                })

            # Fetch user again
            user = user_info(db=db, email=email)

            if not user.api_key_for_ip_address(ip_address):
                raise McAuthLoginException("Unable to create per-IP API key for IP %s" % ip_address)

    return user
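A small illustrative wrapper showing the intended error handling around the function; the wrapper itself is not part of the original code.

def handle_login(db: DatabaseHandler, form_email: str, form_password: str, client_ip: str) -> CurrentUser:
    """Log a user in from submitted form values (illustrative wrapper)."""
    try:
        return login_with_email_password(db=db, email=form_email, password=form_password, ip_address=client_ip)
    except McAuthLoginException as ex:
        # The exception message is intentionally generic, so logging and re-raising it does not
        # reveal whether the email exists or only the password was wrong.
        log.info("Login failed for %s: %s" % (form_email, str(ex)))
        raise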
Example no. 40
0
def generate_story(
        db: DatabaseHandler,
        url: str,
        content: str,
        fallback_date: typing.Optional[datetime.datetime] = None) -> dict:
    """Add a new story to the database by guessing metadata using the given url and content.

    This function guesses the medium, feed, title, and date of the story from the url and content.

    Arguments:
    db - db handle
    url - story url
    content - story content
    fallback_date - fallback to this date if the date guesser fails to find a date
    """
    if len(url) < 1:
        raise McTMStoriesException("url must not be an empty string")

    url = url[0:_MAX_URL_LENGTH]

    medium = mediawords.tm.media.guess_medium(db, url)
    feed = get_spider_feed(db, medium)
    spidered_tag = mediawords.tm.media.get_spidered_tag(db)
    title = mediawords.util.parse_html.html_title(content, url,
                                                  _MAX_TITLE_LENGTH)

    story = {
        'url': url,
        'guid': url,
        'media_id': medium['media_id'],
        'title': title,
        'description': ''
    }

    # postgres refuses to insert text values with the null character
    for field in ('url', 'guid', 'title'):
        story[field] = re2.sub('\x00', '', story[field])

    date_guess = guess_date(url, content)
    story['publish_date'] = date_guess.date if date_guess.found else fallback_date
    if story['publish_date'] is None:
        story['publish_date'] = datetime.datetime.now().isoformat()

    try:
        story = db.create('stories', story)
    except mediawords.db.exceptions.handler.McUniqueConstraintException:
        raise McTMStoriesDuplicateException(
            "Attempt to insert duplicate story url %s" % url)
    except Exception:
        raise McTMStoriesException("Error adding story: %s" %
                                   traceback.format_exc())

    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {
            'a': story['stories_id'],
            'b': spidered_tag['tags_id']
        })

    assign_date_guess_tag(db, story, date_guess, fallback_date)

    log.debug("add story: %s; %s; %s; %d" %
              (story['title'], story['url'], story['publish_date'],
               story['stories_id']))

    db.create('feeds_stories_map', {
        'stories_id': story['stories_id'],
        'feeds_id': feed['feeds_id']
    })

    download = create_download_for_new_story(db, story, feed)

    mediawords.dbi.downloads.store_content(db, download, content)

    _extract_story(db, story)

    return story
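A hypothetical call, e.g. from a spider that has just fetched a page. The fallback date is a placeholder, and the duplicate handling mirrors the exception raised above.

def _add_fetched_page(db: DatabaseHandler, url: str, content: str) -> typing.Optional[dict]:
    """Generate a story for fetched HTML, tolerating duplicate URLs (illustrative caller)."""
    try:
        return generate_story(db=db, url=url, content=content,
                              fallback_date=datetime.datetime(2016, 10, 15))
    except McTMStoriesDuplicateException:
        # A story with this URL already exists; a real caller would typically look it up instead.
        return None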
Example no. 41
0
def add_user(db: DatabaseHandler, new_user: NewUser) -> None:
    """Add new user."""

    if not new_user:
        raise McAuthRegisterException("New user is undefined.")

    # Check if user already exists
    user_exists = db.query("""
        SELECT auth_users_id
        FROM auth_users
        WHERE email = %(email)s
        LIMIT 1
    """, {'email': new_user.email()}).hash()

    if user_exists is not None and 'auth_users_id' in user_exists:
        raise McAuthRegisterException("User with email '%s' already exists." % new_user.email())

    # Hash + validate the password
    try:
        password_hash = generate_secure_hash(password=new_user.password())
        if not password_hash:
            raise McAuthRegisterException("Password hash is empty.")
    except Exception as ex:
        log.error("Unable to hash a new password: {}".format(ex))
        raise McAuthRegisterException('Unable to hash a new password.')

    db.begin()

    # Create the user
    db.create(
        table='auth_users',
        insert_hash={
            'email': new_user.email(),
            'password_hash': password_hash,
            'full_name': new_user.full_name(),
            'notes': new_user.notes(),
            'active': bool(int(new_user.active())),
        }
    )

    # Fetch the user's ID
    try:
        user = user_info(db=db, email=new_user.email())
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException("I've attempted to create the user but it doesn't exist: %s" % str(ex))

    # Create roles
    try:
        for auth_roles_id in new_user.role_ids():
            db.create(table='auth_users_roles_map', insert_hash={
                'auth_users_id': user.user_id(),
                'auth_roles_id': auth_roles_id,
            })
    except Exception as ex:
        db.rollback()
        raise McAuthRegisterException("Unable to create roles: %s" % str(ex))

    # Update limits (if they're defined)
    if new_user.weekly_requests_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requests_limit = %(weekly_requests_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'auth_users_id': user.user_id(),
            'weekly_requests_limit': new_user.weekly_requests_limit(),
        })

    if new_user.weekly_requested_items_limit() is not None:
        db.query("""
            UPDATE auth_user_limits
            SET weekly_requested_items_limit = %(weekly_requested_items_limit)s
            WHERE auth_users_id = %(auth_users_id)s
        """, {
            'auth_users_id': user.user_id(),
            'weekly_requested_items_limit': new_user.weekly_requested_items_limit(),
        })

    # Subscribe to newsletter
    if new_user.subscribe_to_newsletter():
        db.create(table='auth_users_subscribe_to_newsletter', insert_hash={'auth_users_id': user.user_id()})

    if not new_user.active():
        send_user_activation_token(
            db=db,
            email=new_user.email(),
            activation_link=new_user.activation_url(),
            subscribe_to_newsletter=new_user.subscribe_to_newsletter(),
        )

    db.commit()
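Because add_user() only updates auth_user_limits when limits are supplied, it can be useful to read them back after registration. A minimal sketch reusing only the helpers seen above; the function name and query are illustrative.

def _weekly_limits_for_email(db: DatabaseHandler, email: str) -> dict:
    """Read back the per-user limits that add_user() may have set (illustrative check)."""
    user = user_info(db=db, email=email)
    return db.query("""
        SELECT weekly_requests_limit, weekly_requested_items_limit
        FROM auth_user_limits
        WHERE auth_users_id = %(auth_users_id)s
    """, {'auth_users_id': user.user_id()}).hash()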