Example #1
def get_session_lock(db: mediawords.db.DatabaseHandler, lock_type: str, lock_id: int, wait: bool = False) -> bool:
    """Get a postgres advisory lock with the lock_type and lock_id as the two keys.

    Arguments:
    db - db handle
    lock_type - must be in LOCK_TYPES dict above
    lock_id - id for the particular lock within the type
    wait - if true, block while waiting for the lock, else return false if the lock is not available

    Returns:
    True if the lock is available
    """
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))

    if isinstance(lock_id, bytes):
        lock_id = decode_object_from_bytes_if_needed(lock_id)
    lock_id = int(lock_id)

    if isinstance(wait, bytes):
        wait = decode_object_from_bytes_if_needed(wait)
    wait = bool(wait)

    log.debug("trying for lock: %s, %d" % (lock_type, lock_id))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    if wait:
        db.query("select pg_advisory_lock(%(a)s, %(b)s)", {'a': lock_type_id, 'b': lock_id})
        return True
    else:
        r = db.query("select pg_try_advisory_lock(%(a)s, %(b)s) as locked", {'a': lock_type_id, 'b': lock_id}).hash()
        return r['locked']
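
For context, a typical call site pairs this helper with release_session_lock (shown in the next example) so the lock is always released. A minimal usage sketch, assuming a hypothetical 'media' entry in LOCK_TYPES:

db = mediawords.db.connect_to_db()

# The 'media' lock type below is an assumption for illustration only.
if get_session_lock(db, 'media', 42, wait=False):
    try:
        pass  # work that must not run concurrently for id 42 goes here
    finally:
        release_session_lock(db, 'media', 42)
else:
    log.debug("lock 42 is busy; skipping")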
Example #2
def release_session_lock(db: mediawords.db.DatabaseHandler, lock_type: str, lock_id: int) -> None:
    """Release the postgres advisory lock if it is held."""
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))

    if isinstance(lock_id, bytes):
        lock_id = decode_object_from_bytes_if_needed(lock_id)
    lock_id = int(lock_id)

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    db.query("select pg_advisory_unlock(%(a)s, %(b)s)", {'a': lock_type_id, 'b': lock_id})
Example #3
def release_session_lock(db: mediawords.db.DatabaseHandler, lock_type: str,
                         lock_id: int) -> None:
    """Release the postgres advisory lock if it is held."""
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))
    lock_id = int(decode_object_from_bytes_if_needed(lock_id))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    db.query("select pg_advisory_unlock(%(a)s, %(b)s)", {
        'a': lock_type_id,
        'b': lock_id
    })
Example #4
def main():
    db = mediawords.db.connect_to_db()

    counts = db.query("""
        select class, count(*) as count, min(last_updated::date) as min_date, max(last_updated::date) as max_date
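            -- the rank() window in the subquery marks the most recent row per (class, media_id)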
            from ( 
                select *,   
                        rank() over ( partition by class, (args->>'media_id')::int order by last_updated desc ) 
                            as media_rank,
                        args->>'media_id' as media_id
                    from job_states ) q 
            where 
                media_rank = 1 and 
                state in( 'queued', 'running') and 
                last_updated < now() - interval '1 day' 
            group by class
            order by class
    """).hashes()

    if len(counts) > 0:
        print("Long Running Jobs:\n")

    for count in counts:
        print("%s: %d (%s - %s)" % (count['class'], count['count'],
                                    count['min_date'], count['max_date']))
Example #5
    def fetch_posts_from_api(
        self,
        query: str,
        start_date: datetime,
        end_date: datetime,
        sample: Optional[int] = None,
        page_size: Optional[int] = None,
    ) -> list:
        """Return posts from a csv that are within the given date range."""
        db = mediawords.db.connect_to_db()

        assert sample is None, "Sampling is not implemented."
        assert page_size is None, "Page size limiting is not supported."

        if self.mock_enabled:
            query = self._insert_mock_data(db, get_mock_data())

        table = query

        if re.search(r'[^a-zA-Z0-9_]', table):
            raise McPostgresGenericDataException(
                f'illegal table name: {table}')

        posts = db.query(
            f"""
            select content, publish_date, author, post_id, channel
                from {table} 
                where publish_date::timestamp between %(a)s and %(b)s
            """, {
                'a': start_date,
                'b': end_date
            }).hashes()

        return posts
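
A call-site sketch for the fetcher above; the class name is an assumption for illustration, and the query argument is used as a table name, as the validation above shows:

from datetime import datetime

fetcher = PostgresPostFetcher()  # hypothetical class name

posts = fetcher.fetch_posts_from_api(
    query='postgres_post_fetcher_test',  # the table name doubles as the "query"
    start_date=datetime(2020, 1, 1),
    end_date=datetime(2020, 2, 1),
)

for post in posts:
    print(post['post_id'], post['publish_date'], post['author'])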
Example #6
def _get_topic_domain(db: DatabaseHandler, topic: dict, domain: str) -> dict:
    """Get a topic_domain."""
    return db.query(
        'select * from topic_domains where topics_id = %(a)s and domain = %(b)s',
        {
            'a': topic['topics_id'],
            'b': domain
        }).hash()
Example #7
def get_session_lock(db: mediawords.db.DatabaseHandler,
                     lock_type: str,
                     lock_id: int,
                     wait: bool = False) -> bool:
    """Get a postgres advisory lock with the lock_type and lock_id as the two keys.

    Arguments:
    db - db handle
    lock_type - must be in LOCK_TYPES dict above
    lock_id - id for the particular lock within the type
    wait - if true, block while waiting for the lock, else return false if the lock is not available

    Returns:
    True if the lock is available
    """
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))

    if isinstance(lock_id, bytes):
        lock_id = decode_object_from_bytes_if_needed(lock_id)
    lock_id = int(lock_id)

    if isinstance(wait, bytes):
        wait = decode_object_from_bytes_if_needed(wait)
    wait = bool(wait)

    log.debug("trying for lock: %s, %d" % (lock_type, lock_id))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    if wait:
        db.query("select pg_advisory_lock(%(a)s, %(b)s)", {
            'a': lock_type_id,
            'b': lock_id
        })
        return True
    else:
        r = db.query("select pg_try_advisory_lock(%(a)s, %(b)s) as locked", {
            'a': lock_type_id,
            'b': lock_id
        }).hash()
        return r['locked']
Example #8
    def _insert_mock_data(self, db: DatabaseHandler, data: list) -> str:
        """Insert the mock data into the test table and return the test table name."""
        table = 'postgres_post_fetcher_test'

        db.query(f"""
            create table {table} (
                id serial primary key,
                content text,
                publish_date text,
                author text,
                channel text,
                post_id text
            )
            """)

        for d in data:
            db.create(table, d)

        return table
Example #9
    def _insert_mock_data(self, db: DatabaseHandler, data: list) -> str:
        """Insert the mock data into the test table and return the test table name."""
        table = 'postgres_post_fetcher_test'

        db.query(f"""
            CREATE TABLE {table} (
                id BIGSERIAL PRIMARY KEY,
                content TEXT NULL,
                publish_date TEXT NULL,
                author TEXT NULL,
                channel TEXT NULL,
                post_id TEXT NULL
            )
            """)

        for d in data:
            db.create(table, d)

        return table
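
For reference, each row passed to db.create() above is a dict keyed by the table's columns. A minimal fixture sketch with illustrative values:

mock_data = [
    {
        'content': 'test post one',
        'publish_date': '2020-01-15',
        'author': 'alice',
        'channel': 'general',
        'post_id': '1',
    },
    {
        'content': 'test post two',
        'publish_date': '2020-01-16',
        'author': 'bob',
        'channel': 'general',
        'post_id': '2',
    },
]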
Example #10
def list_session_locks(db: mediawords.db.DatabaseHandler, lock_type: str) -> list:
    """Return a list of all locked ids for the given lock_type."""
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    return db.query(
        "select objid from pg_locks where locktype = 'advisory' and classid = %(a)s",
        {'a': lock_type_id}).flat()
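
A quick way to exercise this listing helper together with the lock functions from the earlier examples, again assuming a hypothetical 'media' key in LOCK_TYPES:

db = mediawords.db.connect_to_db()

get_session_lock(db, 'media', 1, wait=True)
get_session_lock(db, 'media', 2, wait=True)

# pg_locks is server-wide, so this reports ids held by any session, e.g. [1, 2].
print(list_session_locks(db, 'media'))

release_session_lock(db, 'media', 1)
release_session_lock(db, 'media', 2)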
Example #11
def _id_exists_in_db(db: db.DatabaseHandler, guid: str) -> bool:
    """Internal method to check if item exists in the database."""
    guid_exists = db.query(
        "select 1 from stories s join media m using (media_id) where m.name = 'AP' and s.guid = %(a)s",
        {
            'a': guid
        }).hash()
    if guid_exists:
        logger.debug(
            'Story with guid: {} is already in the database -- skipping story.'.format(guid)
        )
        return True
    return False
Example #12
def list_session_locks(db: mediawords.db.DatabaseHandler,
                       lock_type: str) -> list:
    """Return a list of all locked ids for the given lock_type."""
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    return db.query(
        "select objid from pg_locks where locktype = 'advisory' and classid = %(a)s",
        {
            'a': lock_type_id
        }).flat()
Example #13
def main():
    if len(sys.argv) < 2:
        raise ValueError("usage: dump_topic_maps.py <snapshots_id>")

    snapshots_id = sys.argv[1]

    db = mediawords.db.connect_to_db()

    snapshot = db.require_by_id('snapshots', snapshots_id)

    timespan_maps = db.query(
        """
        select 
                t.*,
                tm.*,
                f.name focus_name, fs.name focal_set_name
            from timespans t
                join timespan_maps tm using ( timespans_id )
                left join foci f using ( foci_id )
                left join focal_sets fs using ( focal_sets_id )
            where t.snapshots_id = %(a)s
        """, {
            'a': snapshots_id
        }).hashes()

    for tm in timespan_maps:
        filename = "%s.%s" % (tm['options'].get('color_by',
                                                'default'), tm['format'])
        directory = '%d/%s/%s/%s/%s' % (
            snapshot['topics_id'],
            tm['focal_set_name'],
            tm['focus_name'],
            tm['period'],
            tm['start_date'][0:10],
        )

        os.makedirs(directory, exist_ok=True)

        full_path = "%s/%s" % (directory, filename)

        log.warning("writing %s..." % full_path)

        with open(full_path, 'wb') as f:
            f.write(tm['content'])
Example #14
def main():
    db = mediawords.db.connect_to_db()

    counts = db.query("""
        select class, count(*) as count, min(last_updated::date) as min_date, max(last_updated::date) as max_date
            from ( 
                select *,   
                        rank() over ( partition by class, (args->>'media_id')::int order by last_updated desc ) 
                            as media_rank,
                        args->>'media_id' as media_id
                    from job_states ) q 
            where 
                media_rank = 1 and 
                state in( 'queued', 'running') and 
                last_updated < now() - interval '1 day' 
            group by class
            order by class
    """).hashes()

    if len(counts) > 0:
        print("Long Running Jobs:\n")

    for count in counts:
        print("%s: %d (%s - %s)" % (count['class'], count['count'], count['min_date'], count['max_date']))
Example #15
def main():
    limit = sys.argv[1] if len(sys.argv) > 1 else 10000000

    db = mediawords.db.connect_to_db()

    media = db.query(
        """
        select m.*, mh.*
            from media m
                join media_health mh using ( media_id ) 
            where
                dup_media_id is null
            order by m.media_id asc limit %(a)s
    """, {
            'a': limit
        }).hashes()

    media_groups = {}

    num_media = len(media)
    for i, medium in enumerate(media):
        domain = mediawords.util.url.get_url_distinctive_domain(medium['url'])
        log.warning("%s [%d/%d]" % (domain, i, num_media))

        if domain not in media_groups:
            media_groups[domain] = []

        media_groups[domain].append(medium)

        medium['medium_domain'] = domain
        medium['dup_domain_matches'] = True

        dup_media = db.query(
            "select m.*, mh.* from media m join media_health mh using ( media_id ) where dup_media_id = %(a)s",
            {
                'a': medium['media_id']
            }).hashes()

        media_groups[domain].extend(dup_media)

        for dup_medium in dup_media:
            dup_domain = mediawords.util.url.get_url_distinctive_domain(
                dup_medium['url'])
            medium['medium_domain'] = dup_domain
            medium['dup_domain_matches'] = domain == dup_domain

    db.query("drop table if exists media_dups")
    db.query("""
        create table media_dups (
            domain text,
            media_id int
            )
        """)

    fieldnames = \
        'domain media_id dup_media_id name url medium_domain dup_domain_matches num_stories num_sentences'.split()

    db.begin()
    media_dups_rows = []
    for i, domain in enumerate(media_groups.keys()):
        log.warning("domain %s [%d/%d]" %
                    (domain, i, len(media_groups.keys())))
        media = media_groups[domain]
        if len(media) > 1:
            for m in media:
                db.query(
                    """
                    insert into media_dups (domain, media_id) values (%(a)s, %(b)s)
                    """, {
                        'a': domain,
                        'b': m['media_id']
                    })
    db.commit()
Example #16
def main():
    db = mediawords.db.connect_to_db()

    waiting_topics = db.query("""
        with 

        topic_jobs as ( 
            select *, ( args->>'topics_id' )::bigint topics_id 
                from job_states 
                where 
                    class like '%Topic%' and 
                    coalesce( message, '' ) not like '%is already running%'
        ),

        ranked_jobs as ( select *, rank() over ( partition by topics_id order by last_updated desc ) from topic_jobs )

        select 
                t.name, j.topics_id, j.state, j.class, j.last_updated, j.job_states_id,
                coalesce( j.message, '' ) as  message, 
                j.args->>'snapshots_id' snapshots_id,
                now() n, rank r
            from ranked_jobs j
                join topics t on ( t.topics_id = ( j.args->>'topics_id' )::bigint )
            where 
                last_updated < now() - '1 day'::interval and 
                rank = 1 and 
                j.state not in ( 'completed' ) and
                coalesce( j.message, '' ) not like 'canceled%' and
                coalesce( j.message, '' ) not like '%exceeds topic max stories%' and
                coalesce( j.message, '' ) not like '%eed_query returned more than%' and
                last_updated > now() - interval '180 days'
                
            order by topics_id desc
        """).hashes()

    log.info("waiting topics: %d" % len(waiting_topics))

    hung_topics = filter(lambda x: x['state'] != 'error', waiting_topics)

    for topic in hung_topics:
        print(
            f"queueing topic: {topic['topics_id']}: {topic['name']} - {topic['state']} {topic['last_updated']}"
        )
        topics_id = topic['topics_id']
        snapshots_id = topic['snapshots_id']
        queue_job(topic, snapshots_id)

    errored_topics = filter(lambda x: x['state'] == 'error', waiting_topics)

    for topic in errored_topics:
        topics_id = topic['topics_id']
        snapshots_id = topic['snapshots_id']

        print(
            f"{topics_id}: {topic['name']} - {topic['state']} {topic['last_updated']}"
        )
        print(topic['snapshots_id'])
        print(f"\t{topic['message'][0:100]}")
        print(
            f"\thttps://topics.mediacloud.org/#/topics/{topics_id}/summary\n")

        while True:
            action = input(
                '(r)equeue, (d)elete fetch errors, (c)ancel, (i)gnore, or (f)ull message? '
            )
            if action == 'r':
                # requeue a spider job for the topic
                print('requeueing...')
                queue_job(topic, snapshots_id)
                break
            elif action == 'd':
                # delete all topic_fetch_url python errors -- do this if we know the cause of the errors
                # and want the topic to succeed any way rather than triggering a 'fetch error rate ... is greater' err
                print('deleting topic_fetch_url errors...')

                db.query(
                    "DELETE FROM topic_fetch_urls WHERE topics_id = %(a)s AND state = 'python error'",
                    {'a': topics_id})

            elif action == 'c':
                # prepend the 'canceled: ' string to the start of the error message so that topic job will be
                # ignored by future runs of this script
                print('canceling...')
                db.update_by_id('job_states', topic['job_states_id'],
                                {'message': f'canceled: {topic["message"]}'})
                break
            elif action == 'i':
                # ignore this topic for this run only
                print('ignoring...')
                break
            elif action == 'f':
                jobs = db.query(
                    """
                    SELECT * 
                        FROM job_states 
                        WHERE class LIKE '%Topic%' AND args->>'topics_id' = %(a)s::TEXT
                        ORDER BY job_states_id
                    """, {
                        'a': topics_id
                    }).hashes()
                for job in jobs:
                    print(f"{job['last_updated']} {job['class']}\n{job['message']}\n****")
Example #17
def main():
    limit = sys.argv[1] if len(sys.argv) > 1 else 10000000

    db = mediawords.db.connect_to_db()

    media = db.query("""
        select m.*, mh.*
            from media m
                join media_health mh using ( media_id ) 
            where
                dup_media_id is null
            order by m.media_id asc limit %(a)s
    """,
    {'a': limit}).hashes()

    media_groups = {}

    num_media = len(media)
    for i, medium in enumerate(media):
        domain = mediawords.util.url.get_url_distinctive_domain(medium['url']) 
        log.warning("%s [%d/%d]" % (domain, i, num_media))

        if domain not in media_groups:
            media_groups[domain] = []

        media_groups[domain].append(medium)

        medium['medium_domain'] = domain
        medium['dup_domain_matches'] = True

        dup_media = db.query(
            "select m.*, mh.* from media m join media_health mh using ( media_id ) where dup_media_id = %(a)s",
            {'a': medium['media_id']}
        ).hashes()

        media_groups[domain].extend(dup_media)

        for dup_medium in dup_media:
            dup_domain = mediawords.util.url.get_url_distinctive_domain(dup_medium['url'])
            medium['medium_domain'] = dup_domain
            medium['dup_domain_matches'] = domain == dup_domain

    db.query("drop table if exists media_dups");
    db.query(
        """
        create table media_dups (
            domain text,
            media_id int
            )
        """)

    fieldnames = \
        'domain media_id dup_media_id name url medium_domain dup_domain_matches num_stories num_sentences'.split()

    db.begin()
    media_dups_rows = []
    for i, domain in enumerate(media_groups.keys()):
        log.warning("domain %s [%d/%d]" %(domain, i, len(media_groups.keys())))
        media = media_groups[domain]
        if len(media) > 1:
            for m in media:
                db.query(
                    """
                    insert into media_dups (domain, media_id) values (%(a)s, %(b)s)
                    """,
                    {'a': domain, 'b': m['media_id']})
    db.commit()
Example #18
def _get_topic_domain(db: DatabaseHandler, topic: dict, domain: str) -> dict:
    """Get a topic_domain."""
    return db.query(
        'select * from topic_domains where topics_id = %(a)s and domain = %(b)s',
        {'a': topic['topics_id'], 'b': domain}).hash()