def get_session_lock(db: mediawords.db.DatabaseHandler, lock_type: str, lock_id: int, wait: bool = False) -> bool:
    """Get a postgres advisory lock with the lock_type and lock_id as the two keys.

    Arguments:
    db - db handle
    lock_type - must be in LOCK_TYPES dict above
    lock_id - id for the particular lock within the type
    wait - if true, block while waiting for the lock, else return false if the lock is not available

    Returns:
    True if the lock is available
    """
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))
    if isinstance(lock_id, bytes):
        lock_id = decode_object_from_bytes_if_needed(lock_id)
    lock_id = int(lock_id)

    if isinstance(wait, bytes):
        wait = decode_object_from_bytes_if_needed(wait)
    wait = bool(wait)

    log.debug("trying for lock: %s, %d" % (lock_type, lock_id))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    if wait:
        db.query("select pg_advisory_lock(%(a)s, %(b)s)", {'a': lock_type_id, 'b': lock_id})
        return True
    else:
        r = db.query("select pg_try_advisory_lock(%(a)s, %(b)s) as locked", {'a': lock_type_id, 'b': lock_id}).hash()
        return r['locked']

def release_session_lock(db: mediawords.db.DatabaseHandler, lock_type: str, lock_id: int) -> None:
    """Release the postgres advisory lock if it is held."""
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))
    if isinstance(lock_id, bytes):
        lock_id = decode_object_from_bytes_if_needed(lock_id)
    lock_id = int(lock_id)

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    db.query("select pg_advisory_unlock(%(a)s, %(b)s)", {'a': lock_type_id, 'b': lock_id})

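# A minimal usage sketch for the advisory lock pair above, assuming a 'test-a'
# key exists in the module-level LOCK_TYPES dict (the key name here is
# hypothetical). Advisory locks persist for the life of the database session,
# so the release call sits in a finally block.
def _example_session_lock():
    db = mediawords.db.connect_to_db()

    # non-blocking attempt: returns False immediately if another session holds the lock
    if not get_session_lock(db, 'test-a', 1):
        log.info("lock is busy; skipping")
        return

    try:
        pass  # ... work that must not run concurrently for this lock id ...
    finally:
        release_session_lock(db, 'test-a', 1)
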
def main():
    db = mediawords.db.connect_to_db()

    counts = db.query("""
        select
            class,
            count(*) as count,
            min(last_updated::date) as min_date,
            max(last_updated::date) as max_date
        from (
            select
                *,
                rank() over (
                    partition by class, (args->>'media_id')::int
                    order by last_updated desc
                ) as media_rank,
                args->>'media_id' as media_id
            from job_states
        ) q
        where
            media_rank = 1 and
            state in ('queued', 'running') and
            last_updated < now() - interval '1 day'
        group by class
        order by class
    """).hashes()

    if len(counts) > 0:
        print("Long Running Jobs:\n")

    for count in counts:
        print("%s: %d (%s - %s)" % (count['class'], count['count'], count['min_date'], count['max_date']))

def fetch_posts_from_api(
        self,
        query: str,
        start_date: datetime,
        end_date: datetime,
        sample: Optional[int] = None,
        page_size: Optional[int] = None,
) -> list:
    """Return posts from a postgres table that are within the given date range."""
    db = mediawords.db.connect_to_db()

    assert sample is None, "Sampling is not implemented."
    assert page_size is None, "Page size limiting is not supported."

    if self.mock_enabled:
        query = self._insert_mock_data(db, get_mock_data())

    table = query

    # the table name is interpolated directly into the sql below, so restrict it
    # to a plain identifier to avoid sql injection
    if re.search(r'[^a-zA-Z0-9_]', table):
        raise McPostgresGenericDataException(f'illegal table name: {table}')

    posts = db.query(
        f"""
        select content, publish_date, author, post_id, channel
        from {table}
        where publish_date::timestamp between %(a)s and %(b)s
        """,
        {'a': start_date, 'b': end_date}).hashes()

    return posts

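# A hedged usage sketch for fetch_posts_from_api(). The PostgresPostFetcher
# class name is hypothetical -- whichever class defines the method above would
# be instantiated the same way. Note that 'query' is a validated table name
# here, not a search string.
def _example_fetch_posts():
    fetcher = PostgresPostFetcher()
    posts = fetcher.fetch_posts_from_api(
        query='example_posts_table',
        start_date=datetime(2023, 1, 1),
        end_date=datetime(2023, 1, 31))
    for post in posts:
        print("%s: %s" % (post['post_id'], post['publish_date']))
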
def _get_topic_domain(db: DatabaseHandler, topic: dict, domain: str) -> dict:
    """Get a topic_domain."""
    return db.query(
        "select * from topic_domains where topics_id = %(a)s and domain = %(b)s",
        {'a': topic['topics_id'], 'b': domain}).hash()

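# A minimal sketch of _get_topic_domain() in use; the topic dict and domain are
# illustrative values only. hash() yields a single row dict, or a falsy result
# when no topic_domains row matches (the same pattern _id_exists_in_db below
# relies on).
def _example_get_topic_domain(db: DatabaseHandler):
    topic = {'topics_id': 1}  # hypothetical topic row
    topic_domain = _get_topic_domain(db, topic, 'example.com')
    if topic_domain:
        print(topic_domain['domain'])
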
def _insert_mock_data(self, db: DatabaseHandler, data: list) -> str:
    """Insert the mock data into the test table and return the test table name."""
    table = 'postgres_post_fetcher_test'

    db.query(f"""
        create table {table} (
            id serial primary key,
            content text,
            publish_date text,
            author text,
            channel text,
            post_id text
        )
    """)

    for d in data:
        db.create(table, d)

    return table

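# A hedged sketch of the row shape _insert_mock_data() expects: each element of
# 'data' is a dict keyed by column name, inserted as one row via db.create().
# The values below are illustrative, not taken from get_mock_data().
_example_mock_rows = [
    {
        'content': 'example post content',
        'publish_date': '2023-01-15',
        'author': 'example_author',
        'channel': 'example_channel',
        'post_id': '1',
    },
]
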
def list_session_locks(db: mediawords.db.DatabaseHandler, lock_type: str) -> list:
    """Return a list of all locked ids for the given lock_type."""
    lock_type = str(decode_object_from_bytes_if_needed(lock_type))

    if lock_type not in LOCK_TYPES:
        raise McDBLocksException("lock type not in LOCK_TYPES: %s" % lock_type)

    lock_type_id = LOCK_TYPES[lock_type]

    return db.query(
        "select objid from pg_locks where locktype = 'advisory' and classid = %(a)s",
        {'a': lock_type_id}).flat()

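# A short monitoring sketch for list_session_locks(), reusing the hypothetical
# 'test-a' lock type from the earlier example. pg_locks reports the second
# advisory key as objid, so the returned list holds the locked lock_ids.
def _example_list_locks():
    db = mediawords.db.connect_to_db()
    get_session_lock(db, 'test-a', 42)
    locked_ids = list_session_locks(db, 'test-a')
    log.info("locked ids for test-a: %s" % locked_ids)
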
def _id_exists_in_db(db: db.DatabaseHandler, guid: str) -> bool:
    """Internal method to check if item exists in the database."""
    guid_exists = db.query(
        "select 1 from stories s join media m using (media_id) where m.name = 'AP' and s.guid = %(a)s",
        {'a': guid}).hash()

    if guid_exists:
        logger.debug('Story with guid: {} is already in the database -- skipping story.'.format(guid))
        return True

    return False

def main():
    if len(sys.argv) < 2:
        raise ValueError("usage: dump_topic_maps.py <snapshots_id>")

    snapshots_id = sys.argv[1]

    db = mediawords.db.connect_to_db()

    snapshot = db.require_by_id('snapshots', snapshots_id)

    timespan_maps = db.query(
        """
        select t.*, tm.*, f.name focus_name, fs.name focal_set_name
        from timespans t
            join timespan_maps tm using ( timespans_id )
            left join foci f using ( foci_id )
            left join focal_sets fs using ( focal_sets_id )
        where t.snapshots_id = %(a)s
        """,
        {'a': snapshots_id}).hashes()

    for tm in timespan_maps:
        filename = "%s.%s" % (tm['options'].get('color_by', 'default'), tm['format'])
        directory = '%d/%s/%s/%s/%s' % (
            snapshot['topics_id'],
            tm['focal_set_name'],
            tm['focus_name'],
            tm['period'],
            tm['start_date'][0:10],
        )
        os.makedirs(directory, exist_ok=True)

        full_path = "%s/%s" % (directory, filename)

        log.warning("writing %s..." % full_path)

        # write the map content and close the handle promptly
        with open(full_path, 'wb') as f:
            f.write(tm['content'])

def main():
    limit = sys.argv[1] if len(sys.argv) > 1 else 10000000

    db = mediawords.db.connect_to_db()

    media = db.query(
        """
        select m.*, mh.*
        from media m
            join media_health mh using ( media_id )
        where dup_media_id is null
        order by m.media_id asc
        limit %(a)s
        """,
        {'a': limit}).hashes()

    media_groups = {}
    num_media = len(media)
    for i, medium in enumerate(media):
        domain = mediawords.util.url.get_url_distinctive_domain(medium['url'])
        log.warning("%s [%d/%d]" % (domain, i, num_media))

        if domain not in media_groups:
            media_groups[domain] = []

        media_groups[domain].append(medium)

        medium['medium_domain'] = domain
        medium['dup_domain_matches'] = True

        dup_media = db.query(
            "select m.*, mh.* from media m join media_health mh using ( media_id ) where dup_media_id = %(a)s",
            {'a': medium['media_id']}).hashes()

        media_groups[domain].extend(dup_media)

        for dup_medium in dup_media:
            dup_domain = mediawords.util.url.get_url_distinctive_domain(dup_medium['url'])
            # annotate each duplicate medium (not the parent) with its own domain info
            dup_medium['medium_domain'] = dup_domain
            dup_medium['dup_domain_matches'] = domain == dup_domain

    db.query("drop table if exists media_dups")
    db.query(
        """
        create table media_dups (
            domain text,
            media_id int
        )
        """)

    db.begin()
    for i, domain in enumerate(media_groups.keys()):
        log.warning("domain %s [%d/%d]" % (domain, i, len(media_groups.keys())))
        media = media_groups[domain]
        if len(media) > 1:
            for m in media:
                db.query(
                    """
                    insert into media_dups (domain, media_id)
                    values (%(a)s, %(b)s)
                    """,
                    {'a': domain, 'b': m['media_id']})
    db.commit()

def main():
    db = mediawords.db.connect_to_db()

    waiting_topics = db.query("""
        with topic_jobs as (
            select *, ( args->>'topics_id' )::bigint topics_id
            from job_states
            where
                class like '%Topic%' and
                coalesce( message, '' ) not like '%is already running%'
        ),

        ranked_jobs as (
            select *, rank() over ( partition by topics_id order by last_updated desc )
            from topic_jobs
        )

        select
            t.name, j.topics_id, j.state, j.class, j.last_updated, j.job_states_id,
            coalesce( j.message, '' ) as message,
            j.args->>'snapshots_id' snapshots_id,
            now() n,
            rank r
        from ranked_jobs j
            join topics t on ( t.topics_id = ( j.args->>'topics_id' )::bigint )
        where
            last_updated < now() - '1 day'::interval and
            rank = 1 and
            j.state not in ( 'completed' ) and
            coalesce( j.message, '' ) not like 'canceled%' and
            coalesce( j.message, '' ) not like '%exceeds topic max stories%' and
            coalesce( j.message, '' ) not like '%eed_query returned more than%' and
            last_updated > now() - interval '180 days'
        order by topics_id desc
    """).hashes()

    log.info("waiting topics: %d" % len(waiting_topics))

    hung_topics = filter(lambda x: x['state'] != 'error', waiting_topics)
    for topic in hung_topics:
        print(f"queueing topic: {topic['topics_id']}: {topic['name']} - {topic['state']} {topic['last_updated']}")
        topics_id = topic['topics_id']
        snapshots_id = topic['snapshots_id']
        queue_job(topic, snapshots_id)

    errored_topics = filter(lambda x: x['state'] == 'error', waiting_topics)
    for topic in errored_topics:
        topics_id = topic['topics_id']
        snapshots_id = topic['snapshots_id']

        print(f"{topics_id}: {topic['name']} - {topic['state']} {topic['last_updated']}")
        print(topic['snapshots_id'])
        print(f"\t{topic['message'][0:100]}")
        print(f"\thttps://topics.mediacloud.org/#/topics/{topics_id}/summary\n")

        while True:
            action = input('(r)equeue, (d)elete fetch errors, (c)ancel, (i)gnore, or (f)ull message? ')
            if action == 'r':
                # requeue a spider job for the topic
                print('requeueing...')
                queue_job(topic, snapshots_id)
                break
            elif action == 'd':
                # delete all topic_fetch_url python errors -- do this if we know the cause of the errors
                # and want the topic to succeed anyway rather than triggering a
                # 'fetch error rate ... is greater' error
                print('deleting topic_fetch_url errors...')
                db.query(
                    "DELETE FROM topic_fetch_urls WHERE topics_id = %(a)s AND state = 'python error'",
                    {'a': topics_id})
            elif action == 'c':
                # prepend the 'canceled: ' string to the start of the error message so that the topic job
                # will be ignored by future runs of this script
                print('canceling...')
                db.update_by_id(
                    'job_states',
                    topic['job_states_id'],
                    {'message': f'canceled: {topic["message"]}'})
                break
            elif action == 'i':
                # ignore this topic for this run only
                print('ignoring...')
                break
            elif action == 'f':
                jobs = db.query(
                    """
                    SELECT *
                    FROM job_states
                    WHERE class LIKE '%Topic%' AND args->>'topics_id' = %(a)s::TEXT
                    ORDER BY job_states_id
                    """,
                    {'a': topics_id}).hashes()
                for job in jobs:
                    print(f"{job['last_updated']} {job['class']}\n{job['message']}\n****")
