def migrate_isotopic_images(ds_id):
    """Transfer a dataset's isotopic images and reindex its ES documents.

    The image IDs are looked up with ``SEL_DS_IMG_IDS``. Nothing further is
    done when the dataset has no image IDs or when ``_es_docs_migrated``
    reports that its ElasticSearch documents are already migrated.

    Args:
        ds_id: ID of the dataset to migrate.
    """
    output.print('Migrating isotopic images')

    database = DB()
    iso_image_ids = database.select_onecol(SEL_DS_IMG_IDS, params=(ds_id,))
    exporter = ESExporter(database, sm_config)

    # Guard clause: bail out early when there is nothing to move, or when the
    # ES side already reflects a completed migration.
    if not iso_image_ids or _es_docs_migrated(exporter._es, ds_id):
        return

    # Each phase runs inside its own timeit() context (presumably so that the
    # elapsed time is reported per phase -- confirm against timeit's definition).
    with timeit():
        output.print('Transferring images...')
        output.print(len(iso_image_ids))
        transfer_images(ds_id, 'iso_images', image_storage.ISO, iso_image_ids)

    with timeit():
        output.print('Reindexing ES documents...')
        exporter.reindex_ds(ds_id)
def del_jobs(ds: Dataset, moldb_ids: Optional[Iterable[int]] = None):
    """
    Delete a dataset's jobs for the specified moldbs, or all jobs if moldb_ids is None.
    Also cleans up the annotations from ElasticSearch and deletes their ion images.

    Args:
        ds: Dataset whose jobs are deleted; its ``id`` and ``name`` are read.
        moldb_ids: IDs of the molecular databases whose jobs should be deleted.
            Any iterable is accepted, including one-shot iterators. When None,
            the IDs returned by ``get_ds_moldb_ids(ds.id)`` are used.
    """
    db = DB()
    es = ESExporter(db)

    if moldb_ids is None:
        moldb_ids = get_ds_moldb_ids(ds.id)
    # Materialise once: the IDs are consumed twice below (moldb lookup and job
    # query). A generator or other one-shot iterable would be exhausted by the
    # first use, leaving the job query with an empty ID list and the
    # diagnostics undeleted.
    moldb_ids = list(moldb_ids)
    moldbs = molecular_db.find_by_ids(moldb_ids)

    job_ids = db.select_onecol(
        'SELECT j.id FROM job j WHERE ds_id = %s AND moldb_id = ANY(%s)',
        (ds.id, moldb_ids),
    )
    # Diagnostics are removed first, while the job IDs can still be looked up;
    # the job rows themselves are deleted per moldb in the loop below.
    del_diagnostics(ds.id, job_ids)

    for moldb in moldbs:
        logger.info(
            f'Deleting isotopic images: ds_id={ds.id} ds_name={ds.name} moldb={moldb}'
        )
        # One row per annotation; each row is a collection of image IDs.
        img_id_rows = db.select_onecol(
            'SELECT iso_image_ids '
            'FROM annotation m '
            'JOIN job j ON j.id = m.job_id '
            'JOIN dataset d ON d.id = j.ds_id '
            'WHERE ds_id = %s AND j.moldb_id = %s',
            (ds.id, moldb.id),
        )
        # Flatten the per-annotation collections, skipping None placeholders.
        image_ids = [
            img_id
            for img_ids in img_id_rows
            for img_id in img_ids
            if img_id is not None
        ]
        image_storage.delete_images(image_storage.ISO, ds.id, image_ids)

        logger.info(
            f"Deleting job results: ds_id={ds.id} ds_name={ds.name} moldb={moldb}"
        )
        db.alter('DELETE FROM job WHERE ds_id = %s and moldb_id = %s', (ds.id, moldb.id))
        es.delete_ds(ds.id, moldb)
def find_dataset_ids(ds_ids_param, sql_where, missing, failed, succeeded):
    """Resolve the IDs of the datasets to process.

    Datasets can be selected explicitly (``ds_ids_param`` or ``sql_where``)
    and/or by the state of their diagnostics (``missing``, ``failed``,
    ``succeeded``). When both kinds of selector are given, the explicit
    selection is filtered by diagnostic state. When no selector at all is
    given, ``missing`` is assumed.

    Args:
        ds_ids_param: Comma-separated dataset IDs, or a falsy value. Takes
            precedence over ``sql_where``. Whitespace around each ID and
            empty entries are ignored.
        sql_where: SQL condition placed after ``SELECT id FROM dataset WHERE``,
            or a falsy value. It is interpolated verbatim and must come from a
            trusted source.
        missing: Select FINISHED datasets that have fewer distinct diagnostic
            types than ``len(DiagnosticType)``.
        failed: Select FINISHED datasets with at least one diagnostic error.
        succeeded: Select FINISHED datasets that have every diagnostic type
            and no errors.

    Returns:
        A non-empty sequence of dataset IDs. Explicitly specified IDs keep
        their given order; otherwise IDs are sorted in descending order.

    Raises:
        AssertionError: If no dataset matches.
    """
    db = DB()

    if ds_ids_param:
        # Tolerate "a, b" and stray commas: an ID with surrounding whitespace
        # would never match a real dataset, and '' is not a dataset ID.
        specified_ds_ids = [
            ds_id.strip() for ds_id in ds_ids_param.split(',') if ds_id.strip()
        ]
    elif sql_where:
        # NOTE(security): sql_where is spliced into the statement as-is; it
        # cannot be bound as a query parameter, so callers must not pass
        # untrusted input here.
        specified_ds_ids = db.select_onecol(f"SELECT id FROM dataset WHERE {sql_where}")
    else:
        specified_ds_ids = None

    if not missing:
        # Default to processing all datasets missing diagnostics
        missing = specified_ds_ids is None and not failed and not succeeded

    if missing or failed or succeeded:
        # Get ds_ids based on status (or filter specified ds_ids on status).
        # The aggregate query is only issued here because it is not needed
        # when no status filter is active.
        ds_type_counts = db.select(
            'SELECT d.id, COUNT(DISTINCT dd.type), COUNT(dd.error) '
            'FROM dataset d LEFT JOIN dataset_diagnostic dd on d.id = dd.ds_id '
            "WHERE d.status = 'FINISHED' "
            'GROUP BY d.id'
        )
        n_types = len(DiagnosticType)
        status_ds_ids = set()
        for ds_id, n_diagnostics, n_errors in ds_type_counts:
            if missing and (n_diagnostics or 0) < n_types:
                status_ds_ids.add(ds_id)
            elif failed and n_errors > 0:
                status_ds_ids.add(ds_id)
            elif succeeded and n_diagnostics == n_types and n_errors == 0:
                status_ds_ids.add(ds_id)

        if specified_ds_ids is not None:
            # Keep order, if directly specified
            ds_ids = [ds_id for ds_id in specified_ds_ids if ds_id in status_ds_ids]
        else:
            # Order by ID descending, so that newer DSs are updated first
            ds_ids = sorted(status_ds_ids, reverse=True)
    else:
        ds_ids = specified_ds_ids

    if not ds_ids:
        # Raised explicitly instead of using `assert`, which is removed under
        # `python -O`; the exception type and message are kept for callers.
        raise AssertionError('No datasets found')
    return ds_ids