def frontfill_determine_needs_run_from_remote_fill_dt(self):
    """Decide whether a newer remote dump exists, and stage it if so.

    Scans HUMANIKI_DUMP_DIR for valid dump files, compares their dates against
    the latest local fill, and sets ``self.working_fill_date`` to the earliest
    remote date strictly newer than the local one.  Does nothing if
    ``self.working_fill_date`` is already set (e.g. by an override).  No value
    is returned; the result is communicated via ``self.working_fill_date``.
    """
    if self.working_fill_date is None:  # that is it wasn't set yet, maybe by override
        try:
            latest_local_fill_id, latest_local_fill_date = get_latest_fill_id(
                self.db_session)
        except sqlalchemy.orm.exc.NoResultFound:
            # in the case this is the very first run
            latest_local_fill_date = datetime(2012, 1, 1).date()  # when wikidata first started.
        wd_dir_raw = os.listdir(os.environ['HUMANIKI_DUMP_DIR'])
        # filter out broken links
        wd_dir_ls = [os.path.join(os.environ['HUMANIKI_DUMP_DIR'], p)
                     for p in wd_dir_raw]
        # NOTE(review): os.readlink raises OSError for entries that are not
        # symlinks — this assumes everything in the dump dir is a symlink; verify.
        wd_dir_ls_links = [os.path.join(os.environ['HUMANIKI_DUMP_DIR'], os.readlink(l))
                           for l in wd_dir_ls]
        # keep only link targets that actually exist (drops dangling symlinks)
        wd_dir_ls_exists = [f for f in wd_dir_ls_links if os.path.exists(f)]
        # make sure the file is like YYYYMMDD.json.gz
        wd_dir_ls_exists_correct = [f for f in wd_dir_ls_exists
                                    if is_wikimedia_cloud_dump_format(f)]
        log.info(
            f'Existing and correct dump files found were {wd_dir_ls_exists_correct}')
        # parse the numeric YYYYMMDD part of each filename into a date
        wd_dir_dts = [make_dump_date_from_str(
            numeric_part_of_filename(dt_s, basenameittoo=True))
            for dt_s in wd_dir_ls_exists_correct]
        # remote dumps strictly newer than the newest local fill
        remote_later_than_local = [fd for fd in wd_dir_dts
                                   if fd > latest_local_fill_date]
        if remote_later_than_local:
            log.info(
                f"Lastest local was {latest_local_fill_date}, and {len(remote_later_than_local)} remote dts later")
            remote_infimum_date = min(remote_later_than_local)
            self.working_fill_date = remote_infimum_date  # select the remote fill date that's earliest but still greater than local
        else:
            log.info(
                f"Lastest local was {latest_local_fill_date}, and nothing later from {len(wd_dir_dts)} remote dts")
def __init__(self, config, db_session=None, fill_date=None):
    """Initialize generation state: config, DB session, and the fill to use.

    :param config: path/name of the config file handed to read_config_file
    :param db_session: existing DB session to reuse, or None to open a new one
    :param fill_date: date string pinning an exact fill; None selects the latest
    """
    self.config = read_config_file(config, __file__)
    self.config_generation = self.config['generation']
    # Reuse the caller's session when provided, otherwise create a fresh one.
    self.db_session = db_session or session_factory()
    if fill_date is not None:
        # A specific date was requested: resolve it to that exact fill.
        requested_dt = make_dump_date_from_str(fill_date)
        self.curr_fill, self.curr_fill_date = get_exact_fill_id(
            self.db_session, requested_dt)
    else:
        # No date requested: default to the most recent fill on record.
        self.curr_fill, self.curr_fill_date = get_latest_fill_id(
            self.db_session)
    # These are populated later by the generation pipeline.
    self.metric_combinations = None
    self.metric_creator = None
    self.metric_job = None
    self.pid = os.getpid()
def insert_or_skip(config, session):
    """Insert example fixture data unless the config asks to skip it.

    :param config: parsed config dict; reads ``test.skip_insert`` and
        ``generation.example.{datadir,fills,len}``
    :param session: DB session used to count metrics / look up the latest fill
    :return: the current fill id (newly inserted, or the latest existing one)
    """
    # dict.get replaces the previous `'k' in d ... d['k'] ... else False`
    # pattern: one lookup instead of two, same default behavior.
    skip_insert = config['test'].get('skip_insert', False)
    if not skip_insert:
        example_conf = config['generation']['example']
        curr_fill_id = insert_data(data_dir=example_conf['datadir'],
                                   num_fills=example_conf['fills'],
                                   example_len=example_conf['len'])
        metrics_count = session.query(func.count(metric.fill_id)).scalar()
        print(f'number of metrics: {metrics_count}')
        # we want no metrics, a clean slate if we are inserting
        assert metrics_count == 0
    else:
        # we'll still need the curr_fill otherwise; the fill date is unused here
        curr_fill_id, _curr_fill_dt = get_latest_fill_id(session)
    return curr_fill_id
def gap(bias, snapshot, population):
    """Flask view: return gap metrics for a bias/snapshot/population triple.

    Accumulates per-stage errors in ``errors`` and, if any occurred, returns
    ``{"errors": {...}}``; otherwise returns ``{"meta": {...}, "metrics": [...]}``.

    :param bias: bias property as a string (resolved via get_pid_from_str)
    :param snapshot: requested snapshot date (or a keyword the helpers correct)
    :param population: requested population (may be corrected by helpers)
    """
    latest_fill_id, latest_fill_date = get_latest_fill_id(session)
    return_warnings = {}
    errors = {}
    query_params = request.values
    # If a client explicitly asks an error to be sent back.
    if "error_test" in query_params.keys():
        errors['test'] = repr(ValueError('sending you back an a value error'))
        errors['test_another'] = repr(
            ValueError('simulating what mutliple errors would look like'))
        return jsonify(errors=errors)
    try:
        # TODO include validating bias
        valid_request = assert_gap_request_valid(snapshot, population,
                                                 query_params)
    except AssertionError as ae:
        errors['validation'] = repr(ae)  # in this case fail immediately
        return jsonify(errors=errors)
    # handle snapshot: map the requested snapshot onto a concrete fill
    requested_fill_id, requested_fill_date, snapshot_corrected = determine_fill_id(
        session, snapshot, latest_fill_id, latest_fill_date)
    # print(f"Fills {requested_fill_id} {requested_fill_date}")
    if snapshot_corrected:
        return_warnings['snapshot_corrected to'] = requested_fill_date
    # handle populations
    population_id, population_name, population_corrected = determine_population_conflict(
        population, query_params)
    if population_corrected:
        return_warnings['population_corrected to'] = population_name
    # order query params by property pid
    ordered_query_params, non_orderable_query_params = order_query_params(
        query_params)
    # get properties-id
    try:
        bias_property = get_pid_from_str(bias)
        ordered_properties = ordered_query_params.keys()
        properties_id = get_properties_obj(
            session=session,
            dimension_properties=ordered_properties,
            bias_property=bias_property)
        # properties_id = get_properties_id(session, ordered_properties, bias_property=bias_property)
    except ValueError as ve:
        errors['properties_id'] = repr(ve)
        log.exception(errors)
    # get coverage
    # NOTE(review): if get_properties_obj raised above, properties_id is
    # unbound and this line raises NameError instead of returning the
    # accumulated errors — confirm whether that path can occur in practice.
    coverage = get_coverage(session=session,
                            population_id=population_id,
                            properties_id=properties_id.id,
                            fill_id=requested_fill_id)
    # get aggregations-id
    try:
        aggregations_id_preds = get_aggregations_id_preds(
            session, ordered_query_params,
            non_orderable_query_params, as_subquery=True)
    except ValueError as ve:
        errors['aggregations_id_preds'] = repr(ve)
        log.exception(errors)
    # get metric
    try:
        # default the label lang to 'en' if not set
        label_lang = non_orderable_query_params[
            'label_lang'] if 'label_lang' in non_orderable_query_params else None
        # NOTE(review): if get_aggregations_id_preds raised above,
        # aggregations_id_preds is unbound here and this raises NameError.
        metrics, represented_biases = build_metrics(
            session,
            fill_id=requested_fill_id,
            population_id=population_id,
            properties_id=properties_id,
            aggregations_id=aggregations_id_preds,
            label_lang=label_lang)
    except ValueError as ve:
        errors['metrics'] = repr(ve)
    # there are errors return those.
    if errors:
        return jsonify(errors=errors)
    meta = {
        'snapshot': str(requested_fill_date),
        'population': population_name,
        'population_corrected': population_corrected,
        'label_lang': label_lang,
        'bias': bias,
        'bias_property': bias_property,
        'aggregation_properties':
            [Properties(p).name for p in properties_id.properties],
        'coverage': coverage,
    }
    if represented_biases:
        meta['bias_labels'] = represented_biases
    full_response = {'meta': meta, 'metrics': metrics}
    return jsonify(**full_response)
get_metrics_count, get_all_snapshot_dates, get_coverage
from humaniki_backend.utils import determine_population_conflict, assert_gap_request_valid, \
    order_query_params, get_pid_from_str, determine_fill_id, is_property_exclusively_citizenship
from humaniki_schema.queries import get_properties_obj, get_latest_fill_id
from humaniki_schema.utils import Properties, make_fill_dt
from humaniki_schema.log import get_logger

log = get_logger(BASE_DIR=__file__)

# Flask application setup: CORS-enabled app with a request-scoped DB session.
app = Flask(__name__)
CORS(app)
session = flask_scoped_session(session_factory, app)

# Note this requires updating or the process restarting after a new fill.
# TODO: have this be a function that can be called and updated.
latest_fill_id, latest_fill_date = get_latest_fill_id(session)
app.latest_fill_id = latest_fill_id


@app.route("/")
def home():
    """Health/landing route: report the fill this process was started with."""
    log.info('home route called')
    return jsonify(latest_fill_id, latest_fill_date)


@app.route("/v1/available_snapshots/")
def available_snapshots():
    """List every snapshot date available in the database."""
    # (sic: "snaphot" typo in the local name; kept as-is)
    all_snaphot_dates = get_all_snapshot_dates(session)
    return jsonify(all_snaphot_dates)