def get_data(url, hash_result=True):
    """
    Extract data from the global hash given a request object.  If an item
    is successfully recovered data is returned

    Parameters
    ~~~~~~~~~~
    url : str
        Request url from which a RequestMeta object is built.
    hash_result : bool
        Passed through to build_key_signature.

    Returns the deserialized cached data on success, otherwise None.
    """
    request_obj = build_request_obj(url)
    hash_table_ref = read_pickle_data()

    # Traverse the hash key structure to find data
    # @TODO rather than iterate through REQUEST_META_BASE &
    #   REQUEST_META_QUERY_STR look only at existing attributes

    logging.debug(__name__ + " - Attempting to pull data for request "
                             "COHORT {0}, METRIC {1}".
                  format(request_obj.cohort_expr, request_obj.metric))

    key_sig = build_key_signature(request_obj, hash_result=hash_result)
    item = find_item(hash_table_ref, key_sig)

    if not item:
        return None

    # item[0] will be a stringified structure that
    # is initialized, see set_data.
    #
    # SECURITY NOTE: eval on cached data is only acceptable while the
    # cache is written exclusively by set_data; never eval externally
    # supplied data.
    try:
        return eval(item[0])
    except (SyntaxError, NameError, TypeError, ValueError):
        # BUG FIX: the original trapped only SyntaxError; a corrupt cache
        # entry can also raise NameError/TypeError/ValueError from eval.
        logging.error(__name__ + ' :: Failed to retrieve {0}'.
                      format(key_sig))
        return None
def set_password(self, password):
    """
    Hash the given password and store it on this user.

    On failure the stored hash is cleared to None so that no stale
    hash can authenticate.
    """
    try:
        cleaned = escape(unicode(password))
        self.pw_hash = generate_password_hash(cleaned)
    except (TypeError, NameError) as e:
        # e.g. the password could not be coerced to unicode
        logging.error(__name__ + ' :: Hash set error - ' + e.message)
        self.pw_hash = None
def build_key_signature(request_meta, hash_result=False):
    """ Given a RequestMeta object contruct a hashkey.

        Parameters
        ~~~~~~~~~~
        request_meta : RequestMeta
            Stores request data.

        Returns the list of "name<delim>value" entries, its sha1 hex
        digest when ``hash_result`` is set, or '' when a mandatory
        field is missing.
    """
    signature = []

    # Mandatory fields -- bail out with an empty signature if any is unset
    for name in REQUEST_META_BASE:
        value = getattr(request_meta, name)
        if not value:
            logging.error(__name__ + ' :: Request must include %s. '
                                     'Cannot set data %s.' %
                          (name, str(request_meta)))
            return ''
        signature.append(name + HASH_KEY_DELIMETER + value)

    # Optional fields -- included only when present and truthy
    for name in REQUEST_META_QUERY_STR:
        value = getattr(request_meta, name, None)
        if value:
            signature.append(name + HASH_KEY_DELIMETER + str(value))

    if hash_result:
        return sha1(str(signature).encode('utf-8')).hexdigest()
    return signature
def format_namespace(namespace, col='page_namespace'):
    """
    Format the namespace condition in queries and returns the string.
    Expects a list of numeric namespace keys.  Otherwise returns
    an empty condition string.

        ** THIS METHOD ONLY EMITS SQL SAFE STRINGS **

    Parameters
    ~~~~~~~~~~
    namespace : iterable|int
        Namespace key(s) to constrain on.
    col : str
        Column name the condition applies to.
    """
    ns_cond = ''

    # Copy so as not to affect mutable ref
    namespace = deepcopy(namespace)

    if hasattr(namespace, '__iter__'):
        if len(namespace) == 1:
            ns_cond = '{0} = '.format(col) \
                      + escape_var(str(namespace.pop()))
        else:
            ns_cond = '{0} in ('.format(col) + \
                      ",".join(DataLoader()
                               .cast_elems_to_string(
                                   escape_var(list(namespace)))) + ')'
    else:
        try:
            # BUG FIX: cast to str before escaping/concatenating, mirroring
            # the single-element branch above.  The original concatenated a
            # str with escape_var(int(...)), a TypeError for scalar input.
            ns_cond = '{0} = '.format(col) + escape_var(str(int(namespace)))
        except ValueError:
            # No namespace condition
            logging.error(__name__ + ' :: Could not apply namespace '
                                     'condition on {0}'.format(str(namespace)))
    return ns_cond
def _get_revisions(args):
    """ Retrieve total set of revision records for users within timeframe

        Parameters
        ~~~~~~~~~~
        args : tuple
            (users, packed metric state) as supplied by the pool mapper.

        Returns a list of revision records, or [] when the query fails.
    """
    um.log_pool_worker_start(__name__, _get_revisions.__name__,
                             args[0], args[1])

    users = args[0]
    state = args[1]

    metric_params = um.UserMetric._unpack_params(state)
    query_args_type = namedtuple('QueryArgs', 'date_start date_end namespace')

    # Accumulate revisions over each user period
    revs = list()
    umpd_obj = UMP_MAP[metric_params.group](users, metric_params)
    try:
        for t in umpd_obj:
            revs += \
                list(query_mod.rev_query(t.user, metric_params.project,
                                         query_args_type(
                                             t.start, t.end,
                                             metric_params.namespace)))
    except query_mod.UMQueryCallError as e:
        logging.error('{0}:: {1}. PID={2}'.format(__name__,
                                                  e.message,
                                                  os.getpid()))
        return []

    # BUG FIX: log the end of *this* worker; the original referenced
    # _process_help.__name__ (copy-paste error).
    um.log_pool_worker_end(__name__, _get_revisions.__name__)
    return revs
def get_cohort_refresh_datetime(utm_id):
    """
    Get the latest refresh datetime of a cohort.  Returns current time
    formatted as a string if the field is not found.

    Parameters
    ~~~~~~~~~~
    utm_id : int
        usertags_meta id of the cohort.
    """
    # @TODO MOVE DB REFS INTO QUERY MODULE
    conn = dl.Connector(instance=settings.__cohort_data_instance__)
    query = """
            SELECT utm_touched
            FROM usertags_meta
            WHERE utm_id = %s
        """
    # DB-API executes with a parameter *sequence*
    conn._cur_.execute(query, (int(utm_id),))

    utm_touched = None
    try:
        utm_touched = conn._cur_.fetchone()[0]
    except (TypeError, ValueError):
        # BUG FIX: fetchone() returns None on an empty result set, so
        # subscripting raises TypeError -- ValueError alone never fired.
        pass

    # Ensure the field was retrieved
    if not utm_touched:
        logging.error(__name__ + '::Missing utm_touched for cohort %s.' %
                      str(utm_id))
        utm_touched = datetime.now()

    del conn
    return utm_touched.strftime(DATETIME_STR_FORMAT)
def _process_help(args):
    """ Used by Threshold::process() for forking.  Should not be called
        externally.

        args : tuple
            (users, state) where ``state`` is the packed parameter list
            used to rebuild a RevertRateArgsClass.

        Returns a list of [user, revert_rate, total_revisions] rows;
        users whose revision query fails are dropped from the result.
    """

    state = args[1]
    # NOTE(review): state[5] is skipped here -- confirm the field
    # ordering against RevertRateArgsClass before changing.
    thread_args = RevertRateArgsClass(state[0], state[1], state[2],
                                      state[3], state[4], state[6],
                                      state[7], state[8])
    users = args[0]

    if thread_args.log_progress:
        logging.info(__name__ + ' :: Computing reverts on %s users (PID %s)'
                                % (len(users), str(os.getpid())))
    results_agg = list()
    dropped_users = 0

    umpd_obj = UMP_MAP[thread_args.group](users, thread_args)
    for user_data in umpd_obj:

        total_revisions = 0.0
        total_reverts = 0.0

        # Call query on revert rate for each user
        #
        # 1. Obtain user registration date
        # 2. Compute end date based on 't'
        # 3. Get user revisions in time period
        query_args = namedtuple('QueryArgs', 'date_start date_end')\
            (format_mediawiki_timestamp(user_data.start),
             format_mediawiki_timestamp(user_data.end))

        try:
            revisions = query_mod.\
                revert_rate_user_revs_query(user_data.user,
                                            thread_args.project,
                                            query_args)
        except query_mod.UMQueryCallError as e:
            # Skip this user but keep processing the rest
            logging.error(__name__ + ' :: Failed to '
                                     'get revisions: {0}'.format(e.message))
            dropped_users += 1
            continue

        # Fan the per-revision revert checks out over a thread pool
        results_thread = mpw.build_thread_pool(revisions, _revision_proc,
                                               thread_args.rev_threads,
                                               state)

        for r in results_thread:
            total_revisions += r[0]
            total_reverts += r[1]
        # Guard against divide-by-zero for users with no revisions
        if not total_revisions:
            results_agg.append([user_data.user, 0.0, total_revisions])
        else:
            results_agg.append([user_data.user,
                                total_reverts / total_revisions,
                                total_revisions])

    if thread_args.log_progress:
        logging.debug(__name__ + ' :: PID {0} complete. Dropped users = {1}'.
                      format(str(os.getpid()), dropped_users))

    return results_agg
def all_urls(): """ View for listing all requests. Retrieves from cache """ # @TODO - this reads the entire cache into memory, filters will be needed # This extracts ALL data from the cache, the data is assumed to be in the # form of <hash key -> (data, key signature)> pairs. The key signature is # extracted to reconstruct the url. all_data = read_pickle_data() key_sigs = list() for key, val in all_data.iteritems(): if hasattr(val, '__iter__'): try: key_sigs.append(val[1]) except (KeyError, IndexError): logging.error(__name__ + ' :: Could not render key signature ' 'from data, key = {0}'.format(key)) # Compose urls from key sigs url_list = list() for key_sig in key_sigs: url = get_url_from_keys(key_sig, 'cohorts') url_list.append("".join(['<a href="', request.url_root, url + '">', url, '</a>'])) return render_template('all_urls.html', urls=url_list)
def __init__(self, **kwargs):
    """
    Initialize a TimeToThreshold metric.

    Selects the threshold-type handler named by ``threshold_type_class``;
    falls back to EditCountThreshold when the name is unrecognized.
    """
    super(TimeToThreshold, self).__init__(**kwargs)

    try:
        self._threshold_obj_ = \
            self.__threshold_types[self.threshold_type_class](**kwargs)
    except (KeyError, NameError):
        # BUG FIX: an unknown threshold_type_class raises KeyError from
        # the dict lookup; catching NameError alone never triggered the
        # fallback.
        logging.error(__name__ + "::Invalid threshold class. "
                                 "Using default (EditCountThreshold).")
        self._threshold_obj_ = self.EditCountThreshold(**kwargs)
def teardown():
    """
    When the instance is deleted store the pickled data and shutdown
    the job controller
    """
    # Shutdown API handlers gracefully
    try:
        for proc in (job_controller_proc, response_controller_proc):
            terminate_process_with_checks(proc)
    except Exception:
        logging.error(__name__ + ' :: Could not shut down callbacks.')
def req_cb_get_is_running(key, lock):
    """
    Request callback -- ask the notification controller whether the job
    keyed by ``key`` is still running.

    Parameters
    ~~~~~~~~~~
    key : str
        Key signature of the request.
    lock : Lock
        Guards the notification queues.

    Returns False when no response arrives within BLOCK_TIMEOUT.
    """
    lock.acquire()
    try:
        # Message type 2 -> "is running?" query
        req_notification_queue_in.put([2, key], True)
        try:
            val = req_notification_queue_out.get(block=True,
                                                 timeout=BLOCK_TIMEOUT)[0]
        except Empty:
            logging.error(__name__ + " :: req_cb_get_is_running -"
                                     " Block time expired.")
            return False
    finally:
        # BUG FIX: the original returned on timeout without releasing the
        # lock, deadlocking every later caller.
        lock.release()
    return val
def req_cb_get_cache_keys(lock):
    """
    Request callback -- fetch the list of cache keys from the
    notification controller.

    Parameters
    ~~~~~~~~~~
    lock : Lock
        Guards the notification queues.

    Returns [] when no response arrives within BLOCK_TIMEOUT.
    """
    lock.acquire()
    try:
        # Message type 3 -> "cache keys" query
        req_notification_queue_in.put([3], block=True)
        try:
            val = req_notification_queue_out.get(block=True,
                                                 timeout=BLOCK_TIMEOUT)
        except Empty:
            logging.error(__name__ + " :: req_cb_get_cache_keys -"
                                     " Block time expired.")
            return []
    finally:
        # BUG FIX: the original returned on timeout without releasing the
        # lock, deadlocking every later caller.
        lock.release()
    return val
def process_responses(response_queue, msg_in):
    """ Pulls responses off of the queue.

        Long-lived worker loop: each iteration reads one request's meta
        record followed by its data chunks, deserializes the data, flags
        the job complete, and persists the result via set_data.

        msg_in is unused in this body -- presumably part of a common
        worker signature; confirm against the process launcher.
    """
    log_name = '{0} :: {1}'.format(__name__, process_responses.__name__)
    logging.debug(log_name + ' - STARTING...')

    while 1:
        stream = ''

        # Block on the response queue until the next request arrives
        try:
            res = response_queue.get(True)
            request_meta = rebuild_unpacked_request(res)
        except Exception:
            logging.error(log_name + ' - Could not get request meta')
            continue

        # Accumulate data chunks; a 1-second quiet period on the queue is
        # taken to mean the response stream is complete
        data = response_queue.get(True)
        while data:
            stream += data
            try:
                data = response_queue.get(True, timeout=1)
            except Empty:
                break

        # NOTE(review): eval of queue data -- safe only while producers
        # are trusted in-process workers
        try:
            data = eval(stream)
        except Exception as e:

            # Report a fraction of the failed response data directly in the
            # logger
            if len(unicode(stream)) > 2000:
                excerpt = stream[:1000] + ' ... ' + stream[-1000:]
            else:
                excerpt = stream

            logging.error(log_name + ' - Request failed. {0}\n\n'
                                     'data excerpt: {1}'.format(e.message,
                                                                excerpt))

            # Format a response that will report on the failed request
            stream = "OrderedDict([('status', 'Request failed.'), " \
                     "('exception', '" + escape(unicode(e.message)) + "')," \
                     "('request', '" + escape(unicode(request_meta)) + "'), " \
                     "('data', '" + escape(unicode(stream)) + "')])"

        key_sig = build_key_signature(request_meta, hash_result=True)

        # Set request in list to "not alive"
        req_cb_flag_job_complete(key_sig, REQ_NCB_LOCK)

        logging.debug(log_name + ' - Setting data for {0}'.format(
            str(request_meta)))
        set_data(stream, request_meta)

    # NOTE(review): unreachable -- the loop above has no break
    logging.debug(log_name + ' - SHUTTING DOWN...')
def req_cb_get_url(key, lock):
    """
    Request callback -- fetch the url for the job keyed by ``key`` from
    the notification controller.

    Parameters
    ~~~~~~~~~~
    key : str
        Key signature of the request.
    lock : Lock
        Guards the notification queues.

    Returns '' when no response arrives within BLOCK_TIMEOUT.
    """
    lock.acquire()
    try:
        # Message type 4 -> "get url" query
        req_notification_queue_in.put([4, key], block=True)
        try:
            val = req_notification_queue_out.get(True,
                                                 timeout=BLOCK_TIMEOUT)[0]
        except Empty:
            logging.error(__name__ + ' :: req_cb_get_url -'
                                     ' Block time expired.')
            return ''
    finally:
        # BUG FIX: the original returned on timeout without releasing the
        # lock, deadlocking every later caller.
        lock.release()
    return val
def __future(rev_id, page_id, n, project, namespace):
    """ Produce the n revisions on a page after a given revision

        Returns a generator of revision objects
    """
    try:
        return query_mod.page_rev_hist_query(rev_id, page_id, n,
                                             project, namespace,
                                             look_ahead=True)
    except query_mod.UMQueryCallError as e:
        logging.error(__name__ + ' :: Failed to '
                                 'get revision future: {0}'.format(e.message))
        return list()
def check_password(self, password):
    """
    Validate a candidate password against the stored hash.

    Returns False when no hash is stored or when hashing/comparison
    fails.
    """
    # Guard: no stored hash means nothing can match
    if not self.pw_hash:
        return False
    try:
        candidate = escape(unicode(password))
        return check_password_hash(self.pw_hash, candidate)
    except (TypeError, NameError) as e:
        logging.error(__name__ + ' :: Hash check error - ' + e.message)
        return False
def __history(rev_id, page_id, n, project, namespace):
    """ Produce the n revisions on a page before a given revision

        Returns a generator of revision objects
    """
    try:
        return query_mod.page_rev_hist_query(rev_id, page_id, n,
                                             project, namespace,
                                             look_ahead=False)
    except query_mod.UMQueryCallError as e:
        logging.error(__name__ + ' :: Failed to '
                                 'get revision history: {0}'.format(e.message))
        return list()
def format_condition_in(self, field_name, item_list, include_quotes=False):
    """ Formats a SQL "in" condition

        Returns "<field> in (<csv>)" for an iterable item_list, or ''
        (with an error logged) otherwise.
    """
    # Guard: reject non-iterable input up front
    if not hasattr(item_list, '__iter__'):
        logging.error(__name__ + '::format_condition_in - '
                                 'item_list must implement the '
                                 'iterable interface.')
        return ''

    elems = self.cast_elems_to_string(item_list)
    csv = self.format_comma_separated_list(elems,
                                           include_quotes=include_quotes)
    return "%s in (%s)" % (field_name, csv)
def is_valid_cohort_query(cohort_name):
    """
    Check a cohort handle against usertags_meta.

    Returns True when no existing cohort carries ``cohort_name`` --
    presumably validating a name for a *new* cohort; confirm against
    callers.  Returns False when the name is taken or the fetch fails.
    """
    conn = Connector(instance=conf.__cohort_data_instance__)
    query = query_store[is_valid_cohort_query.__name__]
    # Substitute the configured db/table tokens into the stored query
    query = sub_tokens(query, db=conf.__cohort_meta_instance__,
                       table=conf.__cohort_meta_db__)
    params = {'utm_name' : cohort_name}
    conn._cur_.execute(query, params)
    try:
        cohorts = conn._cur_.fetchall()
    except (OperationalError, ProgrammingError) as e:
        logging.error(__name__ + ' :: Query failed: {0}, params = {1}'.
                      format(query, str(params)))
        return False
    return len(cohorts) == 0
def get_column_names(self):
    """
    Return the column names from the connection cursor
    (latest executed query)

        Return:
            - List(string).  Column names from latest query results.
    """
    try:
        cols = self._cur_.description
    except AttributeError:
        # No query has populated a description on this cursor yet
        logging.error(__name__ + ' :: No column description for this '
                                 'connection.')
        return []
    return [col[0] for col in cols]
def register_user(self):
    """ Writes the user credentials to the datastore.

        1. Only users not already registered
        2. Ensure that the user is unique
        3. Write the user / pass to the db
    """
    if not self.active:
        if not query_mod.get_api_user(self.name, by_id=False):
            query_mod.insert_api_user(self.name, self.pw_hash)
            logging.debug(__name__ + ' :: Added user {0}'.
                          format(self.name))
        else:
            # BUG FIX: restored the ' :: ' separator missing from the
            # original log message, for consistency with the module.
            logging.error(__name__ + ' :: Could not add user {0}'.
                          format(self.name))
        self.active = True
def is_valid_username_query(username, project):
    """
    Look up ``username`` in the given project's database.

    Returns the matched id when exactly one row is found, otherwise
    None.  Re-raises on fetch failure after logging.
    """
    conn = Connector(instance=conf.PROJECT_DB_MAP[project])
    query = query_store[is_valid_username_query.__name__]
    # Substitute the (escaped) project database name into the stored query
    query = sub_tokens(query, db=escape_var(project))
    params = {'username' : username}
    conn._cur_.execute(query, params)
    try:
        ids = conn._cur_.fetchall()
    except (OperationalError, ProgrammingError) as e:
        logging.error(__name__ + ' :: Query failed: {0}, params = {1}'.
                      format(query, str(params)))
        raise
    if len(ids) == 1:
        return ids[0][0]
    else:
        return None
def get_users(cohort_expr):
    """ get users from cohort

        Parameters
        ~~~~~~~~~~
        cohort_expr : str
            Cohort tag name or cohort expression.

        Returns the list of users, or [] when the cohort cannot be
        resolved.
    """
    # Expression vs. plain tag name is decided by regex match
    if search(COHORT_REGEX, cohort_expr):
        logging.info(__name__ + ' :: Processing cohort by expression.')
        users = [user for user in parse_cohorts(cohort_expr)]
    else:
        logging.info(__name__ + ' :: Processing cohort by tag name.')
        try:
            # FIX: renamed local 'id' which shadowed the builtin
            cohort_id = query_mod.get_cohort_id(cohort_expr)
            users = [u for u in query_mod.get_cohort_users(cohort_id)]
        except (IndexError, TypeError, query_mod.UMQueryCallError) as e:
            logging.error(__name__ + ' :: Could not retrieve users '
                                     'for cohort {0}: {1}'.
                          format(cohort_expr, str(e)))
            return []
    return users
def get_all_items(self, target):
    """ Retrieve all values in the target

        Parameters
        ~~~~~~~~~~
        target : str
            Path of the JSON-lines file to read.

        Returns a list of the parsed items; unparseable lines are
        skipped with an error logged.  When ``target`` is missing it is
        created empty and [] is returned.
    """
    all_keys = list()
    try:
        with open(target, 'r') as f:
            lines = f.read().split('\n')
        # FIX: dropped the unused enumerate() index from the original loop
        for line in lines:
            try:
                item = json.loads(line)
            except Exception:
                logging.error(__name__ + ' :: Could not parse JSON '
                                         'from: {0}'.format(line))
                continue
            all_keys.append(item)
    except IOError:
        # Target does not exist yet -- create it empty
        with open(target, 'w'):
            pass
    return all_keys
def wrapper(users, project, args):
    """
    Decorator inner: build the SQL via the wrapped query builder ``f``
    and execute it for ``users`` on ``project``.

    Returns the result rows, or [] for an unknown project.  Raises
    UMQueryCallError on connection or execution failure.
    """
    # ensure the handles are iterable
    if not hasattr(users, '__iter__'):
        users = [users]

    # get query and call
    if hasattr(args, 'log') and args.log:
        logging.debug(__name__ + ':: calling "%(method)s" '
                                 'in "%(project)s".' %
                      {
                          'method': f.__name__,
                          'project': project
                      })

    # Call query escaping user and project variables for SQL injection
    query = f(escape_var(users), escape_var(project), args)

    try:
        conn = Connector(instance=conf.PROJECT_DB_MAP[project])
    except KeyError:
        # Project not present in the DB map
        logging.error(__name__ + ' :: Project does not exist.')
        return []
    except ConnectorError:
        logging.error(__name__ + ' :: Could not establish a connection.')
        raise UMQueryCallError('Could not establish a connection.')

    try:
        conn._cur_.execute(query)
    except ProgrammingError:
        # NOTE(review): message lacks the usual ' :: ' separator
        logging.error(__name__ +
                      'Could not get edit counts - Query failed.')
        raise UMQueryCallError()

    results = [row for row in conn._cur_]
    del conn
    return results
def list_to_xsv(self, nested_list, separator='\t', log=False,
                outfile='list_to_xsv.out'):
    """
    Transforms a nested list to an xsv file.

        Parameters:
            - **nested_list** - List(List()). Nested list to insert to xsv.
            - **separator**: String. The separating character in the
                file.  Default to tab.
            - **log**: Boolean.  Log failed line writes.
            - **outfile**: String.  Output filename under the data dir.
    """
    try:
        file_obj = open(projSet.__data_file_dir__ + outfile, 'w')
    except IOError as e:
        logging.error(__name__ + ' :: Could not open '
                                 'xsv for writing: %s' % e.message)
        return

    # FIX: close the handle even when an untrapped exception escapes a
    # write -- the original could leak the open file.
    try:
        if hasattr(nested_list, '__iter__'):
            for elem in nested_list:
                new_elems = self.cast_elems_to_string(elem)
                line_in = separator.join(new_elems) + '\n'
                try:
                    file_obj.write(line_in)
                except IOError:
                    if log:
                        logging.error('Could not write: "%s"' %
                                      str(line_in.strip()))
        else:
            logging.error('Expected an iterable to write to file.')
    finally:
        file_obj.close()
def get_cohort_id(utm_name):
    """ Pull cohort ids from cohort handles

        Parameters
        ~~~~~~~~~~
        utm_name : str
            Cohort handle (usertags_meta name).

        Returns the utm_id, or -1 when the cohort is not found.
    """
    # @TODO MOVE DB REFS INTO QUERY MODULE
    conn = dl.Connector(instance='slave')
    conn._cur_.execute('SELECT utm_id FROM usertags_meta '
                       'WHERE utm_name = "%s"' % str(escape(utm_name)))

    utm_id = None
    try:
        utm_id = conn._cur_.fetchone()[0]
    except (TypeError, ValueError):
        # BUG FIX: fetchone() yields None on an empty result set, so
        # subscripting raises TypeError -- ValueError alone never matched.
        pass

    # Ensure the field was retrieved
    if not utm_id:
        logging.error(__name__ + '::Missing utm_id for cohort %s.' %
                      str(utm_name))
        utm_id = -1

    del conn
    return utm_id
def _map_request_values(request_meta):
    """
    Map values from the request.  Use ``REQUEST_VALUE_MAPPING`` convert
    coded values from the request if a familiar encoding is present.

    Parameters
    ~~~~~~~~~~
    request_meta : recordtype:
        Stores the request data.
    """
    for attr, mapping in REQUEST_VALUE_MAPPING.items():
        # Only attributes the request actually carries are mapped
        if not hasattr(request_meta, attr):
            continue

        raw_value = getattr(request_meta, attr)
        try:
            setattr(request_meta, attr, mapping[raw_value])
        except KeyError:
            # No familiar encoding for this value -- leave it untouched
            logging.error(__name__ + ' :: Could not map request value '
                                     '{0} for variable {1}.'.
                          format(str(raw_value), attr))
def _process_help(args):
    """ Worker thread method for NamespaceOfEdits::process().

        args : tuple
            (users, state) where ``state`` packs the metric parameters.

        Returns a list of (user, {namespace: edit_count}) pairs covering
        NamespaceEdits.VALID_NAMESPACES for each user.
    """
    users = args[0]
    state = args[1]

    metric_params = um.UserMetric._unpack_params(state)
    query_args_type = namedtuple('QueryArgs', 'start end')

    if metric_params.log_:
        logging.info(__name__ + '::Computing namespace edits. (PID = %s)'
                                % getpid())

    # Tally counts of namespace edits
    results = dict()
    ump_res = UMP_MAP[metric_params.group](users, metric_params)
    for ump_rec in ump_res:
        # Zero-fill every valid namespace so absent counts read as 0
        results[str(ump_rec.user)] = OrderedDict()
        for ns in NamespaceEdits.VALID_NAMESPACES:
            results[str(ump_rec.user)][str(ns)] = 0

        query_results = query_mod.namespace_edits_rev_query(
            [ump_rec.user],
            metric_params.project,
            query_args_type(ump_rec.start, ump_rec.end))

        # Rows are presumably (user, namespace, count) -- confirm
        # against namespace_edits_rev_query
        for row in query_results:
            try:
                if row[1] in NamespaceEdits.VALID_NAMESPACES:
                    results[str(row[0])][str(row[1])] = int(row[2])
            except (KeyError, IndexError):
                logging.error(__name__ + "::Could not process row: %s"
                                         % str(row))
                continue

    return [(user, results[user]) for user in results]
def pop(self, target):
    """ Pop the top value from the list

        Parameters
        ~~~~~~~~~~
        target : str
            Path of the JSON-lines file to pop from.

        Returns the value stored under the first line's key and removes
        that entry via ``self.remove``; returns None when the file is
        missing, empty, or the first line cannot be parsed.
    """
    try:
        with open(target, 'r') as f:
            contents = f.read()
        if contents:
            lines = contents.split('\n')
            if len(lines):
                try:
                    item = json.loads(lines[0])
                    # FIX: list(...) works on both py2 and py3 dict views,
                    # and IndexError is now trapped so an empty JSON
                    # object ('{}') no longer propagates.
                    key = list(item.keys())[0]
                except (KeyError, ValueError, IndexError):
                    logging.error(__name__ + ' :: FileBroker.pop - '
                                             'Could not parse key.')
                    return None

                self.remove(target, key)
                return item[key]
    except IOError:
        # Target does not exist yet -- create it empty
        with open(target, 'w'):
            pass
    return None