def main():
    global metricsRegistry
    metricsRegistry = MetricsRegistry()
    # timer = metricsRegistry.timer("post_calls")
    # print(str(metricsRegistry._get_timer_metrics("post_calls")))
    loop = uvloop.new_event_loop()
    asyncio.set_event_loop(loop)
    app = Application()
    app.listen(options.port)
    loop.run_forever()
def setup_registry(self, registry=None):
    """ Setup the Metric Reporter with the given registry(s) """
    if not registry:
        self.registry = [MetricsRegistry()]
    else:
        if not isinstance(registry, (tuple, list)):
            registry = [registry]
        self.registry = []
        for reg in registry:
            if not isinstance(reg, (MetricsRegistry, RegexRegistry)):
                raise GoblinMetricsException(
                    "%s is not an instance of pyformance.MetricsRegistry"
                    " or pyformance.RegexRegistry" % reg)
            self.registry.append(reg)
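# Usage sketch -- `reporter` is a hypothetical instance of the enclosing
# reporter class; setup_registry normalizes all three call shapes to a list:
reporter.setup_registry()                                        # one fresh MetricsRegistry
reporter.setup_registry(MetricsRegistry())                       # single registry, wrapped
reporter.setup_registry([MetricsRegistry(), MetricsRegistry()])  # list kept as-is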
def fetch_docs(base_path, es_host, es_index, es_query=None, limit=-1):
    exp_name = "fetch_docs"
    exp_path = f"{base_path}/{exp_name}"
    os.makedirs(exp_path, exist_ok=True)
    run = mlflow.start_run(experiment_id=get_or_create_experiment_id(exp_name))
    docs_path = f"{exp_path}/{run.info.run_uuid}"

    registry = MetricsRegistry()
    mlflow_reporter = MlflowReporter(registry=registry, active_run=run,
                                     reporting_interval=10)
    mlflow_reporter.start()
    influx_reporter = InfluxReporter(registry=registry, reporting_interval=10,
                                     autocreate_database=True)
    influx_reporter.start()

    try:
        mlflow.log_param("docs_path", docs_path)
        mlflow.log_param("es_host", es_host)
        mlflow.log_param("es_index", es_index)
        mlflow.log_param("es_query", es_query)
        _write_docs(
            _get_docs_scrolled(registry, es_host, es_index, es_query, limit),
            docs_path)
        influx_reporter.report_now()
        mlflow_reporter.report_now()
        mlflow.end_run()
    except Exception:
        mlflow.end_run("FAILED")
        raise
    finally:
        influx_reporter.stop()
        mlflow_reporter.stop()
    return run
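# Hypothetical invocation -- the paths, host, index, and query below are
# assumptions for illustration, not values from the source:
run = fetch_docs("/data/experiments", "http://localhost:9200", "documents",
                 es_query={"query": {"match_all": {}}}, limit=10000)
print(f"fetched docs for run {run.info.run_uuid}")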
def reset_metrics():
    pyformance.set_global_registry(MetricsRegistry())
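# Sketch of the intent, assuming pyformance's module-level helpers: metrics
# accumulate in the process-wide global registry, so e.g. a test suite can
# call reset_metrics() to start each test from a clean slate.
from pyformance import counter, dump_metrics

counter("jobs").inc()
assert dump_metrics()["jobs"]["count"] == 1
reset_metrics()
assert "jobs" not in dump_metrics()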
class RequestHandler(Resource):
    # isLeaf indicates whether the resource has child nodes;
    # True means this resource handles all request paths itself.
    isLeaf = True

    # Run the server, then go to: http://localhost:8001/?world_size=n
    # n being a number of your choice.
    def render_GET(self, request):
        metricsRegistry.counter("hello_called").inc()
        world_size = request.args["world_size"][0]
        histogram = metricsRegistry.histogram("world_size")
        histogram.add(int(world_size))
        request.setResponseCode(200)
        return str(metricsRegistry._get_histogram_metrics("world_size"))


if __name__ == '__main__':
    # Load up twisted web
    metricsRegistry = MetricsRegistry()
    try:
        resource = RequestHandler()
        factory = Site(resource)
        reactor.listenTCP(8001, factory)
        reactor.run()
    except Exception as e:
        print(e)
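# Client-side sketch (assumes the twisted server above is running locally;
# the requests library is not part of the original snippet):
import requests

resp = requests.get('http://localhost:8001/', params={'world_size': 42})
print(resp.text)  # the dumped "world_size" histogram metrics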
class DedupManager(object):
    """
    a wrapper on RecordDeduper:
    - memoizes doctors list
    - handles variations in field weightings and thresholds
    - handles stats reporting
    """

    # field -> [score weight, comparison method, fuzzy-match threshold]
    DEFAULT_FIELD_WEIGHTINGS = {
        "last_name": [50, 'ratio', 0.7],
        "first_name": [10, 'ratio', 0.8],
        "middle_name": [20, 'ratio', 0.8],
        "birth_year": [10, 'exact', None],
        "graduation_year": [10, 'exact', None],
        "degree": [5, 'ratio', 0.8],
        "school": [3, 'ratio', 0.3],
        "street_address": [1, 'ratio', 0.6],
        "city": [1, 'ratio', 0.9],
        "state_code": [1, 'ratio', 0.8],
        "zip_code": [1, 'ratio', 0.8],
        "phone_number": [0, 'ratio', 0.3],
    }
    DEFAULT_MATCH_THRESHOLD = 1
    DEFAULT_KILL_THRESHOLD = -5

    def __init__(self, field_weightings=None, kill_threshold=None,
                 match_threshold=None):
        self.doctors = db.query(Doctor).all()
        # Copy the defaults so per-instance overrides don't mutate the
        # class-level dict (and avoid a mutable default argument).
        self.field_weightings = dict(self.DEFAULT_FIELD_WEIGHTINGS)
        self.field_weightings.update(field_weightings or {})
        self.match_threshold = match_threshold or self.DEFAULT_MATCH_THRESHOLD
        self.kill_threshold = kill_threshold or self.DEFAULT_KILL_THRESHOLD
        self.metrics = MetricsRegistry()
        guaranteed_metrics(self.metrics)

    def dedup(self, record):
        record_deduper = RecordDeduper(record, self.doctors,
                                       self.field_weightings,
                                       self.kill_threshold,
                                       self.match_threshold, self.metrics)
        new, doctor = record_deduper.run()
        if new:
            self.doctors.append(doctor)

        # METRICS
        if record["is_dup"]:
            self.metrics.counter('is_dup').inc()
        else:
            self.metrics.counter('not_dup').inc()
        self.metrics.counter('records').inc()
        if new:
            self.metrics.counter('new').inc()
            # Created a new doctor for a record labeled as a duplicate:
            # we missed a match.
            if record["is_dup"]:
                self.metrics.counter('false_negative').inc()
            else:
                self.metrics.counter('true_negative').inc()
        else:
            self.metrics.counter('match').inc()
            # Matched a record that really is a duplicate: a true positive.
            if record["is_dup"]:
                self.metrics.counter('true_positive').inc()
            else:
                self.metrics.counter('false_positive').inc()
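# Usage sketch -- the record stream and its "is_dup" labels are assumptions
# inferred from the counters above; once a labeled batch has been deduped,
# matcher precision/recall can be derived from the registry:
manager = DedupManager(field_weightings={"phone_number": [5, 'ratio', 0.5]})
for record in labeled_records:  # hypothetical iterable of labeled record dicts
    manager.dedup(record)

m = manager.metrics.dump_metrics()
tp = m.get('true_positive', {}).get('count', 0)
fp = m.get('false_positive', {}).get('count', 0)
fn = m.get('false_negative', {}).get('count', 0)
precision = tp / float(tp + fp) if tp + fp else 0.0
recall = tp / float(tp + fn) if tp + fn else 0.0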
import asyncio
import concurrent.futures
import os

import requests
from pyformance import count_calls
from pyformance.registry import MetricsRegistry
import signalfx.pyformance

registry = MetricsRegistry()
counter = registry.counter("http_requests_sent")

# Pick up the SignalFx token from the environment when it is set.
token = ''
if 'SF_TOKEN' in os.environ:
    token = os.environ['SF_TOKEN']

sfx = signalfx.pyformance.SignalFxReporter(token=token, reporting_interval=1,
                                           registry=registry)
sfx.start()


async def main():
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        target = 'http://localhost:8901'
        if 'TARGET_ADDRESS' in os.environ:
            target = os.environ['TARGET_ADDRESS']
        print('Sending data to ..', target)
Send a POST request::

    curl -d "foo=bar&bin=baz" http://localhost
"""
from http.server import BaseHTTPRequestHandler, HTTPServer
import socketserver
import os
import socket
import sys

from pyformance import counter, count_calls  # , timer
from pyformance.registry import MetricsRegistry
# from pyformance.reporters import ConsoleReporter
import signalfx.pyformance

registry = MetricsRegistry()
counter = registry.counter("http_get_requests")
# timer = registry.timer("time_calls")
default_dimensions = {'containerId': socket.gethostname()}


class S(BaseHTTPRequestHandler):
    def _set_headers(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()

    @count_calls
    def do_GET(self):
        # with timer.time():
        self._set_headers()
class Extractor(object):

    def __init__(self, filename):
        self.metrics = MetricsRegistry()
        self.field_sets = {
            'cpt': set(),
            'description': set(),
            'price': set(),
            'modifier': set(),
            'all': set(),
        }
        print filename.split('.')[0]
        self._custom_mapping = CUSTOM_COLUMN_MAPPING.get(filename.split('.')[0])
        if filename:
            self._input = open(settings.PARSED_FILES_PATH + '/' + filename)
            self._output = open(
                settings.EXTRACTED_FILES_PATH + '/' +
                '.extracted.'.join(filename.split('.')), 'w')

    def extract(self):
        for line in self._input:
            if len(line) < 2:
                continue
            item = simplejson.loads(line, use_decimal=True)
            # add all field names to all field set
            self.field_sets['all'].update(item['contents'].keys())
            price, descriptions, cpt, modifier = self._process_item(item)
            item['output'] = {
                'price': price,
                'cpt': cpt,
                'descriptions': descriptions,
                'modifier': modifier,
            }
            self._output.write(simplejson.dumps(item, use_decimal=True) + '\n')

            # Write item metrics
            self.metrics.counter('lines').inc()
            if price:
                self.metrics.counter('prices').inc()
                self.field_sets['price'].add(price.keys()[0])
            if cpt:
                self.metrics.counter('cpts').inc()
                self.field_sets['cpt'].add(cpt.keys()[0])
            if modifier:
                self.metrics.counter('modifiers').inc()
                self.field_sets['modifier'].add(modifier.keys()[0])
            if descriptions:
                self.metrics.counter('descriptions').inc()
                self.field_sets['description'].update(
                    [d.keys()[0] for d in descriptions])
            if cpt and price:
                self.metrics.counter('coded_lines').inc()
                GLOBAL_METRICS.counter('coded_lines').inc()
            if descriptions and price:
                self.metrics.counter('desc_lines').inc()
                GLOBAL_METRICS.counter('desc_lines').inc()
            GLOBAL_METRICS.counter('lines').inc()

        GLOBAL_METRICS.counter('hospitals').inc()
        if self.metrics.counter('cpts').get_count() > 500:
            GLOBAL_METRICS.counter('500_hosp').inc()

    def _process_item(self, item):
        contents = dict([(k, v) for k, v in item['contents'].items()
                         if v or v == 0])
        price = self._find_price(contents)
        descriptions = self._find_descriptions(contents)
        cpt = self._find_cpt(contents, descriptions)
        modifier = self._find_modifier(contents, cpt)
        return (price, descriptions, cpt, modifier)

    def _find_price(self, contents):
        if self._custom_mapping and self._custom_mapping['price']:
            return dict((k, v) for k, v in contents.iteritems()
                        if k == self._custom_mapping['price'])
        else:
            matches = []
            for column, value in contents.iteritems():
                new = self._is_price(column, value)
                if new:
                    matches.append(new)
            if not matches:
                return None
            elif len(matches) == 1:
                return matches[0]
            else:
                return self._resolve_multiple_price_matches(matches)

    def _resolve_multiple_price_matches(self, matches):
        if all([match.values()[0] == 0 for match in matches]):
            return matches[0]
        matches = [match for match in matches if match.values()[0] > 0]
        if len(matches) == 1:
            return matches[0]
        price_only = [match for match in matches
                      if 'price' in match.keys()[0].lower()]
        if price_only:
            matches = price_only
        if len(matches) == 1:
            return matches[0]
        return sorted(matches, key=lambda d: d.values()[0])[-1]

    def _is_price(self, column, value):
        # check column
        if not any([re.search(regex, column, re.I) for regex in PRICE_REGEXES]):
            return False
        if any([re.search(regex, column, re.I)
                for regex in PRICE_REGEXES_NEG + TRASH]):
            return False
        # check and clean value
        search = re.findall(r'\b(\d[\d\.]{0,9})\b', unicode(value), re.I)
        if search:
            fl = float(search[0])
            if fl < 500000:
                return {column: fl}
            else:
                return False
        else:
            return False

    def _find_descriptions(self, contents):
        matches = []
        for column, value in contents.iteritems():
            new = self._is_description(column, value)
            if new:
                matches.append(new)
        return matches

    def _is_description(self, column, value):
        # check column
        if not any([re.search(regex, column, re.I) for regex in DESC_REGEXES]):
            return False
        if any([re.search(regex, column, re.I)
                for regex in DESC_REGEXES_NEG + TRASH]):
            return False
        # check and clean value: a bare single token is not a description
        if not re.search(r'^\s*[\w\.\$]+\s*$', unicode(value)):
            return {column: value}
        else:
            return False

    def _find_cpt(self, contents, descriptions):
        if self._custom_mapping and self._custom_mapping['cpt']:
            return dict((k, v) for k, v in contents.iteritems()
                        if k == self._custom_mapping['cpt'])
        good_columns = [column for column in contents.keys()
                        if self._is_cpt_column(column)]
        if len(good_columns) > 1:
            good_columns = [column for column in good_columns
                            if re.search('cpt|hcpcs', column, re.I)]
        matches = []
        for column in good_columns:
            clean_value = self._get_cpt_value(contents[column])
            if clean_value:
                matches.append({column: clean_value})
        if len(matches) > 1:
            matches = matches[:1]
        if not matches:
            matches.extend(self._cpt_from_descriptions(descriptions))
        return matches and matches[0] or None

    def _is_cpt_column(self, column):
        if not any([re.search(regex, column, re.I) for regex in CPT_REGEXES]):
            return False
        elif any([re.search(regex, column, re.I)
                  for regex in CPT_REGEXES_NEG + TRASH]):
            return False
        else:
            return True

    def _get_cpt_value(self, value):
        search = re.findall(r'\b((?:\w\d{3}\w)(?:[- ]\w\w)*)\b',
                            unicode(value), re.UNICODE)
        if search and search[0][:5] in CPT_CODES_LIST:
            return search[0]
        else:
            return False

    def _cpt_from_descriptions(self, descriptions):
        matches = []
        for desc_dict in descriptions:
            column, desc = desc_dict.items()[0]
            cpts = re.findall(r'cpt.{0,7}(\w\d{3}\w)', unicode(desc), re.I)
            if cpts:
                matches.append({'From desc: ' + column: cpts[0]})
        return matches

    def _find_modifier(self, contents, cpt):
        matches = []
        for column, value in contents.iteritems():
            new = self._is_modifier(column, value)
            if new:
                matches.append(new)
        if not matches and cpt:
            from_cpt = self._modifier_from_cpt(cpt)
            if from_cpt:
                matches.append(from_cpt)
        if len(matches) > 1:
            raise Exception('Found multiple modifiers %s: %s'
                            % (contents, matches))
        return matches and matches[0] or None

    # Valid CPT/HCPCS modifier codes (duplicate entries removed).
    VALID_MODIFIERS = frozenset([
        '22', '23', '24', '25', '26', '27', '32', '33', '47', '50', '51',
        '52', '53', '54', '55', '56', '57', '58', '59', '62', '63', '66',
        '73', '74', '76', '77', '78', '79', '80', '81', '82', '90', '91',
        '92', '99',
        'P1', 'P2', 'P3', 'P4', 'P5', 'P6',
        'E1', 'E2', 'E3', 'E4',
        'F1', 'F2', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'FA',
        'GG', 'GH', 'LC', 'LD', 'LT', 'QM', 'QN', 'RC', 'RT',
        'T1', 'T2', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'TA',
    ])

    def _is_modifier(self, column, value):
        # check column
        if not any([re.search(regex, column, re.I)
                    for regex in MODIFIER_REGEXES]):
            return False
        if any([re.search(regex, column, re.I)
                for regex in MODIFIER_REGEXES_NEG + TRASH]):
            return False
        # check and clean value
        search = re.findall(r'\b(?:\w\d{3}\w)[- ](\w\w)\b', unicode(value), re.U)
        if search and search[0] in self.VALID_MODIFIERS:
            return {column: search[0]}
        else:
            return False

    def _modifier_from_cpt(self, cpt):
        column, value = cpt.items()[0]
        search = re.findall(r'\b(?:\w\d{3}\w)-(\w\w)\b', unicode(value), re.U)
        if search:
            return {'From cpt: ' + column: search[0]}
        else:
            return None
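# Usage sketch (the directory layout and the os import are assumptions): run
# one Extractor per parsed hospital file, then inspect the shared counters.
for filename in os.listdir(settings.PARSED_FILES_PATH):
    Extractor(filename).extract()
print GLOBAL_METRICS.dump_metrics()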