Beispiel #1
0
 def __init__(self, field_weightings=None, kill_threshold=None,
         match_threshold=None):
     """Load the known doctors and resolve weightings and thresholds.

     Args:
         field_weightings: optional mapping merged over
             ``DEFAULT_FIELD_WEIGHTINGS``; an entry replaces the default
             stored under the same key.
         kill_threshold: used instead of ``DEFAULT_KILL_THRESHOLD`` when
             it is not None.
         match_threshold: used instead of ``DEFAULT_MATCH_THRESHOLD`` when
             it is not None.
     """
     self.doctors = db.query(Doctor).all()
     # Work on a copy: calling update() on the class attribute itself would
     # leak one instance's overrides into every other instance.
     self.field_weightings = dict(self.DEFAULT_FIELD_WEIGHTINGS)
     if field_weightings:
         self.field_weightings.update(field_weightings)
     # Compare against None so that an explicit threshold of 0 is honoured
     # instead of being silently replaced by the default.
     if match_threshold is None:
         match_threshold = self.DEFAULT_MATCH_THRESHOLD
     if kill_threshold is None:
         kill_threshold = self.DEFAULT_KILL_THRESHOLD
     self.match_threshold = match_threshold
     self.kill_threshold = kill_threshold
     self.metrics = MetricsRegistry()
     guaranteed_metrics(self.metrics)
Beispiel #2
0
def main():
    """Create the shared metrics registry and serve the application forever.

    Installs a uvloop event loop as the current asyncio loop, starts the
    application listening on ``options.port`` and then blocks in the loop.
    """
    global metricsRegistry
    metricsRegistry = MetricsRegistry()

    # Make a uvloop loop the current asyncio event loop before listening.
    asyncio.set_event_loop(uvloop.new_event_loop())

    Application().listen(options.port)

    current_loop = asyncio.get_event_loop()
    current_loop.run_forever()
Beispiel #3
0
 def setup_registry(self, registry=None):
     """Set up the metric reporter with the given registry or registries.

     Args:
         registry: a single registry, or a tuple/list of registries. Any
             falsy value makes the reporter use one fresh
             ``MetricsRegistry``.

     Raises:
         GoblinMetricsException: if an entry is neither a
             ``MetricsRegistry`` nor a ``RegexRegistry``.
     """
     if not registry:
         self.registry = [MetricsRegistry(), ]
         return

     if not isinstance(registry, (tuple, list)):
         registry = [registry, ]

     self.registry = []
     for reg in registry:
         if not isinstance(reg, (MetricsRegistry, RegexRegistry)):
             # Interpolate the offending object; the message used to be
             # raised with a literal, unformatted "%s".
             raise GoblinMetricsException(
                 "%s is not an instance of pyformance.MetricsRegistry"
                 " or pyformance.RegexRegistry" % (reg,))
         self.registry.append(reg)
Beispiel #4
0
 def __init__(self, filename):
     """Open the parsed input file and the output file for *filename*.

     The part of *filename* before the first dot is printed and used to
     look up an optional custom column mapping.

     Args:
         filename: name of the file inside ``settings.PARSED_FILES_PATH``.

     Raises:
         ValueError: if *filename* is empty or None.
     """
     # Validate before the first use of filename; the old check ran after
     # filename.split() and could leave self._input unset.
     if not filename:
         raise ValueError('filename must be a non-empty file name')

     self.metrics = MetricsRegistry()
     self.field_sets = {
         'cpt': set(),
         'description': set(),
         'price': set(),
         'modifier': set(),
         'all': set(),
     }
     base_name = filename.split('.')[0]
     # Parenthesised single argument: prints the same on Python 2 and 3.
     print(base_name)
     self._custom_mapping = CUSTOM_COLUMN_MAPPING.get(base_name)
     self._input = open(settings.PARSED_FILES_PATH + '/' + filename)
     try:
         self._output = open(
             settings.EXTRACTED_FILES_PATH + '/'
             + '.extracted.'.join(filename.split('.')), 'w')
     except Exception:
         # Do not leak the input handle when the output cannot be opened.
         self._input.close()
         raise
Beispiel #5
0
def fetch_docs(base_path, es_host, es_index, es_query=None, limit=-1):
    """Fetch documents from Elasticsearch into a per-run path under an MLflow run.

    Metrics collected in a fresh registry are reported to MLflow and
    InfluxDB while the fetch runs, and once more after it completes.

    Args:
        base_path: directory under which the ``fetch_docs`` experiment
            directory is created.
        es_host: passed to ``_get_docs_scrolled`` and logged as a run param.
        es_index: passed to ``_get_docs_scrolled`` and logged as a run param.
        es_query: passed to ``_get_docs_scrolled`` and logged as a run param.
        limit: passed through to ``_get_docs_scrolled``.

    Returns:
        The run object returned by ``mlflow.start_run``.

    Raises:
        Exception: whatever the fetch raised, after the run has been ended
            with status ``"FAILED"`` and the started reporters stopped.
    """
    exp_name = "fetch_docs"
    exp_path = f"{base_path}/{exp_name}"
    os.makedirs(exp_path, exist_ok=True)

    run = mlflow.start_run(experiment_id=get_or_create_experiment_id(exp_name))

    registry = MetricsRegistry()
    # Only reporters that were actually started are stopped in ``finally``.
    started_reporters = []

    # Everything after start_run lives inside the try block so that a
    # failure while building or starting a reporter still ends the run and
    # stops whatever was already started.
    try:
        docs_path = f"{exp_path}/{run.run_info.run_uuid}"

        mlflow_reporter = MlflowReporter(registry=registry,
                                         active_run=run,
                                         reporting_interval=10)
        mlflow_reporter.start()
        started_reporters.append(mlflow_reporter)

        influx_reporter = InfluxReporter(registry=registry,
                                         reporting_interval=10,
                                         autocreate_database=True)
        influx_reporter.start()
        started_reporters.append(influx_reporter)

        mlflow.log_param("docs_path", docs_path)
        mlflow.log_param("es_host", es_host)
        mlflow.log_param("es_index", es_index)
        mlflow.log_param("es_query", es_query)

        _write_docs(
            _get_docs_scrolled(registry, es_host, es_index, es_query, limit),
            docs_path)

        # Flush the final metric values before closing the run.
        influx_reporter.report_now()
        mlflow_reporter.report_now()

        mlflow.end_run()
    except Exception:
        mlflow.end_run("FAILED")
        raise
    finally:
        # Reverse start order: InfluxDB first, then MLflow (as before).
        for reporter in reversed(started_reporters):
            reporter.stop()

    return run
Beispiel #6
0
def reset_metrics():
    """Install a newly created ``MetricsRegistry`` as pyformance's global registry."""
    fresh_registry = MetricsRegistry()
    pyformance.set_global_registry(fresh_registry)
Beispiel #7
0

class RequestHandler(Resource):
    """Web resource that records the ``world_size`` query parameter.

    Each GET increments the ``hello_called`` counter, adds the requested
    size to the ``world_size`` histogram and returns that histogram's
    current metrics as a string.
    """

    # isLeaf indicates whether the object has child nodes
    isLeaf = True

    # run server then go to: http://localhost:8001/?world_size=n
    # n being a number of your choice
    def render_GET(self, request):
        """Handle a GET request carrying a ``world_size`` query parameter.

        Returns:
            The string form of the ``world_size`` histogram metrics with
            status 200, or an error message with status 400 when the
            parameter is missing or is not an integer.
        """
        # NOTE(review): on Python 3, Twisted is expected to deliver query
        # keys as bytes and to want bytes back from render_* - confirm which
        # interpreter this example targets.
        metricsRegistry.counter("hello_called").inc()
        try:
            world_size = int(request.args["world_size"][0])
        except (KeyError, IndexError, ValueError):
            # Missing or malformed parameter is a client error, not a crash.
            request.setResponseCode(400)
            return "world_size query parameter must be an integer"

        metricsRegistry.histogram("world_size").add(world_size)
        request.setResponseCode(200)
        return str(metricsRegistry._get_histogram_metrics("world_size"))


if __name__ == '__main__':
    # Module-level registry read by RequestHandler.render_GET.
    metricsRegistry = MetricsRegistry()
    try:
        # Serve the single leaf resource on TCP port 8001 and block in the
        # reactor until it stops.
        reactor.listenTCP(8001, Site(RequestHandler()))
        reactor.run()
    except Exception as e:
        print(e)
Beispiel #8
0
class DedupManager(object):
    """
    A wrapper around RecordDeduper that:
    - memoizes the doctors list
    - handles variations in field weightings and thresholds
    - handles stats reporting
    """

    # Field name -> three-item spec handed to RecordDeduper unchanged.
    # NOTE(review): the items look like [weight, comparison mode, cutoff] -
    # confirm against RecordDeduper.
    DEFAULT_FIELD_WEIGHTINGS = {
        "last_name": [50, 'ratio', 0.7],
        "first_name": [10, 'ratio', 0.8],
        "middle_name": [20, 'ratio', 0.8],
        "birth_year": [10, 'exact', None],
        "graduation_year": [10, 'exact', None],
        "degree": [5, 'ratio', 0.8],
        "school": [3, 'ratio', 0.3],
        "street_address": [1, 'ratio', 0.6],
        "city": [1, 'ratio', 0.9],
        "state_code": [1, 'ratio', 0.8],
        "zip_code": [1, 'ratio', 0.8],
        "phone_number": [0, 'ratio', 0.3],
    }
    DEFAULT_MATCH_THRESHOLD = 1
    DEFAULT_KILL_THRESHOLD = -5

    def __init__(self, field_weightings=None, kill_threshold=None,
            match_threshold=None):
        """Load the known doctors and resolve weightings and thresholds.

        Args:
            field_weightings: optional mapping merged over
                ``DEFAULT_FIELD_WEIGHTINGS``; an entry replaces the default
                stored under the same key.
            kill_threshold: used instead of ``DEFAULT_KILL_THRESHOLD`` when
                it is not None.
            match_threshold: used instead of ``DEFAULT_MATCH_THRESHOLD``
                when it is not None.
        """
        self.doctors = db.query(Doctor).all()
        self.field_weightings = self._merged_weightings(field_weightings)
        # Compare against None so that an explicit threshold of 0 is
        # honoured instead of being silently replaced by the default.
        if match_threshold is None:
            match_threshold = self.DEFAULT_MATCH_THRESHOLD
        if kill_threshold is None:
            kill_threshold = self.DEFAULT_KILL_THRESHOLD
        self.match_threshold = match_threshold
        self.kill_threshold = kill_threshold
        self.metrics = MetricsRegistry()
        guaranteed_metrics(self.metrics)

    @classmethod
    def _merged_weightings(cls, overrides):
        """Return a private copy of the default weightings with *overrides* applied.

        The class-level dict and its inner lists are copied so that neither
        the merge nor later in-place edits can alter the shared defaults.
        """
        merged = dict(
            (field, list(spec))
            for field, spec in cls.DEFAULT_FIELD_WEIGHTINGS.items())
        if overrides:
            merged.update(overrides)
        return merged

    @staticmethod
    def _outcome_counter(new, is_dup):
        """Name the confusion-matrix counter for one deduplication result.

        Args:
            new: True when the deduper reported the record as new.
            is_dup: the record's ``is_dup`` label.
        """
        if new:
            # Reported as new: a labelled duplicate was missed.
            return 'false_negative' if is_dup else 'true_negative'
        # Reported as a match: correct only if the record is a labelled
        # duplicate (these two names used to be swapped).
        return 'true_positive' if is_dup else 'false_positive'

    def dedup(self, record):
        """Run one record through RecordDeduper and update the counters.

        When the deduper reports the record as new, the doctor it returns
        is appended to the memoized doctors list.

        Args:
            record: mapping that must provide an ``"is_dup"`` entry; it is
                passed to RecordDeduper as-is.
        """
        record_deduper = RecordDeduper(record, self.doctors, self.field_weightings,
            self.kill_threshold, self.match_threshold, self.metrics)

        new, doctor = record_deduper.run()

        if new:
            self.doctors.append(doctor)

        # METRICS
        is_dup = record["is_dup"]
        counter = self.metrics.counter
        counter('is_dup' if is_dup else 'not_dup').inc()
        counter('records').inc()
        counter('new' if new else 'match').inc()
        counter(self._outcome_counter(new, is_dup)).inc()
import asyncio
import concurrent.futures
import requests
import os
from pyformance import counter, count_calls
from pyformance.registry import MetricsRegistry
import signalfx.pyformance

# Registry holding the metrics this script reports to SignalFx.
registry = MetricsRegistry()
# NOTE: this rebinds the ``counter`` name imported from pyformance above.
counter = registry.counter("http_requests_sent")

# Read the SignalFx token from the environment, falling back to an empty
# token when SF_TOKEN is not set. The earlier check was inverted and raised
# KeyError in exactly that case.
token = os.environ.get('SF_TOKEN', '')

sfx = signalfx.pyformance.SignalFxReporter(token=token,
                                           reporting_interval=1,
                                           registry=registry)

sfx.start()


async def main():
    """Pick the target address and announce it.

    The target defaults to ``http://localhost:8901`` and is overridden by
    the ``TARGET_ADDRESS`` environment variable when that is set.

    NOTE(review): the thread pool created here is never used in the visible
    code; the function looks cut off - confirm against the full file.
    """

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        target = 'http://localhost:8901'
        if 'TARGET_ADDRESS' in os.environ:
            target = os.environ['TARGET_ADDRESS']

        print('Sending data to ..', target)
Send a POST request::
    curl -d "foo=bar&bin=baz" http://localhost

"""
from http.server import BaseHTTPRequestHandler, HTTPServer
import socketserver
from pyformance import counter, count_calls  #, timer
from pyformance.registry import MetricsRegistry
#from pyformance.reporters import ConsoleReporter
import signalfx.pyformance
import os
import socket
import sys

# Registry that owns the metrics created in this module.
registry = MetricsRegistry()
# NOTE(review): this rebinds the ``counter`` name imported from pyformance above.
counter = registry.counter("http_get_requests")
#timer = registry.timer("time_calls")
# Single-entry mapping: "containerId" -> this machine's host name.
default_dimensions = {'containerId': socket.gethostname()}


class S(BaseHTTPRequestHandler):
    """HTTP request handler; only part of the class is visible in this excerpt."""

    def _set_headers(self):
        """Send a 200 status line and a ``text/html`` Content-type header."""
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()

    @count_calls
    def do_GET(self):
        """Answer a GET request by sending the standard headers.

        NOTE(review): the body may continue past this excerpt - confirm
        against the full file.
        """
        #with timer.time():
        self._set_headers()
Beispiel #11
0
class Extractor(object):

    def __init__(self, filename):
        self.metrics = MetricsRegistry()
        self.field_sets = {
            'cpt': set(),
            'description': set(),
            'price': set(),
            'modifier': set(),
            'all': set(),
        }
        print filename.split('.')[0]
        self._custom_mapping = CUSTOM_COLUMN_MAPPING.get(filename.split('.')[0])
        if filename:
            self._input = open(settings.PARSED_FILES_PATH + '/' + filename)
        self._output = open(settings.EXTRACTED_FILES_PATH + '/' + '.extracted.'.join(filename.split('.')), 'w')

    def extract(self):
        for line in self._input:
            if len(line) < 2:
                continue
            item = simplejson.loads(line, use_decimal=True)

            # add all field names to all field set
            self.field_sets['all'].update(item['contents'].keys())

            price, descriptions, cpt, modifier = self._process_item(item)

            item['output'] = {
                'price': price,
                'cpt': cpt,
                'descriptions': descriptions,
                'modifier': modifier
            }
            self._output.write(simplejson.dumps(item, use_decimal=True) + '\n')

            # Write item metrics
            self.metrics.counter('lines').inc()
            if price:
                self.metrics.counter('prices').inc()
                self.field_sets['price'].add(price.keys()[0])
            if cpt:
                self.metrics.counter('cpts').inc()
                self.field_sets['cpt'].add(cpt.keys()[0])
            if modifier:
                self.metrics.counter('modifiers').inc()
                self.field_sets['modifier'].add(modifier.keys()[0])
            if descriptions:
                self.metrics.counter('descriptions').inc()
                self.field_sets['description'].update([d.keys()[0] for d in descriptions])

            if cpt and price:
                self.metrics.counter('coded_lines').inc()
                GLOBAL_METRICS.counter('coded_lines').inc()

            if descriptions and price:
                self.metrics.counter('desc_lines').inc()
                GLOBAL_METRICS.counter('desc_lines').inc()

            GLOBAL_METRICS.counter('lines').inc()
        GLOBAL_METRICS.counter('hospitals').inc()
        if self.metrics.counter('cpts').get_count() > 500:
            GLOBAL_METRICS.counter('500_hosp').inc()

    def _process_item(self, item):
        contents = dict([(k,v) for k,v in item['contents'].items() if v or v == 0])

        price = self._find_price(contents)
        descriptions = self._find_descriptions(contents)
        cpt = self._find_cpt(contents, descriptions)
        modifier = self._find_modifier(contents, cpt)

        return (price, descriptions, cpt, modifier)

    def _find_price(self, contents):
        if self._custom_mapping and self._custom_mapping['price']:
            return dict((k, v) for k, v in contents.iteritems() if k == self._custom_mapping['price'])
        else:
            matches = []
            for column, value in contents.iteritems():
                new = self._is_price(column, value)
                if new:
                    matches.append(new)

            if not matches:
                return None
            elif len(matches) == 1:
                return matches[0]
            else:
                return self._resolve_multiple_price_matches(matches)

    def _resolve_multiple_price_matches(self, matches):
        if all([match.values()[0] == 0 for match in matches]):
            return matches[0]
        matches = [match for match in matches if match.values()[0] > 0]
        if len(matches) == 1:
            return matches[0]

        price_only = [match for match in matches if 'price' in match.keys()[0].lower()]
        if price_only:
            matches = price_only

        if len(matches) == 1:
            return matches[0]

        return sorted(matches, key=lambda d: d.values()[0])[-1]

    def _is_price(self, column, value):
        # check column
        if not any([re.search(regex, column, re.I) for regex in PRICE_REGEXES]):
            return False

        if any([re.search(regex, column, re.I) for regex in PRICE_REGEXES_NEG + TRASH]):
            return False

        # check and clean value
        search = re.findall(r'\b(\d[\d\.]{0,9})\b', unicode(value), re.I)
        if search:
            fl = float(search[0])
            if fl < 500000:
                return {column: fl}
            else:
                return False
        else:
            return False

    def _find_descriptions(self, contents):
        matches = []

        for column, value in contents.iteritems():
            new = self._is_description(column, value)
            if new:
                matches.append(new)

        return matches

    def _is_description(self, column, value):
        # check column
        if not any([re.search(regex, column, re.I) for regex in DESC_REGEXES]):
            return False

        if any([re.search(regex, column, re.I) for regex in DESC_REGEXES_NEG + TRASH]):
            return False

        # check and clean value
        if not re.search(r'^\s*[\w\.\$]+\s*$', unicode(value)):
            return {column: value}
        else:
            return False

    def _find_cpt(self, contents, descriptions):
        if self._custom_mapping and self._custom_mapping['cpt']:
            return dict((k, v) for k, v in contents.iteritems() if k == self._custom_mapping['cpt'])

        good_columns = [column for column in contents.keys() if self._is_cpt_column(column)]

        if len(good_columns) > 1:
            good_columns = [column for column in good_columns if re.search('cpt|hcpcs', column, re.I)]

        matches = []
        for column in good_columns:
            clean_value = self._get_cpt_value(contents[column])
            if clean_value:
                matches.append({column: clean_value})

        if len(matches) > 1:
            matches = matches[:1]

        if not matches:
            matches.extend(self._cpt_from_descriptions(descriptions))

        return matches and matches[0] or None

    def _is_cpt_column(self, column):
        if not any([re.search(regex, column, re.I) for regex in CPT_REGEXES]):
            return False
        elif any([re.search(regex, column, re.I) for regex in CPT_REGEXES_NEG + TRASH]):
            return False
        else:
            return True

    def _get_cpt_value(self, value):
        search = re.findall(r'\b((?:\w\d{3}\w)(?:[- ]\w\w)*)\b', unicode(value), re.UNICODE)
        if search and search[0][:5] in CPT_CODES_LIST:
            return search[0]
        else:
            return False

    def _cpt_from_descriptions(self, descriptions):
        matches = []
        for desc_dict in descriptions:
            column, desc = desc_dict.items()[0]
            cpts = re.findall('cpt.{0,7}(\w\d{3}\w)', unicode(desc), re.I)
            if cpts:
                matches.append({'From desc: ' + column: cpts[0]})

        return matches

    def _find_modifier(self, contents, cpt):
        matches = []
        for column, value in contents.iteritems():
            new = self._is_modifier(column, value)
            if new:
                matches.append(new)

        if not matches and cpt:
            from_cpt = self._modifier_from_cpt(cpt)
            from_cpt and matches.append(from_cpt)

        if len(matches) > 1:
            raise Exception('Found multiple modifiers %s: %s' % (contents, matches))

        return matches and matches[0] or None

    def _is_modifier(self, column, value):
        # check column
        if not any([re.search(regex, column, re.I) for regex in MODIFIER_REGEXES]):
            return False

        if any([re.search(regex, column, re.I) for regex in MODIFIER_REGEXES_NEG + TRASH]):
            return False

        # check and clean value
        search = re.findall(r'\b(?:\w\d{3}\w)[- ](\w\w)\b', unicode(value), re.U)
        if search and search[0] in ['22', '23', '24', '25', '26', '32', '33', '47', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '62', '63', '66', '76', '77', '78', '79', '80', '81', '82', '90', '91', '92', '99', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', '25', '27', '50', '52', '58', '59', '73', '74', '76', '77', '78', '79', '91', 'E1', 'E2', 'E3', 'E4', 'F1', 'F2', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'FA', 'GG', 'GH', 'LC', 'LD', 'LT', 'QM', 'QN', 'RC', 'RT', 'T1', 'T2', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9','TA']:
            return {column: search[0]}
        else:
            return False

    def _modifier_from_cpt(self, cpt):
        column, value = cpt.items()[0]
        search = re.findall(r'\b(?:\w\d{3}\w)-(\w\w)\b', unicode(value), re.U)
        if search:
            return {'From cpt: ' + column: search[0]}
        else:
            return None