import json

from flask import request  # `request.args` matches Flask's request API

import util


def get_probability():
    msg = request.args.get('message')
    if not msg:
        return json.dumps({'error': "no 'message' argument given"})
    prob = util.probability(msg)
    response = {'message': msg, 'p': prob}
    return json.dumps(response)
def main():
    try:
        exercise_file = open(util.relative_path("exercise_reports"), 'r+')
        ex_reports = json.loads(exercise_file.read())
    except IOError:
        exercise_file = open(util.relative_path("exercise_reports"), 'w')
        ex_reports = {"elapsed_time": 1,  # Filler value
                      "max_id": -1,
                      "last_time": 0}

    new_reports = get_errors(copy.deepcopy(ex_reports))
    period_len = new_reports["time_this_period"]

    for ex in new_reports:
        if ex in SPECIAL_VALUES:
            continue
        if ex in ex_reports and ex_reports[ex]["num_errors"] > 0:
            errors_this_period = new_reports[ex]["this_period"]
            mean, probability = util.probability(
                ex_reports[ex]["num_errors"], ex_reports["elapsed_time"],
                errors_this_period, period_len)
            print("%s] TOTAL %s/%ss; %s-: %s/%ss; m=%.3f p=%.3f"
                  % (time.strftime("%Y-%m-%d %H:%M:%S %Z"),
                     ex_reports[ex]["num_errors"], ex_reports["elapsed_time"],
                     ex_reports["last_time"], errors_this_period, period_len,
                     mean, probability))
            if probability > 0.997 and errors_this_period > 1:
                util.send_to_slack(
                    "*Elevated exercise bug report rate in exercise `%s`*\n"
                    "Reports: %s. We saw %s in the last %s minutes,"
                    " while the mean indicates we should see around %s."
                    " *Probability that this is abnormally elevated: %.4f.*"
                    % (ex, generate_slack_links(new_reports[ex]["href"]),
                       util.thousand_commas(errors_this_period),
                       util.thousand_commas(int(period_len / 60)),
                       util.thousand_commas(round(mean, 2)), probability),
                    channel="#support")
        if "href" in new_reports[ex]:
            del new_reports[ex]["href"]  # don't need to keep the links around

    del new_reports["time_this_period"]

    # Overwrite with new contents
    exercise_file.seek(0)
    exercise_file.truncate()
    exercise_file.write(json.dumps(new_reports))
    exercise_file.close()
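# Every monitor in this collection calls util.probability(historical_count,
# historical_elapsed, count_this_period, period_len) and unpacks a
# (mean, probability) pair, where values near 1.0 trigger alerts.  util's
# implementation isn't shown here; the following is a minimal sketch of one
# plausible implementation, assuming a Poisson model in which `probability`
# is the chance of seeing strictly fewer than the observed count.  The name
# and signature match the call sites; the Poisson assumption is mine.

from scipy import stats


def probability(old_count, old_elapsed, new_count, new_elapsed):
    """Return (mean, probability) for the new observation window.

    mean: expected number of events in the new window, based on the
        historical rate old_count / old_elapsed.
    probability: P(X < new_count) under Poisson(mean); close to 1.0 when
        new_count is abnormally elevated relative to the historical rate.
    """
    rate = float(old_count) / old_elapsed
    mean = rate * new_elapsed
    # P(X <= new_count - 1) is P(X < new_count).
    prob = stats.poisson.cdf(new_count - 1, mean)
    return mean, prob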
def main():
    try:
        exercise_file = open(util.relative_path("exercise_reports"), 'r+')
        ex_reports = json.loads(exercise_file.read())
    except IOError:
        exercise_file = open(util.relative_path("exercise_reports"), 'w')
        ex_reports = {"elapsed_time": 1,  # Filler value
                      "max_id": -1,
                      "last_time": 0}

    new_reports = get_errors(copy.deepcopy(ex_reports))
    period_len = new_reports["time_this_period"]

    for ex in new_reports:
        if ex in SPECIAL_VALUES:
            continue
        if ex in ex_reports and ex_reports[ex]["num_errors"] > 0:
            errors_this_period = new_reports[ex]["this_period"]
            mean, probability = util.probability(
                ex_reports[ex]["num_errors"], ex_reports["elapsed_time"],
                errors_this_period, period_len)
            if probability > 0.997 and errors_this_period > 1:
                # Too many errors!
                hipchat_message.send_message(
                    "Elevated exercise bug report rate in exercise %s!"
                    " Reports: %s. We saw %s in the last %s minutes,"
                    " while the mean indicates we should see around %s."
                    " Probability that this is abnormally elevated: %.4f."
                    % (ex, generate_links(new_reports[ex]["href"]),
                       util.thousand_commas(errors_this_period),
                       util.thousand_commas(int(period_len / 60)),
                       util.thousand_commas(round(mean, 2)), probability),
                    room_id="Exercises")
        if "href" in new_reports[ex]:
            del new_reports[ex]["href"]  # don't need to keep the links around

    del new_reports["time_this_period"]

    # Overwrite with new contents
    exercise_file.seek(0)
    exercise_file.truncate()
    exercise_file.write(json.dumps(new_reports))
    exercise_file.close()
def main():
    try:
        google_code_file = open(util.relative_path("google_code"), 'r+')
        old_reports = json.loads(google_code_file.read())
    except IOError:
        google_code_file = open(util.relative_path("google_code"), 'w')
        # elapsed_time is filler value: doesn't matter what it is
        # since issue_count is 0.
        old_reports = {"elapsed_time": 1,
                       "last_id": -1,
                       "issue_count": 0,
                       "last_time": 0}

    new_reports = get_errors(copy.deepcopy(old_reports))
    time_this_period = new_reports["time_this_period"]

    mean, probability = util.probability(old_reports["issue_count"],
                                         old_reports["elapsed_time"],
                                         new_reports["issues_this_period"],
                                         time_this_period)

    if mean != 0 and probability > 0.99:
        # Too many errors!
        hipchat_message.send_message(
            "Elevated bug report rate on"
            " <a href='http://khanacademy.org/r/bugs'>Google code!</a>"
            " We saw %s in the last %s minutes,"
            " while the mean indicates we should see around %s."
            " Probability that this is abnormally elevated: %.4f."
            % (util.thousand_commas(new_reports["issues_this_period"]),
               util.thousand_commas(int(time_this_period / 60)),
               util.thousand_commas(round(mean, 2)),
               probability))

    # Delete fields we don't need anymore
    del new_reports["issues_this_period"]
    del new_reports["time_this_period"]

    google_code_file.seek(0)
    google_code_file.truncate()
    google_code_file.write(json.dumps(new_reports))
    google_code_file.close()
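# util.thousand_commas is used throughout these monitors to format counts in
# human-readable alert text.  Its implementation isn't shown; a one-line
# sketch of what it presumably does:


def thousand_commas(n):
    """1234567 -> '1,234,567'; floats keep their decimals, e.g. '1,234.56'."""
    return '{:,}'.format(n)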
def main():
    try:
        zendesk_status_file = util.relative_path("zendesk")
        with open(zendesk_status_file) as f:
            old_data = cPickle.load(f)
    except (IOError, EOFError):
        old_data = {"elapsed_time_weekday": 0.0001,  # avoid a divide-by-0
                    "elapsed_time_weekend": 0.0001,  # avoid a divide-by-0
                    "ticket_count_weekday": 0,
                    "ticket_count_weekend": 0,
                    "last_time_t": None,
                    "last_time_t_weekday": None,
                    "last_time_t_weekend": None,
                    }

    # We compare the number of tickets in the last few minutes against
    # the historical average for all time.  But we don't start "all
    # time" at AD 1, we start it a week ago.  Longer than that and it
    # takes forever due to quota issues.  That's still plenty of
    # historical data. :-)
    #
    # Zendesk seems to wait 5 minutes to update API data :-(, so we
    # ask for data that's a bit time-lagged.
    end_time = int(time.time()) - 300
    start_time = old_data['last_time_t']

    # Set a flag to track whether the current time period is off-hours.
    # Separate ticket_count/elapsed_time stats are kept for off-hours vs.
    # on-hours to improve sensitivity to increases during low-traffic periods.
    is_off_hours = _is_off_hours(datetime.datetime.fromtimestamp(end_time))

    (new_tickets, oldest_ticket_time_t) = get_tickets_between(
        start_time or (end_time - 86400 * 7), end_time)
    num_new_tickets = len(new_tickets)

    # The first time we run this, we take the starting time to be the
    # time of the first bug report.
    if start_time is None:
        start_time = oldest_ticket_time_t

    time_this_period = end_time - start_time

    if is_off_hours:
        # To simplify backcompat we still use "weekend" and "weekday" in the
        # saved data; really they mean "off hours" and "on hours" now.
        ticket_count = old_data['ticket_count_weekend']
        elapsed_time = old_data['elapsed_time_weekend']
    else:
        ticket_count = old_data['ticket_count_weekday']
        elapsed_time = old_data['elapsed_time_weekday']

    (mean, probability) = util.probability(ticket_count, elapsed_time,
                                           num_new_tickets, time_this_period)

    print("%s] TOTAL %s/%ss; %s-: %s/%ss; m=%.3f p=%.3f"
          % (time.strftime("%Y-%m-%d %H:%M:%S %Z"),
             ticket_count, int(elapsed_time),
             start_time, num_new_tickets, time_this_period,
             mean, probability))

    handle_alerts(new_tickets, time_this_period, mean, probability,
                  start_time, end_time)

    if is_off_hours:
        new_data = {"elapsed_time_weekend": (
                        old_data["elapsed_time_weekend"] + time_this_period),
                    "ticket_count_weekend": (
                        old_data["ticket_count_weekend"] + num_new_tickets),
                    "elapsed_time_weekday": old_data["elapsed_time_weekday"],
                    "ticket_count_weekday": old_data["ticket_count_weekday"],
                    }
    else:
        new_data = {"elapsed_time_weekend": old_data["elapsed_time_weekend"],
                    "ticket_count_weekend": old_data["ticket_count_weekend"],
                    "elapsed_time_weekday": (
                        old_data["elapsed_time_weekday"] + time_this_period),
                    "ticket_count_weekday": (
                        old_data["ticket_count_weekday"] + num_new_tickets),
                    }
    new_data['last_time_t'] = end_time

    with open(zendesk_status_file, 'w') as f:
        cPickle.dump(new_data, f)
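# _is_off_hours isn't defined in the snippet above.  Per its comments, the
# "weekend"/"weekday" buckets really mean "off hours"/"on hours".  A hedged
# sketch, assuming off-hours means weekends plus nights outside 08:00-17:00
# local time (the exact cutoffs are my assumption, not the original's):

import datetime


def _is_off_hours(dt):
    """Return True if `dt` falls outside normal support hours."""
    if dt.weekday() in (5, 6):      # Saturday or Sunday
        return True
    return not (8 <= dt.hour < 17)  # nights count as off-hours too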
import json
from sys import stdin, stdout

import util

THRESH = 0.05

# Repeat indefinitely.
while 1:
    # Capture the input from stdin, which is the stream of data from
    # ingest.py.  It is dumped as JSON, so decode it.
    line = stdin.readline()
    edit = json.loads(line)

    # We are extracting the 'wiki' key, which is a unique identifier for
    # the Wikipedia that was edited.
    message = edit.get('wiki')

    # I have written a function in util that gets the probability of a
    # particular message, given the entries in the Redis database.
    prob = util.probability(message)

    # If the probability falls below our threshold, emit a message.
    # Otherwise, loop around.
    if prob < THRESH:
        # This schema (particularly the 'unlikely_message' type) is
        # understood by the slack.py file, which sends the appropriate
        # alerts.
        alert = {
            'type': 'unlikely_message',
            'message': message,
            'prob': prob
        }
        # Print the alert to stdout and flush stdout to prevent message
        # delay from buffering.
        print(json.dumps(alert))
        stdout.flush()
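# The comment above says util.probability(message) derives a probability from
# entries in a Redis database.  A minimal sketch under that assumption; the
# hash name 'message_counts' and the +1 smoothing are my inventions, not the
# original helper's:

import redis

_r = redis.StrictRedis()


def probability(message):
    """Fraction of all observed messages equal to `message`, smoothed."""
    count = int(_r.hget('message_counts', message) or 0)
    total = sum(int(v) for v in _r.hvals('message_counts'))
    return (count + 1.0) / (total + 1.0)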
n_PC) + 'PC/control/'
name_out = mode + '_control_' + 'traj_' + str(traj_n)
plt.savefig(path_out + name_out + ".png")
plt.show()

########## Retrieve maximum number of elements based on a raw decimal discretization ##########
data = mPE_vector.flatten()
kmeans = KMeans(n_clusters=20).fit(data.reshape(-1, 1))
kmeans.predict(data.reshape(-1, 1))
centroids = kmeans.cluster_centers_
centroids = [centroids[i] for i in range(len(centroids))]
centroids = np.asarray(centroids)

########## Get probability vectors ##########
prob1 = probability(mPE_vector[0, :, 0], centroids)
prob2 = probability(mPE_vector[1, :, 0], centroids)

########## Evaluation ##########
significance_lev = significance_level
js_distance = distance.jensenshannon(prob1, prob2)
[_, p_value] = stats.ks_2samp(mPE_vector[0, :, 0], mPE_vector[1, :, 0])
if p_value > 0.8:
    too_low = False
    break
measures[1, 0] = p_value
measures[2, 0] = js_distance[0]
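# probability(values, centroids) isn't defined in the fragment above.  Given
# that its outputs feed a Jensen-Shannon distance, it presumably turns a
# sample into a discrete distribution over the k-means bins.  A sketch under
# that assumption (the exact array shapes aren't recoverable from the
# fragment, so this returns a flat 1-D probability vector):

import numpy as np


def probability(values, centroids):
    """Empirical probability vector of `values` over nearest-centroid bins."""
    values = np.asarray(values).reshape(-1, 1)
    centroids = np.asarray(centroids).reshape(1, -1)
    nearest = np.abs(values - centroids).argmin(axis=1)  # bin index per value
    counts = np.bincount(nearest, minlength=centroids.size)
    return counts / float(counts.sum())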
def main():
    try:
        jira_status_file = util.relative_path('jira')
        with open(jira_status_file) as f:
            old_data = cPickle.load(f)
    except IOError:
        old_data = {'elapsed_times': {},
                    'ticket_counts': collections.defaultdict(int),
                    'last_time_t': None,
                    }

    # We compare the number of tickets in the last few minutes against
    # the historical average for all time.  But we don't start "all
    # time" at AD 1, we start it 100 days ago.
    # Note: this is a way wider window than we use for Zendesk, but we're
    # making exercise-specific recommendations, so we need more data.
    now = int(time.time())
    num_days_in_past = 100
    (num_new_tickets, oldest_ticket_time_t) = num_tickets_between(
        old_data['last_time_t'] or (now - 86400 * num_days_in_past), now)

    # Elapsed time is computed per-exercise, so store values as we go.
    # We use a copy so that exercises that don't appear as new tickets
    # still have their old elapsed times preserved.
    elapsed_times = copy.copy(old_data['elapsed_times'])

    for exercise in num_new_tickets:
        # If this is the first time we're running, we don't have a
        # last_time_t, so we take the oldest ticket for each exercise as
        # its last_time_t.
        last_time_t = (old_data['last_time_t'] or
                       oldest_ticket_time_t[exercise])
        time_this_period = now - last_time_t
        # Avoid divide-by-0 if this is the first time we've seen an exercise.
        time_last_period = old_data['elapsed_times'].get(exercise, 0.0001)
        num_old_tickets_for_exercise = old_data['ticket_counts'][exercise]
        num_new_tickets_for_exercise = num_new_tickets[exercise]

        (mean, probability) = util.probability(num_old_tickets_for_exercise,
                                               time_last_period,
                                               num_new_tickets_for_exercise,
                                               time_this_period)

        print('%s] %s TOTAL %s/%ss; %s-: %s/%ss; m=%.3f p=%.3f'
              % (time.strftime('%Y-%m-%d %H:%M:%S %Z'), exercise,
                 num_old_tickets_for_exercise, int(time_last_period),
                 last_time_t, num_new_tickets_for_exercise,
                 time_this_period, mean, probability))

        if (mean != 0 and probability > 0.9995
                and num_new_tickets_for_exercise > THRESHOLD):
            quoted = urllib.quote(exercise.encode("utf-8"))
            ka_url = "https://khanacademy.org/e/%s" % quoted
            jira_url = ("https://khanacademy.atlassian.net/browse/"
                        "AI-941528?jql=Exercise%%20%%3D%%20%s" % quoted)
            util.send_to_slack(
                "*Elevated bug report rate on exercise `%s`*\n"
                "We saw %s in the last %s minutes,"
                " while the mean indicates we should see around %s."
                " *Probability that this is abnormally elevated: %.4f.*\n"
                " Links: <%s|exercise on Khan Academy>, <%s|JIRA tickets>."
                % (exercise,
                   util.thousand_commas(num_new_tickets_for_exercise),
                   util.thousand_commas(int(time_this_period / 60)),
                   util.thousand_commas(round(mean, 2)),
                   probability, ka_url, jira_url),
                channel='#content-beep-boop')

        elapsed_times[exercise] = time_last_period + time_this_period

    new_ticket_counts = util.merge_int_dicts(old_data['ticket_counts'],
                                             num_new_tickets)
    new_data = {'elapsed_times': elapsed_times,
                'ticket_counts': new_ticket_counts,
                'last_time_t': now,
                }

    with open(jira_status_file, 'w') as f:
        cPickle.dump(new_data, f)
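# util.merge_int_dicts isn't shown.  From its use above it merges two dicts
# of counts by summing values per key; returning a defaultdict(int) keeps it
# compatible with how 'ticket_counts' is initialized.  A sketch consistent
# with that use:

import collections


def merge_int_dicts(a, b):
    """Return a defaultdict(int) with the per-key sums of a and b."""
    merged = collections.defaultdict(int)
    for d in (a, b):
        for key, count in d.items():
            merged[key] += count
    return merged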
def main():
    try:
        zendesk_status_file = util.relative_path("zendesk")
        with open(zendesk_status_file) as f:
            old_data = cPickle.load(f)
    except IOError:
        old_data = {"elapsed_time": 0.0001,  # avoid a divide-by-0
                    "ticket_count": 0,
                    "last_time_t": None,
                    }

    # We compare the number of tickets in the last few minutes against
    # the historical average for all time.  But we don't start "all
    # time" at AD 1, we start it a week ago.  Longer than that and it
    # takes forever due to quota issues.  That's still plenty of
    # historical data. :-)
    #
    # Zendesk seems to wait 5 minutes to update API data :-(, so we
    # ask for data that's a bit time-lagged.
    end_time = int(time.time()) - 300
    start_time = old_data['last_time_t']

    # Set a flag to track whether the current time period is a weekend.
    # Separate ticket_count/elapsed_time stats are kept for weekend vs.
    # weekday to improve sensitivity to increases during low-traffic periods.
    is_weekend = time.localtime().tm_wday in [5, 6]

    (num_new_tickets, oldest_ticket_time_t) = num_tickets_between(
        start_time or (end_time - 86400 * 7), end_time)

    # The first time we run this, we take the starting time to be the
    # time of the first bug report.
    if start_time is None:
        start_time = oldest_ticket_time_t

    time_this_period = end_time - start_time

    # To handle the transition from unsegmented to segmented data, the code
    # below sets the weekend data to mirror the stats from the past 4 months
    # of logs to calculate a mean, and shifts all historical data to the
    # weekday data points.  This will result in some inaccuracy, but the
    # weekend data should skew the weekday data only negligibly.  May cause
    # some skewed alerting during the transition period.
    # TODO(jacqueline): Remove this transition code after August 2017.
    if 'elapsed_time' in old_data:
        old_data['ticket_count_weekday'] = old_data['ticket_count']
        old_data['ticket_count_weekend'] = 555
        old_data['elapsed_time_weekday'] = old_data['elapsed_time']
        old_data['elapsed_time_weekend'] = 2921756.0001

    if is_weekend:
        ticket_count = old_data['ticket_count_weekend']
        elapsed_time = old_data['elapsed_time_weekend']
    else:
        ticket_count = old_data['ticket_count_weekday']
        elapsed_time = old_data['elapsed_time_weekday']

    (mean, probability) = util.probability(ticket_count, elapsed_time,
                                           num_new_tickets, time_this_period)

    print("%s] TOTAL %s/%ss; %s-: %s/%ss; m=%.3f p=%.3f"
          % (time.strftime("%Y-%m-%d %H:%M:%S %Z"),
             ticket_count, int(elapsed_time),
             start_time, num_new_tickets, time_this_period,
             mean, probability))

    handle_alerts(num_new_tickets, time_this_period, mean, probability,
                  start_time, end_time)

    if is_weekend:
        new_data = {"elapsed_time_weekend": (
                        old_data["elapsed_time_weekend"] + time_this_period),
                    "ticket_count_weekend": (
                        old_data["ticket_count_weekend"] + num_new_tickets),
                    "elapsed_time_weekday": old_data["elapsed_time_weekday"],
                    "ticket_count_weekday": old_data["ticket_count_weekday"],
                    }
    else:
        new_data = {"elapsed_time_weekend": old_data["elapsed_time_weekend"],
                    "ticket_count_weekend": old_data["ticket_count_weekend"],
                    "elapsed_time_weekday": (
                        old_data["elapsed_time_weekday"] + time_this_period),
                    "ticket_count_weekday": (
                        old_data["ticket_count_weekday"] + num_new_tickets),
                    }
    new_data['last_time_t'] = end_time

    with open(zendesk_status_file, 'w') as f:
        cPickle.dump(new_data, f)
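# handle_alerts isn't shown in either Zendesk snippet, and the two call it
# with slightly different first arguments (a ticket list vs. a count).
# Judging from the alerting code in the exercise and JIRA monitors, it
# presumably notifies a channel when the probability crosses a threshold.
# A minimal sketch assuming the count form used in the snippet above; the
# 0.997 cutoff mirrors the exercise monitor and is a guess here:


def handle_alerts(num_new_tickets, time_this_period, mean, probability,
                  start_time, end_time):
    """Send an alert if the new-ticket rate looks abnormally elevated."""
    if mean != 0 and probability > 0.997:
        util.send_to_slack(
            "*Elevated Zendesk ticket rate*\n"
            "We saw %s in the last %s minutes, while the mean indicates we"
            " should see around %s."
            " *Probability that this is abnormally elevated: %.4f.*"
            % (util.thousand_commas(num_new_tickets),
               util.thousand_commas(int(time_this_period / 60)),
               util.thousand_commas(round(mean, 2)), probability),
            channel="#support")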