Esempio n. 1
0
def all_prob_devid_given_deltas(filename, histograms, devid_logprobs,
                                questions, groups):
    result = {}
    start = time.time()
    for i, (test_id, samples) in enumerate(infra.readsamples(filename)):
        deltas = infra.unfiltered_deltas(infra.timestamps(samples))
        train_id = questions[test_id]
        result[test_id] = prob_devid_given_deltas(deltas, histograms,
                                                  devid_logprobs, train_id,
                                                  groups.get(train_id, {}))
        if i % 3000 == 0:
            print i, time.time() - start
    return result
Esempio n. 2
0
def get_labels(filename, questions):
    """Return a 0/1 label for every sample set read from ``filename``.

    The label is 1 when the part of the device id before its first ``':'``
    equals the id that ``questions`` proposes for that device id, and 0
    otherwise. Labels are returned in the order ``infra.readsamples``
    yields the entries; the samples themselves are not inspected.
    """
    labels = []
    for devid, _samples in infra.readsamples(filename):
        id_prefix = devid.split(':')[0]
        labels.append(int(id_prefix == questions[devid]))
    return labels
Esempio n. 3
0
def get_features(filename,
                 questions,
                 model,
                 valids,
                 chains=None,
                 prob_scores=None,
                 test_summary=None,
                 test_resolutions=None,
                 test_min_value=None):
    test_summary = test_summary or infra.summarize_all_samples(filename)
    chains = chains or get_all_refined_chains(test_summary, questions)
    groups = group_train_ids(chains, questions)
    #prob_scores = prob_scores or all_prob_devid_given_deltas(filename, train_histograms, group_sample_probs, questions, groups)
    test_resolutions = test_resolutions or infra.summarize_all_samples(
        filename, lambda s: max(
            max((infra.resolution(x), infra.resolution(y), infra.resolution(z))
                ) for t, x, y, z, d in s))
    test_min_value = test_min_value or infra.summarize_all_samples(
        filename, lambda s: min(infra.min_abs_value(w) for w in s))
    print "total chains=", len(chains)

    train_last_timestamps, histograms, hours_histograms, train_logprobs, train_resolutions, train_min_value, train_values, train_counts = model

    test_first_timestamps = {
        devid: summary[-2]
        for devid, summary in test_summary.iteritems()
    }
    train_test_matches = match_test_to_train(train_last_timestamps,
                                             test_first_timestamps)

    chain_starters = {chain[0]: chain for chain in chains}
    train_test_matched_chains = {
        train_id: chain_starters[test_id]
        for test_id, train_id in train_test_matches.iteritems()
        if test_id in chain_starters
    }
    chains_by_devid = devid_to_chains(chains)
    groups = group_train_ids(chains, questions)

    features = []
    validity = []
    qids = []

    for devid, samples in infra.readsamples(filename):
        qids.append(devid)
        proposed_id = questions[devid]
        timestamps = [t for t, x, y, z, did in samples]
        histogram = histograms[proposed_id]
        chain = chains_by_devid[devid]
        chain_key = chain[0]

        values = set(abs(x) for t, x, y, z, d in samples) | set(
            abs(y) for t, x, y, z, d in samples) | set(
                abs(z) for t, x, y, z, d in samples)
        common_values = len(values & train_values[proposed_id])
        missing_values = len(values) - common_values
        total_values = len(values | train_values[proposed_id])
        missing_values_log_prob = train_counts[proposed_id] * log(
            (total_values - missing_values) / float(total_values))

        chain_associated = 0
        if train_test_matches.get(chain[0], None) == proposed_id:
            chain_associated = 1
        elif train_test_matches.get(chain[0], None):
            chain_associated = -1

        if proposed_id not in valids[devid]:
            validity.append(-1)
            features.append(None)
            continue
        elif len(valids[devid]) == 1:
            validity.append(2)
            features.append(None)
            continue
        else:
            validity.append(1)

        neg_resolution = min(
            train_resolutions[proposed_id] - test_resolutions[devid], 0)
        pos_resolution = max(
            train_resolutions[proposed_id] - test_resolutions[devid], 4)

        deltas = [t1 - t0 for t0, t1 in zip(timestamps, timestamps[1:])]
        hours_histogram = hours_histograms[proposed_id]
        hours = Counter(
            (infra.parse_timestamp(t).hour + 1) % 24 for t in timestamps)
        hour_prob = sum(((hours_histogram[hour] + 100.0) * count) /
                        (2400 + len(timestamps))
                        for hour, count in hours.iteritems())

        fellow_score = calc_fellow_score(chain, devid, proposed_id, questions,
                                         groups[proposed_id])
        prob_score = prob_scores[devid]
        prob_score = prob_score / 1000.0
        group_score = calc_group_score(groups[proposed_id], proposed_id,
                                       train_logprobs)
        min_value_ratio = max(
            train_min_value[proposed_id] / test_min_value[devid], 1)

        features.append((
            prob_score,
            fellow_score,
            log(hour_prob),
            1 * chain_associated,
            neg_resolution,
            pos_resolution,
            group_score,
            min_value_ratio,
            common_values,
            missing_values,
            missing_values_log_prob,
            train_counts[proposed_id],
        ))

    return qids, validity, features