def train(filename):
    """Summarize the training file into the model tuple used for scoring.

    Every element is produced by ``infra.summarize_all_samples(filename, fn)``
    (which appears to return a dict keyed by device id, judging by the
    ``iteritems()`` loops below -- confirm against ``infra``).

    Returns an 8-tuple, in this order:
        last_timestamps  -- first field of the last sample per key
        histograms       -- ``infra.histogram_samples`` output wrapped in
                            ``SmoothHistogram``
        hour_histograms  -- ``infra.histogram_hours`` output
        train_logprobs   -- log(count) - log(total count) per key
        resolutions      -- largest ``infra.resolution`` over the x/y/z fields
        train_min_value  -- smallest ``infra.min_abs_value`` over the samples
        values           -- set of absolute x/y/z values seen
        train_counts     -- number of samples per key
    """

    def _final_timestamp(samples):
        # Samples unpack as (t, x, y, z, d); field 0 of the last one.
        return samples[-1][0]

    def _peak_resolution(samples):
        # Largest per-axis resolution seen in any sample.
        return max(infra.resolution(axis)
                   for _t, x, y, z, _d in samples
                   for axis in (x, y, z))

    def _smallest_abs_value(samples):
        return min(infra.min_abs_value(sample) for sample in samples)

    def _abs_axis_values(samples):
        # Every distinct magnitude observed on any of the three axes.
        return set(abs(axis)
                   for _t, x, y, z, _d in samples
                   for axis in (x, y, z))

    raw_histograms = infra.summarize_all_samples(filename,
                                                 infra.histogram_samples)
    hour_histograms = infra.summarize_all_samples(filename,
                                                  infra.histogram_hours)
    last_timestamps = infra.summarize_all_samples(filename, _final_timestamp)

    smoothed_histograms = {}
    for key, raw in raw_histograms.iteritems():
        smoothed_histograms[key] = SmoothHistogram(raw)

    train_counts = infra.summarize_all_samples(filename, len)
    grand_total = sum(train_counts.values())
    train_logprobs = {}
    for devid, count in train_counts.iteritems():
        # log(total) stays inside the loop so it is never evaluated when
        # there are no counts at all.
        train_logprobs[devid] = log(count) - log(grand_total)

    resolutions = infra.summarize_all_samples(filename, _peak_resolution)
    train_min_value = infra.summarize_all_samples(filename,
                                                  _smallest_abs_value)
    values = infra.summarize_all_samples(filename, _abs_axis_values)

    return (
        last_timestamps,
        smoothed_histograms,
        hour_histograms,
        train_logprobs,
        resolutions,
        train_min_value,
        values,
        train_counts,
    )
def get_features(filename, questions, model, valids, chains=None,
                 prob_scores=None, test_summary=None, test_resolutions=None,
                 test_min_value=None):
    """Build one feature tuple per test device read from ``filename``.

    Args:
        filename: test sample file, read via ``infra.readsamples`` and
            ``infra.summarize_all_samples``.
        questions: mapping ``devid -> proposed training id``.
        model: the 8-tuple returned by ``train``. The sample histograms
            (second element) are not used here.
        valids: mapping ``devid -> collection of admissible training ids``.
        chains: optional precomputed chains; computed with
            ``get_all_refined_chains`` when falsy.
        prob_scores: mapping ``devid -> score`` (divided by 1000 here). The
            fallback that used to compute it is disabled, so it must be
            supplied whenever at least one question is actually scored.
        test_summary, test_resolutions, test_min_value: optional precomputed
            per-device summaries; recomputed from ``filename`` when falsy.

    Returns:
        ``(qids, validity, features)`` -- three parallel lists in file order.
        ``validity`` is -1 when the proposed id is not in ``valids[devid]``,
        2 when ``valids[devid]`` holds exactly one candidate, and 1 otherwise.
        ``features`` is ``None`` for the -1 and 2 cases, else the tuple
        ``(prob_score, fellow_score, log(hour_prob), chain_associated,
        neg_resolution, pos_resolution, group_score, min_value_ratio,
        common_values, missing_values, missing_values_log_prob,
        train_count)``.

    Raises:
        TypeError: if a question has to be scored and ``prob_scores`` is None.
    """
    test_summary = test_summary or infra.summarize_all_samples(filename)
    chains = chains or get_all_refined_chains(test_summary, questions)
    test_resolutions = test_resolutions or infra.summarize_all_samples(
        filename,
        lambda s: max(infra.resolution(axis)
                      for t, x, y, z, d in s
                      for axis in (x, y, z)))
    test_min_value = test_min_value or infra.summarize_all_samples(
        filename, lambda s: min(infra.min_abs_value(w) for w in s))
    # Single-argument parenthesised form: emits the same text as the former
    # Python 2 print statement and also parses under Python 3.
    print("total chains= %d" % len(chains))

    (train_last_timestamps, _sample_histograms, hours_histograms,
     train_logprobs, train_resolutions, train_min_value, train_values,
     train_counts) = model

    # NOTE(review): the variable name says "first" timestamp; it is taken
    # from summary[-2] -- confirm against the layout of the default summary.
    test_first_timestamps = {
        devid: summary[-2] for devid, summary in test_summary.iteritems()
    }
    train_test_matches = match_test_to_train(train_last_timestamps,
                                             test_first_timestamps)
    chains_by_devid = devid_to_chains(chains)
    groups = group_train_ids(chains, questions)

    features = []
    validity = []
    qids = []
    for devid, samples in infra.readsamples(filename):
        qids.append(devid)
        proposed_id = questions[devid]
        timestamps = [t for t, x, y, z, did in samples]
        chain = chains_by_devid[devid]

        # Overlap between the magnitudes seen in this test device and those
        # seen in training for the proposed id.
        values = set(abs(axis)
                     for t, x, y, z, d in samples
                     for axis in (x, y, z))
        known_values = train_values[proposed_id]
        common_values = len(values & known_values)
        missing_values = len(values) - common_values
        total_values = len(values | known_values)
        missing_values_log_prob = train_counts[proposed_id] * log(
            (total_values - missing_values) / float(total_values))

        # +1: the chain's first device was matched to the proposed id,
        # -1: it was matched to some other (truthy) id, 0: no match.
        matched_train_id = train_test_matches.get(chain[0], None)
        chain_associated = 0
        if matched_train_id == proposed_id:
            chain_associated = 1
        elif matched_train_id:
            chain_associated = -1

        if proposed_id not in valids[devid]:
            validity.append(-1)
            features.append(None)
            continue
        if len(valids[devid]) == 1:
            validity.append(2)
            features.append(None)
            continue
        validity.append(1)

        if prob_scores is None:
            raise TypeError(
                "get_features() requires prob_scores to score device %r; "
                "no fallback computation is available" % (devid,))

        resolution_gap = (train_resolutions[proposed_id]
                          - test_resolutions[devid])
        neg_resolution = min(resolution_gap, 0)
        # NOTE(review): the floor of 4 is asymmetric with the 0 ceiling
        # above; kept as-is because downstream consumers may rely on it --
        # confirm whether 0 was intended.
        pos_resolution = max(resolution_gap, 4)

        hours_histogram = hours_histograms[proposed_id]
        hours = Counter(
            (infra.parse_timestamp(t).hour + 1) % 24 for t in timestamps)
        hour_prob = sum(((hours_histogram[hour] + 100.0) * count)
                        / (2400 + len(timestamps))
                        for hour, count in hours.iteritems())

        fellow_score = calc_fellow_score(chain, devid, proposed_id,
                                         questions, groups[proposed_id])
        prob_score = prob_scores[devid] / 1000.0
        group_score = calc_group_score(groups[proposed_id], proposed_id,
                                       train_logprobs)
        # NOTE(review): under Python 2 this is floor division if both
        # operands are ints -- confirm the min values are floats.
        min_value_ratio = max(
            train_min_value[proposed_id] / test_min_value[devid], 1)

        features.append((
            prob_score,
            fellow_score,
            log(hour_prob),
            chain_associated,
            neg_resolution,
            pos_resolution,
            group_score,
            min_value_ratio,
            common_values,
            missing_values,
            missing_values_log_prob,
            train_counts[proposed_id],
        ))
    return qids, validity, features