def calculate_merge_actions(source_entries, dest_entries, revisions):
    actions = iter_merge_actions_without_moves(
        source_entries, dest_entries, revisions)
    action_by_type = groupby(actions, MergeAction.get_type)
    touches, copies, moves, deletes, undeletes, updates, uphists, conflicts = \
        (action_by_type.get(action_type, []) for action_type in MergeActionType)
    moves = []
    unmoved_copies = []
    # If a copy also has a matching delete, mark it as a "move".
    deletes_by_hash = groupby(
        deletes, lambda delete: delete.older.hash if delete.older else None)
    for action in copies:
        deletes_of_hash = deletes_by_hash.get(action.newer.hash, [])
        if action.newer.hash and deletes_of_hash:
            # Pop so we only match a given delete once.  We leave the
            # delete in the actions so that it still ends up in the
            # history and merge data, but we don't put it in the
            # revisions.
            delete = deletes_of_hash.pop()
            moves.append(action.alter(
                type=MergeActionType.move, details=delete.older))
        else:
            unmoved_copies.append(action)
    copies = unmoved_copies
    return (touches, copies, moves, deletes, undeletes, updates,
            uphists, conflicts)
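# calculate_merge_actions calls .get() on the result of groupby, so the
# project's groupby is a dict-returning helper rather than itertools.groupby.
# A minimal sketch of such a helper, under that assumption (the real helper's
# name and signature may differ):
from collections import defaultdict

def groupby_to_dict(iterable, key):
    """Group items into a dict mapping key(item) -> list of items."""
    groups = defaultdict(list)
    for item in iterable:
        groups[key(item)].append(item)
    return dict(groups)

# Example: groupby_to_dict(["apple", "avocado", "banana"], key=lambda w: w[0])
# -> {"a": ["apple", "avocado"], "b": ["banana"]}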
def gen_week_skillsets(xml):
    # Returns the ISO calendar week number of the score (1-53).
    def week_from_score(score) -> int:
        dt = parsedate(score.findtext("DateTime"))
        return dt.isocalendar()[1]

    chronological_scores = sorted(
        iter_scores(xml), key=lambda s: s.findtext("DateTime"))

    week_start_datetimes: List[datetime] = []
    diffsets: List[List[float]] = []

    for week, scores_in_week in util.groupby(chronological_scores, week_from_score):
        diffset = [0, 0, 0, 0, 0, 0, 0]
        for score in scores_in_week:
            skillset_ssrs = score.find("SkillsetSSRs")
            if skillset_ssrs is None:
                continue
            diffs = [float(diff.text) for diff in skillset_ssrs[1:]]
            main_diff = diffs.index(max(diffs))
            diffset[main_diff] += 1

        total = sum(diffset)
        if total == 0:
            continue
        diffset = [diff / total * 100 for diff in diffset]

        year = scores_in_week[0].findtext("DateTime")[:4]
        week_start_datetime = datetime.strptime(f"{year} {week} {0}", "%Y %W %w")

        diffsets.append(diffset)
        week_start_datetimes.append(week_start_datetime)
    return (week_start_datetimes, diffsets)
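# The week_start_datetime line above rebuilds a date from a year, a week
# number, and a weekday digit via strptime.  A self-contained check of that
# "%Y %W %w" pattern (the dates are chosen only for illustration); note that
# the week number above comes from isocalendar(), whose numbering does not
# always match strptime's %W weeks:
from datetime import datetime

# Under %W, week 5 of 2021 runs Mon 2021-02-01 .. Sun 2021-02-07, and
# "%w" == 0 selects the Sunday of that week.
dt = datetime.strptime("2021 5 0", "%Y %W %w")
assert dt == datetime(2021, 2, 7)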
def aggregate_timedeltas(self, col_ids, aggr_func=None):
    """col_ids is the list of column indices that should be aggregated.
    The aggregation function can be specified, but defaults to summing
    the timedeltas, and always acts over the time columns.

    Please note that index numbers follow this order:
    id, resource_type, resource_id, operation, username, time_started, time_ended
    """
    if aggr_func is None:
        aggr_func = ASADataSet.sum_timedeltas

    def set_keys(*indices):
        """Returns a function that returns a tuple of key values."""
        def get_keys(seq, indices=indices):
            return tuple(seq[i] for i in indices)
        return get_keys

    keyfunc = set_keys(*col_ids)
    aggregated = []
    for k, v in groupby(self.data, key=keyfunc):
        aggregated.append(tuple(list(k) + [aggr_func(v)]))
    return ASADataSet(
        ['resource_type', 'operation', 'username', 'duration'], aggregated)
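# aggregate_timedeltas iterates groupby(self.data, key=keyfunc) as (key, rows)
# pairs, i.e. itertools.groupby semantics, so self.data must already be
# sorted/contiguous on the key columns or groups will be split.  A small
# self-contained illustration of the same pattern (the sample rows and the
# local sum_timedeltas below are made up for the example):
from datetime import datetime, timedelta
from itertools import groupby

def sum_timedeltas(rows):
    # Sum (time_ended - time_started) over an iterable of rows.
    return sum((end - start for _, start, end in rows), timedelta(0))

rows = [  # (operation, time_started, time_ended), already sorted by operation
    ("backup", datetime(2024, 1, 1, 10), datetime(2024, 1, 1, 11)),
    ("backup", datetime(2024, 1, 1, 12), datetime(2024, 1, 1, 14)),
    ("restore", datetime(2024, 1, 2, 9), datetime(2024, 1, 2, 9, 30)),
]
for op, group in groupby(rows, key=lambda r: r[0]):
    print(op, sum_timedeltas(group))
# backup 3:00:00
# restore 0:30:00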
def earlyorder(*goals):
    """Reorder goals to avoid EarlyGoalErrors

    All goals are evaluated.  Those that raise EarlyGoalErrors are placed
    at the end, wrapped in a lallearly goal.

    See also:
        EarlyGoalError
    """
    groups = groupby(earlysafe, goals)
    good = groups.get(True, [])
    bad = groups.get(False, [])

    if not good:
        raise EarlyGoalError()
    elif not bad:
        return tuple(good)
    else:
        return tuple(good) + ((lallearly,) + tuple(bad),)
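# earlyorder passes the key function first and indexes the result with
# True/False, which matches toolz.groupby(key, seq) returning a dict of
# lists.  A hedged illustration of that calling convention with a stand-in
# predicate (whether toolz is the actual dependency here is an assumption):
from toolz import groupby

def is_even(n):
    return n % 2 == 0

groups = groupby(is_even, [1, 2, 3, 4, 5])
good = groups.get(True, [])   # [2, 4]
bad = groups.get(False, [])   # [1, 3, 5]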
def _transpose_activity_times(self, activity_times):
    def get_duration_for_user_and_activity(user, activity_type):
        for row in activity_times:
            if row[0] == user and row[1] == activity_type:
                return row[2]
        return timedelta(0)

    blanked_and_ordered_activity_times = []
    import numpy as np
    for user in set(np.array(activity_times)[:, 0]):  # unique users
        for activity_type in Measurements.to_list():
            blanked_and_ordered_activity_times.append([
                user,
                activity_type,
                get_duration_for_user_and_activity(user, activity_type),
            ])

    transposed_activity_times = [
        [user] + np.array(list(row)).transpose().tolist()[2:][0]
        for user, row in groupby(blanked_and_ordered_activity_times,
                                 lambda x: x[0])
    ]
    return transposed_activity_times
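# _transpose_activity_times pivots (user, activity, duration) rows into one
# row per user with durations in a fixed activity order.  A numpy-free sketch
# of the same pivot, with made-up activity names standing in for
# Measurements.to_list():
from datetime import timedelta

ACTIVITIES = ["coding", "review"]  # stand-in for Measurements.to_list()

rows = [
    ("alice", "coding", timedelta(hours=2)),
    ("bob", "review", timedelta(minutes=45)),
]
durations = {(user, act): dur for user, act, dur in rows}
pivoted = [
    [user] + [durations.get((user, act), timedelta(0)) for act in ACTIVITIES]
    for user in sorted({user for user, _, _ in rows})
]
# pivoted == [["alice", timedelta(hours=2), timedelta(0)],
#             ["bob", timedelta(0), timedelta(minutes=45)]]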
def group_history_by_peerid(entries):
    return groupby(entries, operator.itemgetter(1), into=History)
def group_history_by_gpath(entries):
    # TODO: Make faster (give entry a cached gpath)?
    return groupby(entries, HistoryEntry.get_gpath, into=History)
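# Both helpers above pass into=History, so this project's groupby apparently
# lets the caller choose the container built for the grouped entries.  A
# minimal sketch of one plausible reading (each group is converted to the
# given container); the signature is inferred from these call sites rather
# than confirmed, and the real helper may wrap the result differently:
def groupby_into(iterable, key, into=list):
    groups = {}
    for item in iterable:
        groups.setdefault(key(item), []).append(item)
    return {k: into(v) for k, v in groups.items()}

# Example: group (peer_id, path) tuples into tuples per peer.
# groupby_into([("p1", "a"), ("p2", "b"), ("p1", "c")],
#              key=lambda e: e[0], into=tuple)
# -> {"p1": (("p1", "a"), ("p1", "c")), "p2": (("p2", "b"),)}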
def format_top_features(args):
    # Get the list of features, with other data.
    tables = util.get_tables_to_join()
    all_results = []
    for tab in tables:
        print("processing table", tab)
        try:
            columns = [col for col, tp in util.get_table_schema_w_types(tab)]
            fname = "%s_w_nonnull_counts.csv" % tab
            for l in open(fname, 'r').readlines():
                parts = l.strip().split(util.DELIM)
                date, pfcode, bootstrap_sample = parts[0], parts[1], parts[2]
                cors_and_nonnull_counts = parts[3:]
                cors = [str_to_corr(x) for x in cors_and_nonnull_counts[0::2]]
                nonnull_fail_counts = [
                    (int(x) if x.isdigit() else 0)
                    for x in cors_and_nonnull_counts[1::2]]
                new_results = [
                    {"date": date,
                     "product": util.PRODUCT,
                     "metric": util.FEATURE_WEIGHT_METRIC,
                     "bootstrap_sample": bootstrap_sample,
                     "pfcode": pfcode,
                     "table": tab,
                     "feature": feature,
                     "weight": weight,
                     "n_nonnull_fails": n_nonnull_fails}
                    for feature, weight, n_nonnull_fails
                    in zip(columns, cors, nonnull_fail_counts)]
                new_valid_results = [
                    r for r in new_results
                    if r["weight"] != util.NULL_WEIGHT
                    and r["weight"] < util.MAX_WEIGHT
                    and r["n_nonnull_fails"] >= util.MIN_N_NONNULL_DATAPOINTS]
                del new_results
                all_results.extend(new_valid_results)
        except Exception:
            print("failed")
            continue

    valid_results = all_results
    print("len valid", len(valid_results))
    if not util.INCLUDE_PFCODE_OTHER:
        valid_results = [r for r in valid_results
                         if r["pfcode"] != util.OTHER_PFCODE]

    # Aggregate across each bootstrap sample to get an aggregate weight.
    by_pfcode_date_table_feature = util.groupby(
        valid_results, ["pfcode", "date", "table", "feature"])
    agg_results = {}
    for k, recs in by_pfcode_date_table_feature.items():
        agg_weight = aggregate_cors([r["weight"] for r in recs])
        first_rec = recs[0].copy()
        first_rec["weight"] = agg_weight
        agg_results[k] = first_rec

    # Group features by date/pfcode and take the top args.num_top_features.
    by_pfcode_date = util.groupby(agg_results.values(), ["pfcode", "date"])
    output_results = []
    top_errors = set(util.get_top_errors(args))
    for key, lst in by_pfcode_date.items():
        pf, date = key
        if pf not in top_errors:
            continue
        lst.sort(key=lambda r: str_to_corr(r["weight"]), reverse=True)
        ln = min(len(lst), args.num_top_features)
        output_results.extend(lst[:ln])

    split_output_lines = [
        [str(record[c]) for c in util.TOP_FEATURES_SCHEMA]
        for record in output_results]
    output_lines = [util.DELIM.join(l) for l in split_output_lines]
    open(util.TOP_FEATURES_FNAME, "w").write("\n".join(output_lines))
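# The last loop above keeps the top args.num_top_features records per
# (pfcode, date) group by sorting each group on weight and slicing.  The same
# selection can also be written with heapq.nlargest, shown here on made-up
# records:
import heapq

records = [
    {"feature": "f1", "weight": 0.9},
    {"feature": "f2", "weight": 0.4},
    {"feature": "f3", "weight": 0.7},
]
top_two = heapq.nlargest(2, records, key=lambda r: r["weight"])
# top_two == [{"feature": "f1", "weight": 0.9}, {"feature": "f3", "weight": 0.7}]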