import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.preprocessing import StandardScaler

# logger and hour_str2datetime_interval are project-local helpers
# assumed to be imported elsewhere in this module.


def run(self, data_reader, odir, n_return, penalty_mode):
    log_handle, log_system = logger(odir)
    saliences_pen = None
    for hour, labels, unicodes, saliences, X in data_reader:
        time_int = hour_str2datetime_interval(hour)
        n_points_total = X.shape[0]

        ### DEDUP ###
        I = self.unique_indices(unicodes, saliences)
        unicodes = unicodes[I]
        saliences = saliences[I]
        X = X[I, :]
        labels = [labels[idx] for idx in I]
        n_points = X.shape[0]

        print "System time: {} -- {}".format(time_int.start, time_int.stop)
        print "Received {} unique sentences from {} total".format(
            n_points, n_points_total)

        # Once updates exist, optionally penalize saliences against
        # what has already been selected.
        if self.use_temp_ is True and self.n_updates_ > 0:
            saliences_pen = self.penalize_salience(
                saliences, X, penalty_mode)
            ranks = saliences_pen
        else:
            ranks = saliences

        # Rank all candidates and take the top n_return as updates.
        sorted_idxs = sorted(
            range(n_points), key=lambda x: ranks[x], reverse=True)
        update_idxs = sorted_idxs[0:n_return]

        for e in update_idxs:
            if saliences_pen is not None:
                print saliences[e], saliences_pen[e], unicodes[e].encode(u'utf-8')
            else:
                print saliences[e], unicodes[e].encode(u'utf-8')
        print

        self.add_updates(
            update_idxs, time_int, labels, unicodes, saliences,
            saliences_pen, X)
        self.write_iterative_summary(
            odir, time_int.stop.strftime(u'%Y-%m-%d-%H'))
        log_system(
            sorted_idxs, time_int, labels, unicodes, saliences,
            saliences_pen)

    self.write_updates(odir)
    log_handle.close()
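# The dedup step above calls self.unique_indices, which is not shown in
# this section. Below is a minimal sketch of the assumed behavior:
# keep, for each distinct sentence string, the index of its
# highest-salience occurrence, optionally with duplicate counts. The
# name and body are hypothetical, not the class's actual method.
def unique_indices_sketch(unicodes, saliences, return_counts=False):
    groups = {}
    for idx, sent in enumerate(unicodes):
        groups.setdefault(sent, []).append(idx)
    I, counts = [], []
    for members in groups.itervalues():
        # Represent each duplicate group by its most salient member.
        I.append(max(members, key=lambda i: saliences[i]))
        counts.append(len(members))
    if return_counts:
        return np.array(I), np.array(counts)
    return np.array(I)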
# Variant of run() that, instead of taking the top-n_return by rank,
# clusters the candidates with affinity propagation and emits cluster
# exemplars as updates.
def run(self, data_reader, odir, penalty_mode, scale=1.0, repulsion=1.0,
        update_cutoff=1.0, update_sim_threshold=.75):
    stdsclr = StandardScaler()
    log_handle, log_system = logger(odir)
    saliences_pen = None
    for hour, labels, unicodes, saliences, X in data_reader:
        time_int = hour_str2datetime_interval(hour)
        n_points_total = X.shape[0]

        # I = self.simple_filter(unicodes)
        # unicodes = unicodes[I]
        # saliences = saliences[I]
        # X = X[I, :]
        # labels = [labels[idx] for idx in I]

        ### REMOVE INPUTS THAT MATCH PREVIOUS UPDATES ###
        I = self.non_update_matching_indices(
            X, threshold=update_sim_threshold)
        unicodes = unicodes[I]
        saliences = saliences[I]
        X = X[I, :]
        labels = [labels[idx] for idx in I]

        ### DEDUP AND COUNT DUPLICATES ###
        I, counts = self.unique_indices(
            unicodes, saliences, return_counts=True)
        unicodes = unicodes[I]
        saliences = saliences[I]
        X = X[I, :]
        labels = [labels[idx] for idx in I]
        n_points = X.shape[0]

        print "System time: {} -- {}".format(time_int.start, time_int.stop)
        print "Received {} unique sentences from {} total".format(
            n_points, n_points_total)

        # Affinity propagation needs at least two points to cluster.
        if n_points <= 1:
            continue

        if self.use_temp_ is True and self.n_updates_ > 0:
            saliences_pen = self.penalize_salience(
                saliences, X, penalty_mode, scale=repulsion)
            ranks = saliences_pen
        else:
            ranks = saliences

        ### Init Preferences and Similarities ###
        P = self.compute_preferences(ranks, n_points, counts, scale)
        A = self.compute_affinities(X, P, counts)
        af = AffinityPropagation(
            preference=P, affinity='precomputed', max_iter=1000,
            damping=.7, verbose=True).fit(A)
        exemplars = af.cluster_centers_indices_
        assignments = exemplars[af.labels_]

        # Standardize the (penalized) saliences so update_cutoff is in
        # units of standard deviations.
        if saliences_pen is None:
            ranks = stdsclr.fit_transform(saliences[:, np.newaxis])
        else:
            ranks = stdsclr.fit_transform(saliences_pen[:, np.newaxis])

        # Keep exemplars that are salient enough and that represent a
        # cluster of more than one point.
        update_idxs = [e for e in exemplars
                       if ranks[e] > update_cutoff
                       and np.where(assignments == e)[0].shape[0] > 1]

        # Log order: each exemplar followed by its cluster members.
        sorted_idxs = []
        for e in exemplars:
            sorted_idxs.append(e)
            for m in np.where(assignments == e)[0]:
                if e != m:
                    sorted_idxs.append(m)

        self.add_updates(
            update_idxs, time_int, labels, unicodes, saliences,
            saliences_pen, X)
        self.write_iterative_summary(
            odir, time_int.stop.strftime(u'%Y-%m-%d-%H'))
        log_system(
            sorted_idxs, time_int, labels, unicodes, saliences,
            saliences_pen)

        for e in update_idxs:
            if saliences_pen is not None:
                print saliences[e], saliences_pen[e],
                print unicodes[e].encode(u'utf-8')
            else:
                print saliences[e], unicodes[e].encode(u'utf-8')
        print

    self.write_updates(odir)
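# For reference, a self-contained toy of the clustering pattern used
# above: AffinityPropagation over a precomputed affinity matrix, with a
# per-point preference vector steering which points become exemplars.
# The data, preferences, and _toy names below are illustrative only;
# the real preference and affinity matrices come from
# compute_preferences and compute_affinities.
from sklearn.metrics.pairwise import cosine_similarity

X_toy = np.array([[1., 0.], [.9, .1], [0., 1.], [.1, .9]])
A_toy = cosine_similarity(X_toy)      # pairwise similarities as affinities
P_toy = np.array([0., -1., 0., -1.])  # higher preference -> likelier exemplar
af_toy = AffinityPropagation(
    preference=P_toy, affinity='precomputed', damping=.7).fit(A_toy)
print af_toy.cluster_centers_indices_  # indices of the chosen exemplars
print af_toy.labels_                   # cluster assignment for each point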