def __wait(self):
    """Wait on waitable started tasks in this manager."""
    for obj in self.__tasks:
        if obj._state == RUNNING and hasattr(obj, "wait") and obj.wait:
            assert obj._manager == self
            with Progress("Waiting on %s" % obj.name):
                self.__callMeMethod(obj.wait)
class SearchTerm:
    def __init__(self, filename):
        # Read the word list, dropping empty lines.
        with open(filename, 'r') as f:
            words = list(filter(bool, f.read().split('\n')))

        # Record and store progress
        progress_filename = filename + '.progress'
        self.progress = Progress(current_file=progress_filename)

        # Load progress
        if self.progress.current:
            self.word_list = words[self.progress.current[0]:]
        else:
            self.word_list = words
        self.counter = 0

    def current(self):
        """Return the index of the current word in the word list."""
        return self.progress.current

    def get_counter(self):
        """Return how many times next() has been called, i.e. the progress of the current session."""
        return self.counter

    def next(self):
        self.progress.next([0], [len(self.word_list)])
        self.progress.save()
        self.counter += 1

    def get_word_list(self):
        return self.word_list

    def expand_word_list(self, expand_type):
        if expand_type == "suggest":
            self.google_suggest()
        elif expand_type == "search":
            pass
        else:
            pass

    def collect(self):
        pass

    def google_suggest(self):
        pass
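# Hypothetical usage sketch of SearchTerm: the word file name and the per-word
# work are assumptions, but it shows the intended resume-from-progress pattern.
if __name__ == '__main__':
    terms = SearchTerm('words.txt')
    for word in terms.get_word_list():
        # ... do the per-word work here ...
        terms.next()  # persist the position so an interrupted run can resume
    print('processed %d words this session' % terms.get_counter())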
def upload(path):
    file_size = os.stat(path).st_size
    if file_size <= 5 * 1024 * 1024:
        # Small files (<= 5 MiB) go up in a single signed PUT request.
        url = get_signed_url(path, "PUT")
        progress = Progress()
        stream = FileWithCallback(path, 'rb', progress.update, path)
        response = requestor.session.put(url, data=stream)
        return response
    else:
        return multipart_upload(path)
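# The FileWithCallback wrapper used in upload() is not defined in this snippet.
# Below is a minimal, hypothetical sketch of such a wrapper, assuming the callback
# receives the number of bytes just read plus the extra arguments bound at
# construction time (here, the file path); the real class may differ.
import os


class FileWithCallbackSketch(object):
    """File-like object that reports read progress to a callback."""

    def __init__(self, path, mode, callback, *callback_args):
        self._file = open(path, mode)
        self._callback = callback
        self._callback_args = callback_args
        # requests can use a `len` attribute to infer Content-Length for streamed bodies.
        self.len = os.stat(path).st_size

    def read(self, size=-1):
        chunk = self._file.read(size)
        if chunk:
            self._callback(len(chunk), *self._callback_args)
        return chunk

    def close(self):
        self._file.close()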
def test_Progress():
    # multiple values
    progress = Progress(current_file='../../data/trend/no_file')
    assert_equal(progress.next([8, 0, 0, 0, 0], [10, 9, 9, 9, 9]), [9, 0, 0, 0, 0])
    assert_equal(progress.next([8, 0, 0, 0, 0], [10, 9, 9, 9, 9]), [8, 1, 0, 0, 0])
    assert_equal(progress.next([8, 0, 0, 0, 0], [10, 9, 9, 9, 9]), [9, 1, 0, 0, 0])

    # single value
    progress = Progress(current_file='../../data/trend/no_file', start=[0])
    assert_equal(progress.next([0], [3]), [1])
    assert_equal(progress.next([0], [3]), [2])
    assert_equal(progress.next([0], [3]), None)

    # start from middle
    progress = Progress(current_file='../../data/trend/no_file', start=[3])
    assert_equal(progress.next([0], [6]), [4])
    assert_equal(progress.next([0], [6]), [5])
    assert_equal(progress.next([0], [6]), None)
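# The tests above pin down an odometer-style contract for Progress.next(start, stop):
# the first digit advances on every call, a digit that reaches its (exclusive) stop
# value resets to its start value and carries into the next digit, and None is
# returned once every digit has wrapped. A minimal sketch of that behaviour,
# ignoring the current_file persistence, might look like this (hypothetical, not
# the implementation under test):
class OdometerSketch(object):
    def __init__(self, start=None):
        self.current = list(start) if start else None

    def next(self, start, stop):
        if self.current is None:
            self.current = list(start)
        for i in range(len(self.current)):
            self.current[i] += 1
            if self.current[i] < stop[i]:
                return list(self.current)
            # carry: reset this digit to its start value and advance the next one
            self.current[i] = start[i]
        return None  # every digit wrapped: iteration exhausted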
def __stop(self):
    """Stop all running tasks in this manager in reverse of the order they
    were started in."""
    for obj in reversed(self.__tasks):
        if obj._state == STOPPED:
            continue
        assert obj._manager == self
        if hasattr(obj, "stop") and obj.stop:
            with Progress("Stopping %s" % obj.name):
                self.__callMeMethod(obj.stop)
        obj._state = STOPPED
        obj._manager = None
def __start(self):
    """Start all tasks in this manager."""
    for obj in self.__tasks:
        if obj._state == RUNNING:
            if obj._manager != self:
                raise ValueError("Task %s is running in another manager" % obj.name)
            continue
        if hasattr(obj, "start") and obj.start:
            with Progress("Starting %s" % obj.name):
                self.__callMeMethod(obj.start)
        obj._state = RUNNING
        obj._manager = self
def sampleAndGetStreetImageLinks(endPoints, sampleNum, picNum, ptrNum,
                                 intersectionPointInfo):
    """
    Randomly select end points from the endPoint collection. For each selected
    end point, call the Google Maps Street View image API to get links to the
    street view images.
    :return: list of (picture number, file name, lat and lng, link to image)
    """
    print "sampling street images..."
    # Sample 2x as many points so that images missing their date can be skipped later.
    sampledPoints = random.sample(endPoints, sampleNum) \
        if sampleNum < len(endPoints) * 2 else endPoints

    sampleData = []  # stores (picture number, file name, lat and lng, link to image)
    progress = Progress(10)
    headings = CONFIG["gmap"]["headings"]
    sampleNumDelta = len(headings)
    for point in sampledPoints:
        progress.printProgress()
        result = getSurroundingStreetViewLinks(point, picNum, ptrNum,
                                               intersectionPointInfo)
        sampleData += result
        picNum += sampleNumDelta
        ptrNum += 1
    print ""
    return sampleData
def sampleAndDownloadStreetImage(endPoints, sampleNum, picNum, ptrNum,
                                 targetDirectory, intersectionPointInfo):
    """
    Randomly select end points from the endPoint collection. For each selected
    end point, call the Google Maps Street View image API to download the
    street view images.
    :return: list of (picture number, file name, lat and lng)
    """
    print "downloading street images..."
    sampledPoints = random.sample(endPoints, sampleNum) \
        if sampleNum < len(endPoints) else endPoints

    sampleData = []  # stores (picture number, file name, lat and lng)
    progress = Progress(10)
    headings = CONFIG["gmap"]["headings"]
    sampleNumDelta = len(headings)
    for point in sampledPoints:
        progress.printProgress()
        result = downloadSurroundingStreetView(point, targetDirectory, picNum,
                                               ptrNum, intersectionPointInfo)
        sampleData += result
        picNum += sampleNumDelta
        ptrNum += 1
    print ""
    return sampleData
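# getSurroundingStreetViewLinks / downloadSurroundingStreetView are not shown here.
# Conceptually, each sampled point is queried once per configured heading via the
# Street View Static API. A hypothetical sketch of building those image links
# (the query parameter names follow Google's public API; the function name, size
# and key are assumptions):
def buildStreetViewLinks(lat, lng, headings, size="640x640", key="YOUR_API_KEY"):
    base = "https://maps.googleapis.com/maps/api/streetview"
    return [
        "%s?size=%s&location=%f,%f&heading=%d&key=%s"
        % (base, size, lat, lng, heading, key)
        for heading in headings
    ]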
def __reset(self):
    """Reset all tasks in this manager in their stop order.

    Tasks are reset regardless of their current state and are always put in
    the STOPPED state. Exceptions will be printed and ignored."""
    for obj in reversed(self.__tasks):
        assert obj._manager is None or obj._manager == self
        if hasattr(obj, "reset") and obj.reset:
            with Progress("Resetting %s" % obj.name):
                try:
                    self.__callMeMethod(obj.reset)
                except:
                    sys.excepthook(*sys.exc_info())
        obj._state = STOPPED
        obj._manager = None
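# The __start/__stop/__wait/__reset methods above assume surrounding scaffolding
# that is not shown: RUNNING/STOPPED state constants, a private __tasks list kept
# in start order, and a __callMeMethod helper. The following is only a
# hypothetical sketch of the minimum those methods rely on.
RUNNING = "RUNNING"
STOPPED = "STOPPED"


class TaskManagerSketch(object):
    def __init__(self):
        self.__tasks = []  # tasks in start order; stop/reset walk it in reverse

    def add(self, task):
        # Tasks start out stopped and unowned.
        task._state = STOPPED
        task._manager = None
        self.__tasks.append(task)

    def __callMeMethod(self, method):
        # The original helper's behaviour is unknown; the simplest reading is
        # "invoke the task's bound method with no arguments".
        return method()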
def wait_error():
    progress = Progress(20)
    for i in range(21):
        if i:
            progress.increment()
        time.sleep(ERROR_TIME / 21.0)
    progress.done()
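# wait_error() drives yet another Progress flavour: a fixed total passed to the
# constructor, increment() once per completed step, done() to finish the line.
# A hypothetical console sketch of that interface:
import sys


class CountingProgressSketch(object):
    def __init__(self, total):
        self.total = total
        self.count = 0

    def increment(self):
        self.count += 1
        sys.stdout.write('\r%d/%d' % (self.count, self.total))
        sys.stdout.flush()

    def done(self):
        sys.stdout.write('\rdone (%d/%d)\n' % (self.count, self.total))
        sys.stdout.flush()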
def train(train_story, train_questions, train_qstory, memory, model, loss,
          general_config):
    train_config = general_config.train_config
    dictionary = general_config.dictionary
    nepochs = general_config.nepochs
    nhops = general_config.nhops
    batch_size = general_config.batch_size
    enable_time = general_config.enable_time
    randomize_time = general_config.randomize_time
    lrate_decay_step = general_config.lrate_decay_step

    train_range = general_config.train_range  # indices of training questions
    val_range = general_config.val_range      # indices of validation questions
    train_len = len(train_range)
    val_len = len(val_range)

    params = {
        "lrate": train_config["init_lrate"],
        "max_grad_norm": train_config["max_grad_norm"]
    }

    for ep in range(nepochs):
        # Decrease learning rate after every decay step
        if (ep + 1) % lrate_decay_step == 0:
            params["lrate"] *= 0.5

        total_err = 0.
        total_cost = 0.
        total_num = 0
        # print train_len
        # print(train_len, batch_size, int(math.floor(train_len / batch_size)))
        for _ in Progress(range(int(math.floor(train_len / batch_size)))):
            # Question batch
            batch = train_range[np.random.randint(train_len, size=batch_size)]

            input_data = np.zeros((train_story.shape[0], batch_size),
                                  np.float32)        # words of training questions
            target_data = train_questions[2, batch]  # indices of training answers

            memory[0].data[:] = dictionary["nil"]

            # Compose batch of training data
            for b in range(batch_size):
                # NOTE: +1 since train_questions[1, :] is the index of the sentence
                # right before the training question.
                # d is a batch of [word indices in sentence, sentence indices from
                # batch] for this story
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]

                # Pick a fixed number of latest sentences (before the question)
                # from the story
                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Training data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    # Inject noise into time index (i.e. word index)
                    if randomize_time > 0:
                        # Random number of blank (must be < total sentences until the training question?)
                        nblank = np.random.randint(
                            int(math.ceil(d.shape[1] * randomize_time)))
                        rt = np.random.permutation(d.shape[1] + nblank)
                        rt[rt >= train_config["sz"]] = train_config["sz"] - 1  # put the cap

                        # Add random time (must be > dictionary's length) into the
                        # time word (decreasing order)
                        memory[0].data[-1, :d.shape[1], b] = \
                            np.sort(rt[:d.shape[1]])[::-1] + len(dictionary)
                    else:
                        memory[0].data[-1, :d.shape[1], b] = \
                            np.arange(d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model.fprop(input_data)
            total_cost += loss.fprop(out, target_data)
            total_err += loss.get_error(out, target_data)
            total_num += batch_size

            grad = loss.bprop(out, target_data)
            model.bprop(input_data, grad)
            model.update(params)

            for i in range(nhops):
                memory[i].emb_query.weight.D[:, 0] = 0

        # Validation
        total_val_err = 0.
        total_val_cost = 0.
        total_val_num = 0
        for k in range(int(math.floor(val_len / batch_size))):
            batch = val_range[np.arange(k * batch_size, (k + 1) * batch_size)]

            input_data = np.zeros((train_story.shape[0], batch_size), np.float32)
            target_data = train_questions[2, batch]

            memory[0].data[:] = dictionary["nil"]

            for b in range(batch_size):
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]
                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    memory[0].data[-1, :d.shape[1], b] = \
                        np.arange(d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model.fprop(input_data)
            total_val_cost += loss.fprop(out, target_data)
            total_val_err += loss.get_error(out, target_data)
            total_val_num += batch_size

        train_error = total_err / total_num
        val_error = total_val_err / total_val_num

        print("%d | train error: %g | val error: %g" %
              (ep + 1, train_error, val_error))
def threshold_table(start, stop, reading_channels, channels, bands,
                    label='kmeans-labels', filename=DEFAULT_FILENAME, prefix='.'):
    """Make an HTML table of 'percent increase' from the largest cluster by band and channel."""
    data = TimeSeriesDict.read(filename, reading_channels + [label],
                               start=to_gps(start), end=to_gps(stop))
    labels = data[label]

    clusters = list(range(max(labels.value) + 1))
    cluster_counts = list(
        len(labels.value[labels.value == c]) for c in clusters)
    largest_cluster = cluster_counts.index(max(cluster_counts))
    clusters.remove(largest_cluster)

    logger.info(
        f'Largest cluster found to be Nº{largest_cluster} '
        f'({100 * max(cluster_counts) // len(labels.value)}%). Doing {clusters}.')
    cluster_counts.remove(max(cluster_counts))

    def amplitude(channel, cluster):
        """Return the median amplitude for channel in cluster."""
        try:
            chan = data[channel]
        except KeyError:
            return 0.0
        return median([chan.value[i] for i, c in enumerate(labels.value) if c == cluster])

    def threshold(cluster, channel, band) -> str:
        f_channel = f'{channel}_BLRMS_{band}.mean'
        base = amplitude(f_channel, largest_cluster)
        if base != 0.0:
            return str(int(100 * (amplitude(f_channel, cluster) - base) / base)) + '%'
        else:
            return str(amplitude(f_channel, cluster))

    range_chan = 'L1:DMT-SNSH_EFFECTIVE_RANGE_MPC.mean'
    if range_chan in reading_channels:
        base_range = amplitude(range_chan, largest_cluster)
        if base_range != 0.0:
            snsh = lambda c: 'SNSH: ' + str(
                int(100 * (amplitude(range_chan, c) - base_range) / base_range)) + '%'
        else:
            snsh = lambda c: 'SNSH: 0.0'
    else:
        snsh = lambda c: ''

    with Progress('taking thresholds', len(clusters)) as progress:
        for i, cluster in enumerate(clusters):
            buffer = [[''] + bands]
            for channel in channels:
                buffer.append([channel] + [
                    progress(threshold, i, cluster, channel, band)
                    for band in bands
                ])
            html_table(
                f'cluster {cluster} ({colors[cluster]}) {snsh(cluster)}',
                csv_writer(buffer, get_path(f'{cluster}', 'csv', prefix=prefix)),
                get_path(f'{cluster}', 'html', prefix=prefix))

    html_table(
        'Index',
        csv_writer(
            [['clusters:']] +
            [[f'<a href="{cluster}.html">Nº{cluster} ({colors[cluster]})</a>']
             for cluster in clusters],
            get_path('idx', 'csv', prefix=prefix)),
        get_path('index', 'html', prefix=prefix))
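# threshold_table() (and the clustering helpers below) use Progress as a context
# manager that is *called* with a function plus a position: progress(func, i, ...)
# runs func(...) while advancing a bar sized by the total given at construction.
# A hypothetical sketch of that wrapper, assuming a plain console bar:
import sys


class CallableProgressSketch(object):
    def __init__(self, label, total, quiet=False):
        self.label, self.total, self.quiet = label, total, quiet

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        if not self.quiet:
            sys.stdout.write('\n')
        return False  # never swallow exceptions

    def __call__(self, func, position, *args, **kwargs):
        if not self.quiet:
            sys.stdout.write('\r%s: %d/%d' % (self.label, position + 1, self.total))
            sys.stdout.flush()
        return func(*args, **kwargs)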
def compute_kmeans(channels, start, stop, history=timedelta(hours=2),
                   filename=DEFAULT_FILENAME, downloader=TimeSeriesDict.get,
                   **kwargs):
    """
    Computes k-means clusters and saves the data and labels to filename.
    **kwargs are forwarded to the KMeans constructor.

    >>> from gwpy.time import tconvert, from_gps
    >>> from datetime import timedelta
    >>> from cluster import compute_kmeans
    >>>
    >>> channels = ['L1:ISI-GND_STS_ETMX_Z_BLRMS_1_3.mean,m-trend', 'L1:ISI-GND_STS_ETMY_Z_BLRMS_1_3.mean,m-trend']
    >>>
    >>> stop = from_gps(60 * (int(tconvert('now')) // 60))  # gets nearest minute to now
    >>> start = stop - timedelta(days=1)  # cluster the past day
    >>> compute_kmeans(channels, start, stop, filename='my_kmeans.hdf5', n_clusters=5, random_state=0)
    """
    # set up duration (minute-trend data has dt=1min, so reject intervals not on the minute).
    duration = (stop - start).total_seconds() / 60
    assert (stop - start).total_seconds() / 60 == (stop - start).total_seconds() // 60
    duration = int(duration)
    logger.info(f'Clustering data from {start} to {stop} ({duration} minutes).')

    # download data using TimeSeries.get(), including history of point at t0.
    logger.debug(f'Initiating download from {start} to {stop} with history={history}...')
    dl = downloader(channels, start=to_gps(start - history), end=to_gps(stop))
    logger.info(f'Downloaded from {start} to {stop} with history={history}.')

    # generate input matrix of the form [sample1;...;sampleN] with
    # sampleK = [feature1,...,featureN] for sklearn.cluster algorithms.
    # This is the slow part of the function, so a progress bar is shown.
    logger.debug(f'Initiating input matrix generation...')
    with Progress('building input', (duration * 60)) as progress:
        input_data = stack([
            concatenate([
                progress(dl[channel].crop,
                         t,
                         start=to_gps(start + timedelta(seconds=t) - history),
                         end=to_gps(start + timedelta(seconds=t))).value
                for channel in channels
            ]) for t in range(0, int(duration * 60), 60)
        ])

    # verify input matrix dimensions.
    assert input_data.shape == (duration,
                                int(len(channels) * history.total_seconds() / 60))
    logger.info('Completed input matrix generation.')

    # actually do the fit.
    logger.debug(f'Initiating KMeans({kwargs}) fit...')
    kmeans = KMeans(**kwargs).fit(input_data)
    logger.info(f'Completed KMeans({kwargs}) fit.')

    # cast the output labels to a TimeSeries so that cropping is easy later on.
    labels = TimeSeries(kmeans.labels_,
                        times=dl[channels[0]].crop(start=to_gps(start),
                                                   end=to_gps(stop)).times,
                        name='kmeans-labels')

    # put labels in data download dictionary for easy saving.
    dl[labels.name] = labels

    # write data download and labels to specified filename.
    cache_file = abspath(filename)
    if exists(cache_file):
        remove(cache_file)
    dl.write(cache_file)
    logger.info(f'Wrote cache to {filename}')
def cluster_plotter(channels, start, stop, prefix='.', label='kmeans-labels',
                    groups=None, filename=DEFAULT_FILENAME,
                    dqflag='L1:DMT-ANALYSIS_READY:1', xscale=None, unit=None,
                    progressbar=True, **kwargs):
    """
    Plots data with clusters labeled by color in the working directory, or a
    relative path given by prefix. Requires a .hdf5 file produced with a
    clustering function defined in this module to be in the working directory.
    **kwargs are forwarded to TimeSeries.plot().

    :param prefix: relative path to output images.
    :param label: name attribute of labels TimeSeries saved in filename.
    :param groups: groups of channels to plot in the same figure. See the example.
    :param dqflag: data quality flag for segments bar.
    :param xscale: gps x-axis scale to use.
    :param unit: override y-axis unit.
    :param progressbar: show progress bar.

    >>> from gwpy.time import tconvert, from_gps
    >>> from datetime import timedelta
    >>> from cluster import cluster_plotter
    >>>
    >>> channels = ['L1:ISI-GND_STS_ETMX_Z_BLRMS_1_3.mean,m-trend', 'L1:ISI-GND_STS_ETMY_Z_BLRMS_1_3.mean,m-trend']
    >>> groups = [[channels, ('ETMX', 'ETMY'), 'L1:ISI-GND_STS_BLRMS_1_3 Z-axis']]  # plot on the same figure.
    >>>
    >>> stop = from_gps(60 * (int(tconvert('now')) // 60))  # gets nearest minute to now
    >>> start = stop - timedelta(days=1)  # cluster the past day
    >>> cluster_plotter(channels, start, stop, filename='my_kmeans.hdf5', groups=groups)
    """
    # some defaults.
    if not kwargs:
        kwargs['color'] = 'k'
        kwargs['alpha'] = 0.3
    if groups is None:
        groups = channels

    # read the data from the save file.
    data = TimeSeriesDict.read(filename, channels + [label],
                               start=to_gps(start), end=to_gps(stop))
    logger.info(f'Read {start} to {stop} from {filename}')

    # get segments for the duration specified. Note that this may require
    # doing `ligo-proxy-init -p`.
    logger.debug(f'Getting segments for {dqflag} from {start} to {stop}...')
    dq = DataQualityFlag.query(dqflag, to_gps(start), to_gps(stop))
    logger.info(f'Got segments for {dqflag} from {start} to {stop}.')

    # plotting is slow, so show a nice progress bar.
    logger.debug('Initiating plotting routine...')
    with Progress('plotting', len(channels), quiet=not progressbar) as progress:

        for p, (group, labels, title) in enumerate(groups):

            # plot the group in one figure.
            plt = Plot(*(data[channel] for channel in group),
                       separate=True, sharex=True, zorder=1, **kwargs)

            # modify the axes one by one.
            axes = plt.get_axes()
            for i, ax in enumerate(axes):

                # namely, add a colored overlay that indicates clustering labels.
                ax.scatter(data[group[i]].times, data[group[i]].value,
                           c=[colors[j] for j in data[label]],
                           edgecolor='', s=4, zorder=2)

                ax.set_ylabel(
                    f'{labels[i]} {data[group[i]].unit if unit is None else unit}')
                setp(ax.get_xticklabels(), visible=False)

            # modify the figure as a whole.
            plt.add_segments_bar(dq, label='')
            if xscale is not None:
                plt.gca().set_xscale(xscale)
            plt.suptitle(title)

            # save to png.
            progress(plt.save, p, get_path(title, 'png', prefix=prefix))

    logger.info(f'Completed plotting for {start} to {stop} from {filename}')
def train(train_story, train_questions, train_qstory, memory, model,
          loss_function, general_config, USE_CUDA=False):
    FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

    train_config = general_config.train_config
    dictionary = general_config.dictionary
    nepochs = general_config.nepochs
    nhops = general_config.nhops
    batch_size = general_config.batch_size
    enable_time = general_config.enable_time
    randomize_time = general_config.randomize_time
    lrate_decay_step = general_config.lrate_decay_step

    train_range = general_config.train_range  # indices of training questions
    val_range = general_config.val_range      # indices of validation questions
    train_len = len(train_range)
    val_len = len(val_range)

    params = {
        "lrate": train_config["init_lrate"],
        "max_grad_norm": train_config["max_grad_norm"]
    }
    optimizer = optim.SGD(model.parameters(), lr=params["lrate"])

    for ep in range(nepochs):
        # Decrease learning rate after every decay step
        if (ep + 1) % lrate_decay_step == 0:
            params["lrate"] *= 0.5
            for param_group in optimizer.param_groups:
                param_group['lr'] = params["lrate"]

        total_err = 0.
        total_cost = 0.
        total_num = 0
        for _ in Progress(range(int(math.floor(train_len / batch_size)))):
            # Question batch
            # batch = train_range[np.random.randint(train_len, size=batch_size)]
            batch = train_range[torch.randint(train_len, size=(batch_size,))]

            input_data = Variable(
                torch.zeros((train_story.shape[0], batch_size),
                            dtype=torch.float32))
            input_data.requires_grad = False
            target_data = Variable(train_questions[2, batch])

            with torch.no_grad():
                memory[0].data[:] = dictionary["nil"]

            # Compose batch of training data
            with torch.no_grad():
                for b in range(batch_size):
                    # NOTE: +1 since train_questions[1, :] is the index of the
                    # sentence right before the training question.
                    d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                    train_questions[0, batch[b]]].detach()

                    # Pick a fixed number of latest sentences (before the question)
                    # from the story
                    offset = max(0, d.shape[1] - train_config["sz"])
                    d = d[:, offset:].detach()

                    # Training data for the 1st memory cell
                    memory[0].data[:d.shape[0], :d.shape[1], b] = d.detach()

                    if enable_time:
                        # Inject noise into time index (i.e. word index)
                        if randomize_time > 0:
                            # Random number of blank (must be < total sentences until the training question?)
                            nblank = np.random.randint(
                                int(math.ceil(d.shape[1] * randomize_time)))
                            rt = np.random.permutation(d.shape[1] + nblank)
                            rt[rt >= train_config["sz"]] = train_config["sz"] - 1  # put the cap

                            # Add random time (must be > dictionary's length) into
                            # the time word (decreasing order)
                            nparray = np.sort(rt[:d.shape[1]])[::-1] + len(dictionary)
                            memory[0].data[-1, :d.shape[1], b] = \
                                torch.from_numpy(nparray).detach()
                        else:
                            memory[0].data[-1, :d.shape[1], b] = torch.from_numpy(
                                np.arange(d.data.numpy().shape[1])[::-1] + len(dictionary))

                    input_data[:, b] = train_qstory[:, batch[b]].detach()

            for i in range(1, nhops):
                with torch.no_grad():
                    memory[i].data = memory[0].data

            # input_data.requires_grad_()
            model.zero_grad()
            for i in range(nhops):
                memory[i].zero_grad()
                memory[i].mod_out.zero_grad()
                memory[i].mod_query.zero_grad()
            optimizer.zero_grad()

            out = model(input_data)
            loss = loss_function(out.view(out.shape[1], -1), target_data)
            total_cost += loss.item()

            y = out.max(0)[1]  # y = out.argmax(axis=0)
            total_err += torch.sum(y != target_data)
            total_num += batch_size

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           params["max_grad_norm"], norm_type=2)
            optimizer.step()

            with torch.no_grad():
                for i in range(nhops):
                    memory[i].emb_query.weight[:, 0] = 0

        # Validation
        total_val_err = 0.
        total_val_cost = 0.
        total_val_num = 0
        # input_data.requires_grad_()
        for k in range(int(math.floor(val_len / batch_size))):
            batch = val_range[torch.arange(k * batch_size, (k + 1) * batch_size)]

            input_data = torch.zeros((train_story.shape[0], batch_size),
                                     dtype=torch.float32)
            target_data = train_questions[2, batch]

            memory[0].data[:] = dictionary["nil"]

            for b in range(batch_size):
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]
                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    # equivalent of np.arange(d.shape[1])[::-1] + len(dictionary)
                    tensor = torch.arange(d.shape[1])
                    idx = torch.LongTensor(
                        [i for i in range(tensor.size(0) - 1, -1, -1)])
                    inverted_tensor = tensor.index_select(0, idx) + len(dictionary)
                    memory[0].data[-1, :d.shape[1], b] = inverted_tensor

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model(input_data)
            loss = loss_function(out.view(out.shape[1], -1), target_data)
            total_val_cost += loss.item()

            y = out.max(0)[1]  # y = out.argmax(axis=0)
            total_val_err += torch.sum(y != target_data)
            total_val_num += batch_size

        train_error = total_err.float() / total_num
        val_error = total_val_err.float() / total_val_num

        print("%d | train error: %g | val error: %g" %
              (ep + 1, train_error, val_error))
def representative_spectra(channels, start, stop, rate, label='kmeans-labels',
                           filename=DEFAULT_FILENAME, prefix='.',
                           downloader=TimeSeriesDict.get, cluster_numbers=None,
                           groups=None, **kwargs):
    """
    Make representative spectra for each cluster based on the median psd for
    minutes in that cluster. Downloads only the raw minutes in the cluster to save.
    """
    if groups is None:
        groups = channels

    # read the labels from the save file.
    labels = TimeSeries.read(filename, label, start=to_gps(start), end=to_gps(stop))
    logger.info(f'Read labels {start} to {stop} from {filename}')

    if cluster_numbers is None:
        clusters = list(range(max(labels.value) + 1))
        cluster_counts = list(
            len(labels.value[labels.value == c]) for c in clusters)
        largest_cluster = cluster_counts.index(max(cluster_counts))
        clusters.remove(largest_cluster)

        logger.info(
            f'Largest cluster found to be Nº{largest_cluster} '
            f'({100 * max(cluster_counts) // len(labels.value)}%). Doing {clusters}.')
        cluster_counts.remove(max(cluster_counts))
    else:
        clusters = cluster_numbers
        cluster_counts = list(
            len(labels.value[labels.value == c]) for c in clusters)

    t, v, d = labels.times, labels.value, diff(labels.value)

    pairs = list(zip([t[0]] + list(t[:-1][d != 0]), list(t[1:][d != 0]) + [t[-1]]))
    values = list(v[:-1][d != 0]) + [v[-1]]
    assert len(pairs) == len(values)  # need to include start-| and |-end

    # l|r l|r l|r l|r
    # l,r l,r l,r l,r
    # l r,l r,l r,l r
    # zip(start + l[1:], r[:-1] + stop)
    print(pairs)
    for pair in pairs:
        print(int(pair[1].value) - int(pair[0].value))
    print(values)

    # use h5py to make a mutable object pointing to a file on disk.
    save_file, filename = path2h5file(
        get_path(f'spectra-cache {start}', 'hdf5', prefix=prefix))
    logger.debug(f'Initiated hdf5 stream to {filename}')

    logger.info(f'Patching {filename}...')
    for i, (dl_start, end) in enumerate(pairs):
        if values[i] in clusters:
            if not data_exists(channels, to_gps(end).seconds, save_file):
                logger.debug(f'Downloading Nº{values[i]} from {dl_start} to {end}...')
                try:
                    dl = downloader(channels,
                                    start=to_gps(dl_start) - LIGOTimeGPS(60),
                                    end=to_gps(end) + LIGOTimeGPS(seconds=1))
                    out = TimeSeriesDict()
                    for n in dl:
                        out[n] = dl[n].resample(**better_aa_opts(dl[n], rate))
                    write_to_disk(out, to_gps(dl_start).seconds, save_file)
                except RuntimeError:  # Cannot find all relevant data on any known server
                    logger.warning(f"SKIPPING Nº{values[i]} from {dl_start} to {end} !!")

    logger.info('Reading data...')
    data = TimeSeriesDict.read(save_file, channels)

    logger.info('Starting PSD generation...')
    f = data[channels[0]].crop(
        start=to_gps(data[channels[0]].times[-1]) - LIGOTimeGPS(60),
        end=to_gps(data[channels[0]].times[-1])).psd().frequencies

    d = (to_gps(labels.times[-1]).seconds - to_gps(labels.times[1]).seconds)
    for i, cluster in enumerate(clusters):
        try:
            psds = {
                channel: FrequencySeries.read(filename, f'{cluster}-{channel}')
                for channel in channels
            }
            logger.info(f'Loaded Nº{cluster}.')
        except KeyError:
            logger.info(
                f'Doing Nº{cluster} '
                f'({100 * cluster_counts[i] / len(labels.value):.2f}% of data)...')
            with Progress(f'psd Nº{cluster} ({i + 1}/{len(clusters)})',
                          len(channels) * d) as progress:
                psds = {
                    channel: FrequencySeries(
                        median(stack([
                            progress(data[channel].crop,
                                     pc * d + (to_gps(time).seconds -
                                               to_gps(labels.times[1]).seconds),
                                     start=to_gps(time) - LIGOTimeGPS(60),
                                     end=to_gps(time)).psd().value
                            for c, time in zip(labels.value, labels.times)
                            if c == cluster
                        ]), axis=0),
                        frequencies=f,
                        name=f'{cluster}-{channel}')
                    for pc, channel in enumerate(channels)
                }
            for name in psds.keys():
                psds[name].write(filename, **writing_opts)

        # plotting is slow, so show a nice progress bar.
        logger.debug('Initiating plotting routine...')
        with Progress('plotting', len(groups)) as progress:

            for p, (group, lbls, title) in enumerate(groups):

                # plot the group in one figure.
                plt = Plot(*(psds[channel] for channel in group),
                           separate=False, sharex=True, zorder=1, **kwargs)
                # plt.gca().set_xlim((30,60))

                # modify the figure as a whole.
                # plt.add_segments_bar(dq, label='')
                plt.gca().set_xscale('log')
                plt.gca().set_yscale('log')
                plt.suptitle(title)
                plt.legend(lbls)

                # save to png.
                progress(plt.save, p,
                         get_path(f'{cluster}-{title}', 'png',
                                  prefix=f'{prefix}/{cluster}'))
def train(train_story, train_questions, train_qstory, memory, model, loss,
          general_config, train_logger, val_logger, global_batch_iter=0,
          best_val_cost=1000000., best_val_err=1000000.):
    train_config = general_config.train_config
    dictionary = general_config.dictionary
    nepochs = general_config.nepochs
    nhops = general_config.nhops
    batch_size = general_config.batch_size
    enable_time = general_config.enable_time
    randomize_time = general_config.randomize_time
    lrate_decay_step = general_config.lrate_decay_step

    train_range = general_config.train_range  # indices of training questions
    val_range = general_config.val_range      # indices of validation questions
    train_len = len(train_range)
    val_len = len(val_range)
    display_inteval = general_config.display_inteval

    params = {
        "lrate": train_config["init_lrate"],
        "max_grad_norm": train_config["max_grad_norm"]
    }

    if randomize_time > 0:
        print('We use Random Noise (RN) ratio of %.1f' % randomize_time)

    # train/val start
    for ep in range(nepochs):
        # Decrease learning rate after every decay step
        if (ep + 1) % lrate_decay_step == 0:
            params["lrate"] *= 0.5

        total_err = 0.
        total_cost = 0.
        total_num = 0
        batch_iter = 0
        for _ in Progress(range(int(math.floor(train_len / batch_size)))):
            # Question batch
            batch = train_range[np.random.randint(train_len, size=batch_size)]

            input_data = np.zeros((train_story.shape[0], batch_size),
                                  np.float32)        # words of training questions
            target_data = train_questions[2, batch]  # indices of training answers

            memory[0].data[:] = dictionary["nil"]

            # Compose batch of training data
            for b in range(batch_size):
                # NOTE: +1 since train_questions[1, :] is the index of the sentence
                # right before the training question.
                # d is a batch of [word indices in sentence, sentence indices from
                # batch] for this story
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]

                # Pick a fixed number of latest sentences (before the question)
                # from the story
                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Training data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    # Inject noise into time index (i.e. word index)
                    if randomize_time > 0:
                        # Random number of blank (must be < total sentences until the training question?)
                        nblank = np.random.randint(
                            int(math.ceil(d.shape[1] * randomize_time)))
                        rt = np.random.permutation(d.shape[1] + nblank)
                        rt[rt >= train_config["sz"]] = train_config["sz"] - 1  # put the cap

                        # Add random time (must be > dictionary's length) into the
                        # time word (decreasing order)
                        memory[0].data[-1, :d.shape[1], b] = \
                            np.sort(rt[:d.shape[1]])[::-1] + len(dictionary)
                    else:
                        memory[0].data[-1, :d.shape[1], b] = \
                            np.arange(d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model.fprop(input_data)
            cost = loss.fprop(out, target_data)
            err = loss.get_error(out, target_data)

            total_cost += cost
            total_err += err
            total_num += batch_size

            grad = loss.bprop(out, target_data)
            model.bprop(input_data, grad)
            model.update(params)

            batch_iter += 1
            global_batch_iter += 1

            if batch_iter % display_inteval == 0:
                print("%d | %d | %g | loss: %g | err: %g" %
                      (ep, global_batch_iter, params['lrate'],
                       cost / batch_size, err / batch_size))
                sys.stdout.flush()
                train_logger.write('%d %d %f %f %f\n' %
                                   (ep, global_batch_iter, params['lrate'],
                                    cost / batch_size, err / batch_size))
                train_logger.flush()

            for i in range(nhops):
                memory[i].emb_query.weight.D[:, 0] = 0

        # Validation
        total_val_err = 0.
        total_val_cost = 0.
        total_val_num = 0
        for k in range(int(math.floor(val_len / batch_size))):
            batch = val_range[np.arange(k * batch_size, (k + 1) * batch_size)]

            input_data = np.zeros((train_story.shape[0], batch_size), np.float32)
            target_data = train_questions[2, batch]

            memory[0].data[:] = dictionary["nil"]

            for b in range(batch_size):
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]
                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    memory[0].data[-1, :d.shape[1], b] = \
                        np.arange(d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model.fprop(input_data)
            val_cost = loss.fprop(out, target_data)
            val_err = loss.get_error(out, target_data)

            total_val_cost += val_cost
            total_val_err += val_err
            total_val_num += batch_size

        current_val_cost = total_val_cost / total_val_num
        current_val_err = total_val_err / total_val_num
        print("%d | %d | val loss: %g | val err: %g" %
              (ep, global_batch_iter, current_val_cost, current_val_err))
        sys.stdout.flush()

        if best_val_cost > current_val_cost:
            print('current: %f, best: %f' % (current_val_cost, best_val_cost))
            best_model = model
            best_memory = memory
            best_val_cost = current_val_cost
            best_val_err = current_val_err
            print('Best val loss: %f, val err: %f' % (best_val_cost, best_val_err))
            sys.stdout.flush()

        train_error = total_err / total_num
        val_error = total_val_err / total_val_num

        val_logger.write('%d %d %f %f %f\n' %
                         (ep, global_batch_iter, params['lrate'],
                          current_val_cost, current_val_err))
        val_logger.flush()

    return (train_logger, val_logger, best_model, best_memory,
            global_batch_iter, best_val_cost, best_val_err)
def compute_all(channels, start, stop, history=timedelta(hours=2),
                filename=DEFAULT_FILENAME, **kwargs):
    # set up duration (minute-trend data has dt=1min, so reject intervals not on the minute).
    duration = (stop - start).total_seconds() / 60
    assert (stop - start).total_seconds() / 60 == (stop - start).total_seconds() // 60
    duration = int(duration)
    logger.info(f'Clustering data from {start} to {stop} ({duration} minutes).')

    # download data using TimeSeries.get(), including history of point at t0.
    logger.debug(f'Initiating download from {start} to {stop} with history={history}...')
    dl = TimeSeriesDict.get(channels, start=to_gps(start - history), end=to_gps(stop))
    logger.info(f'Downloaded from {start} to {stop} with history={history}.')

    if exists('input.npy'):
        input_data = np.load('input.npy')
        logger.info('Loaded input matrix.')
    else:
        # generate input matrix of the form [sample1;...;sampleN] with
        # sampleK = [feature1,...,featureN] for sklearn.cluster algorithms.
        # This is the slow part of the function, so a progress bar is shown.
        logger.debug(f'Initiating input matrix generation...')
        with Progress('building input', (duration * 60)) as progress:
            input_data = stack([
                concatenate([
                    progress(dl[channel].crop,
                             t,
                             start=to_gps(start + timedelta(seconds=t) - history),
                             end=to_gps(start + timedelta(seconds=t))).value
                    for channel in channels
                ]) for t in range(0, int(duration * 60), 60)
            ])

        # verify input matrix dimensions.
        assert input_data.shape == (duration,
                                    int(len(channels) * history.total_seconds() / 60))
        np.save('input.npy', input_data)
        logger.info('Completed input matrix generation.')

    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 15,
        'min_samples': 20,
        'xi': 0.05,
        'min_cluster_size': 0.1
    }

    if exists('X.npy'):
        X = np.load('X.npy')
        logger.info('Loaded X')
    else:
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(input_data)
        np.save('X.npy', X)
        logger.info('Generated X')

    if exists('bandwidth.npy'):
        bandwidth = np.load('bandwidth.npy')
        logger.info('Loaded bandwidth')
    else:
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
        np.save('bandwidth.npy', bandwidth)
        logger.info('Generated bandwidth')

    if exists('connectivity.npy'):
        connectivity = np.load('connectivity.npy', allow_pickle=True)
        logger.info('Loaded connectivity')
    else:
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'],
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        np.save('connectivity.npy', connectivity)
        logger.info('Generated connectivity')

    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    optics = cluster.OPTICS(min_samples=params['min_samples'],
                            xi=params['xi'],
                            min_cluster_size=params['min_cluster_size'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock",
        n_clusters=params['n_clusters'], connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')

    clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('AffinityPropagation', affinity_propagation),
        ('MeanShift', ms),
        ('SpectralClustering', spectral),
        ('DBSCAN', dbscan),
        ('OPTICS', optics),
        ('Birch', birch),
        ('GaussianMixture', gmm)
        # ('Ward', ward),
        # ('AgglomerativeClustering', average_linkage),
    )

    for name, algorithm in clustering_algorithms:
        if exists(f'part-{name}-{filename}'):
            labels = TimeSeries.read(f'part-{name}-{filename}', f'{name}-labels')
            logger.debug(f'LOADED {name}.')
        else:
            logger.debug(f'doing {name}...')

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                            "connectivity matrix is [0-9]{1,2}" +
                            " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding" +
                            " may not work as expected.",
                    category=UserWarning)
                algorithm.fit(X)

            if hasattr(algorithm, 'labels_'):
                # plain int: np.int is a deprecated alias for the builtin
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            # cast the output labels to a TimeSeries so that cropping is easy later on.
            labels = TimeSeries(y_pred,
                                times=dl[channels[0]].crop(start=to_gps(start),
                                                           end=to_gps(stop)).times,
                                name=f'{name}-labels')
            labels.write(f'part-{name}-{filename}')

        # put labels in data download dictionary for easy saving.
        dl[labels.name] = labels

    # write data download and labels to specified filename.
    cache_file = abspath(filename)
    if exists(cache_file):
        remove(cache_file)
    dl.write(cache_file)
    logger.info(f'Wrote cache to {filename}')