def __wait(self):
        """Wait on waitable started tasks in this manager."""

        for obj in self.__tasks:
            if obj._state == RUNNING and hasattr(obj, "wait") and obj.wait:
                assert obj._manager == self
                with Progress("Waiting on %s" % obj.name):
                    self.__callMeMethod(obj.wait)
class SearchTerm:
	def __init__(self, filename):
		# Read the word list, dropping empty lines; a real list (not a lazy
		# filter object) is needed so it can be sliced below.
		with open(filename, 'r') as f:
			words = [w for w in f.read().split('\n') if w]
		# Record and store progress
		progress_filename = filename + '.progress'
		self.progress = Progress(current_file = progress_filename)
		# Load progress
		if self.progress.current:
			self.word_list = words[self.progress.current[0]:]
		else:
			self.word_list = words
		self.counter = 0
	
	def current(self):
		"""
		Return the current index into the word list, as recorded by the
		Progress tracker.
		"""
		return self.progress.current

	def get_counter(self):
		"""
		Return the number of times next() has been called, i.e. the
		progress of the current session.
		"""
		return self.counter

	def next(self):
		self.progress.next([0], [len(self.word_list)])
		self.progress.save()
		self.counter += 1
	
	def get_word_list(self):
		return self.word_list
	
	def expand_word_list(self, expand_type):
		if expand_type == "suggest":
			self.google_suggest()
		elif expand_type == "search":
			pass
		else:
			pass

	def collect(self):
		pass

	def google_suggest(self):
		pass
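
# A minimal usage sketch for SearchTerm (hypothetical: the 'keywords.txt'
# filename and the process callback are illustrative assumptions, not part
# of the original code).
def _example_search_term_usage(process):
	terms = SearchTerm('keywords.txt')
	for word in terms.get_word_list():
		process(word)  # handle the word (e.g. run a query for it)
		terms.next()   # advance and persist progress to keywords.txt.progress
	return terms.get_counter()  # number of words handled this session
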
def upload(path):
    file_size = os.stat(path).st_size
    if file_size <= 5 * 1024 * 1024:
        url = get_signed_url(path, "PUT")
        progress = Progress()
        stream = FileWithCallback(path, 'rb', progress.update, path)
        response = requestor.session.put(url, data=stream)
        return response
    else:
        return multipart_upload(path)
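
# FileWithCallback is referenced above but not defined in this listing.  A
# minimal sketch of such a wrapper, assuming the callback is invoked with the
# number of bytes read plus the extra positional arguments (here the path)
# each time the HTTP client pulls a chunk from the stream:
import os


class FileWithCallback(object):
    """File-like object that reports read progress to a callback (sketch)."""

    def __init__(self, path, mode, callback, *args):
        self._file = open(path, mode)
        self._callback = callback
        self._args = args
        self.len = os.stat(path).st_size  # lets the HTTP client set Content-Length

    def __len__(self):
        return self.len

    def read(self, size=-1):
        data = self._file.read(size)
        if data:
            self._callback(len(data), *self._args)
        return data

    def close(self):
        self._file.close()
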
def test_Progress():
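	"""
	Exercise Progress.next(start, limits).  Judging from the assertions below,
	it behaves like a multi-digit counter: the first digit advances on each
	call, a digit that reaches its limit resets to its start value and carries
	into the next digit, and None is returned once every digit is exhausted.
	current_file points at a non-existent file, so no saved state is loaded.
	"""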
	# multiple values
	progress = Progress(current_file='../../data/trend/no_file')
	assert_equal(progress.next([8,0,0,0,0], [10,9,9,9,9]), [9,0,0,0,0])
	assert_equal(progress.next([8,0,0,0,0], [10,9,9,9,9]), [8,1,0,0,0])
	assert_equal(progress.next([8,0,0,0,0], [10,9,9,9,9]), [9,1,0,0,0])

	# single value
	progress = Progress(current_file='../../data/trend/no_file', start=[0])
	assert_equal(progress.next([0], [3]), [1])
	assert_equal(progress.next([0], [3]), [2])
	assert_equal(progress.next([0], [3]), None)

	# start from middle
	progress = Progress(current_file='../../data/trend/no_file', start=[3])
	assert_equal(progress.next([0], [6]), [4])
	assert_equal(progress.next([0], [6]), [5])
	assert_equal(progress.next([0], [6]), None)
    def __stop(self):
        """Stop all running tasks in this manager in reverse of the
        order they were started in."""

        for obj in reversed(self.__tasks):
            if obj._state == STOPPED:
                continue
            assert obj._manager == self
            if hasattr(obj, "stop") and obj.stop:
                with Progress("Stopping %s" % obj.name):
                    self.__callMeMethod(obj.stop)
            obj._state = STOPPED
            obj._manager = None
    def __start(self):
        """Start all tasks in this manager."""

        for obj in self.__tasks:
            if obj._state == RUNNING:
                if obj._manager != self:
                    raise ValueError("Task %s is running in another manager" %
                                     obj.name)
                continue
            if hasattr(obj, "start") and obj.start:
                with Progress("Starting %s" % obj.name):
                    self.__callMeMethod(obj.start)
            obj._state = RUNNING
            obj._manager = self
def sampleAndGetStreetImageLinks(endPoints, sampleNum, picNum, ptrNum, intersectionPointInfo):
    """
    Randomly select end points from the endPoint collection.
    For each selected end point, call Google map street view image api
    to get the street view images.
    :return:
    """
    print "sampling street images..."

    # get 2x sampled points for skipping some images that are missing its date
    sampledPoints = random.sample(endPoints, sampleNum) if sampleNum < len(endPoints) * 2 else endPoints
    sampleData = []  # store (picture number, file name, lat and lng, link to image)
    progress = Progress(10)
    headings = CONFIG["gmap"]["headings"]
    sampleNumDelta = len(headings)
    for point in sampledPoints:
        progress.printProgress()
        result = getSurroundingStreetViewLinks(point, picNum, ptrNum, intersectionPointInfo)
        sampleData += result
        picNum += sampleNumDelta
        ptrNum += 1
    print ""
    return sampleData
def sampleAndDownloadStreetImage(endPoints, sampleNum, picNum, ptrNum,
                                 targetDirectory, intersectionPointInfo):
    """
    Randomly select end points from the endPoint collection.
    For each selected end point, call Google map street view image api
    to get the street view images.
    :return:
    """
    print "downloading street images..."
    sampledPoints = random.sample(
        endPoints, sampleNum) if sampleNum < len(endPoints) else endPoints
    sampleData = []  # store (picture number, file name, lat and lng)
    progress = Progress(10)
    headings = CONFIG["gmap"]["headings"]
    sampleNumDelta = len(headings)
    for point in sampledPoints:
        progress.printProgress()
        result = downloadSurroundingStreetView(point, targetDirectory, picNum,
                                               ptrNum, intersectionPointInfo)
        sampleData += result
        picNum += sampleNumDelta
        ptrNum += 1
    print ""
    return sampleData
    def __reset(self):
        """Reset all tasks in this manager in their stop order.  Tasks
        are reset regardless of their current state and are always put
        in the STOPPED state.  Exceptions will be printed and
        ignored."""

        for obj in reversed(self.__tasks):
            assert obj._manager is None or obj._manager == self
            if hasattr(obj, "reset") and obj.reset:
                with Progress("Resetting %s" % obj.name):
                    try:
                        self.__callMeMethod(obj.reset)
                    except:
                        sys.excepthook(*sys.exc_info())
            obj._state = STOPPED
            obj._manager = None
def wait_error():
    progress = Progress(20)
    for i in range(21):
        if i: progress.increment()
        time.sleep(ERROR_TIME / 21.0)
    progress.done()
def train(train_story, train_questions, train_qstory, memory, model, loss,
          general_config):

    train_config = general_config.train_config
    dictionary = general_config.dictionary
    nepochs = general_config.nepochs
    nhops = general_config.nhops
    batch_size = general_config.batch_size
    enable_time = general_config.enable_time
    randomize_time = general_config.randomize_time
    lrate_decay_step = general_config.lrate_decay_step

    train_range = general_config.train_range  # indices of training questions
    val_range = general_config.val_range  # indices of validation questions
    train_len = len(train_range)
    val_len = len(val_range)

    params = {
        "lrate": train_config["init_lrate"],
        "max_grad_norm": train_config["max_grad_norm"]
    }

    for ep in range(nepochs):
        # Decrease learning rate after every decay step
        if (ep + 1) % lrate_decay_step == 0:
            params["lrate"] *= 0.5

        total_err = 0.
        total_cost = 0.
        total_num = 0
        # print train_len
        # print(train_len, batch_size, int(math.floor(train_len / batch_size)))
        for _ in Progress(range(int(math.floor(train_len / batch_size)))):
            # Question batch
            batch = train_range[np.random.randint(train_len, size=batch_size)]

            input_data = np.zeros((train_story.shape[0], batch_size),
                                  np.float32)  # words of training questions
            target_data = train_questions[2,
                                          batch]  # indices of training answers

            memory[0].data[:] = dictionary["nil"]

            # Compose batch of training data
            for b in range(batch_size):
                # NOTE: +1 since train_questions[1, :] is the index of the sentence right before the training question.
                # d is a batch of [word indices in sentence, sentence indices from batch] for this story
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]

                # Pick a fixed number of latest sentences (before the question) from the story
                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Training data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    # Inject noise into time index (i.e. word index)
                    if randomize_time > 0:
                        # Random number of blanks (must be < total sentences before the training question?)
                        nblank = np.random.randint(
                            int(math.ceil(d.shape[1] * randomize_time)))
                        rt = np.random.permutation(d.shape[1] + nblank)

                        rt[rt >= train_config["sz"]] = train_config[
                            "sz"] - 1  # put the cap

                        # Add random time (must be > dictionary's length) into the time word (decreasing order)
                        memory[0].data[-1, :d.shape[1], b] = np.sort(
                            rt[:d.shape[1]])[::-1] + len(dictionary)

                    else:
                        memory[0].data[-1, :d.shape[1], b] = \
                            np.arange(d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model.fprop(input_data)
            total_cost += loss.fprop(out, target_data)
            total_err += loss.get_error(out, target_data)
            total_num += batch_size

            grad = loss.bprop(out, target_data)
            model.bprop(input_data, grad)
            model.update(params)

            for i in range(nhops):
                memory[i].emb_query.weight.D[:, 0] = 0

        # Validation
        total_val_err = 0.
        total_val_cost = 0.
        total_val_num = 0

        for k in range(int(math.floor(val_len / batch_size))):
            batch = val_range[np.arange(k * batch_size, (k + 1) * batch_size)]
            input_data = np.zeros((train_story.shape[0], batch_size),
                                  np.float32)
            target_data = train_questions[2, batch]

            memory[0].data[:] = dictionary["nil"]

            for b in range(batch_size):
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]

                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    memory[0].data[-1, :d.shape[1], b] = np.arange(
                        d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model.fprop(input_data)
            total_val_cost += loss.fprop(out, target_data)
            total_val_err += loss.get_error(out, target_data)
            total_val_num += batch_size

        train_error = total_err / total_num
        val_error = total_val_err / total_val_num

        print("%d | train error: %g | val error: %g" %
              (ep + 1, train_error, val_error))
def threshold_table(start,
                    stop,
                    reading_channels,
                    channels,
                    bands,
                    label='kmeans-labels',
                    filename=DEFAULT_FILENAME,
                    prefix='.'):
    """
    Makes a html table of 'percent increase' from the largest cluster by band and channel.
    """
    data = TimeSeriesDict.read(filename,
                               reading_channels + [label],
                               start=to_gps(start),
                               end=to_gps(stop))
    labels = data[label]

    clusters = list(range(max(labels.value) + 1))
    cluster_counts = list(
        len(labels.value[labels.value == c]) for c in clusters)
    largest_cluster = cluster_counts.index(max(cluster_counts))
    clusters.remove(largest_cluster)

    logger.info(
        f'Largest cluster found to be Nº{largest_cluster} ({100 * max(cluster_counts) // len(labels.value)}%). Doing {clusters}.'
    )
    cluster_counts.remove(max(cluster_counts))

    def amplitude(channel, cluster):
        """return median amplitude for channel in cluster."""
        try:
            chan = data[channel]
        except KeyError:
            return 0.0
        return median([
            chan.value[i] for i, c in enumerate(labels.value) if c == cluster
        ])

    def threshold(cluster, channel, band) -> str:
        f_channel = f'{channel}_BLRMS_{band}.mean'
        base = amplitude(f_channel, largest_cluster)
        if base != 0.0:
            return str(int(
                100 * (amplitude(f_channel, cluster) - base) / base)) + '%'
        else:
            return str(amplitude(f_channel, cluster))

    range_chan = 'L1:DMT-SNSH_EFFECTIVE_RANGE_MPC.mean'
    if range_chan in reading_channels:
        base_range = amplitude(range_chan, largest_cluster)
        if base_range != 0.0:
            snsh = lambda c: 'SNSH: ' + str(
                int(100 * (amplitude(range_chan, c) - base_range) / base_range)
            ) + '%'
        else:
            snsh = lambda c: 'SNSH: 0.0'
    else:
        snsh = lambda c: ''

    with Progress('taking thresholds', len(clusters)) as progress:
        for i, cluster in enumerate(clusters):
            buffer = [[''] + bands]
            for channel in channels:
                buffer.append([channel] + [
                    progress(threshold, i, cluster, channel, band)
                    for band in bands
                ])
            html_table(
                f'cluster {cluster} ({colors[cluster]}) {snsh(cluster)}',
                csv_writer(buffer, get_path(f'{cluster}', 'csv',
                                            prefix=prefix)),
                get_path(f'{cluster}', 'html', prefix=prefix))
    html_table(
        'Index',
        csv_writer(
            [['clusters:']] +
            [[f'<a href="{cluster}.html">Nº{cluster} ({colors[cluster]})</a>']
             for cluster in clusters], get_path('idx', 'csv', prefix=prefix)),
        get_path('index', 'html', prefix=prefix))
def compute_kmeans(channels,
                   start,
                   stop,
                   history=timedelta(hours=2),
                   filename=DEFAULT_FILENAME,
                   downloader=TimeSeriesDict.get,
                   **kwargs):
    """
    Computes k-means clusters and saves the data and labels to filename.
    **kwargs are forwarded to the KMeans constructor.

    >>> from gwpy.time import tconvert, from_gps
    >>> from datetime import timedelta
    >>> from cluster import compute_kmeans
    >>>
    >>> channels = ['L1:ISI-GND_STS_ETMX_Z_BLRMS_1_3.mean,m-trend', 'L1:ISI-GND_STS_ETMY_Z_BLRMS_1_3.mean,m-trend']
    >>>
    >>> stop = from_gps(60 * (int(tconvert('now')) // 60)) # gets nearest minute to now
    >>> start = stop - timedelta(days=1)  # cluster the past day
    >>> compute_kmeans(channels, start, stop, filename='my_kmeans.hdf5', n_clusters=5, random_state=0)
    """

    # set up duration (minute-trend data has dt=1min, so reject intervals not on the minute).
    duration = (stop - start).total_seconds() / 60
    assert (stop - start).total_seconds() / 60 == (stop -
                                                   start).total_seconds() // 60
    duration = int(duration)
    logger.info(
        f'Clustering data from {start} to {stop} ({duration} minutes).')

    # download data using TimeSeries.get(), including history of point at t0.
    logger.debug(
        f'Initiating download from {start} to {stop} with history={history}...'
    )
    dl = downloader(channels, start=to_gps(start - history), end=to_gps(stop))
    logger.info(f'Downloaded from {start} to {stop} with history={history}.')

    # generate input matrix of the form [sample1;...;sampleN] with sampleK = [feature1,...,featureN]
    # for sklearn.cluster algorithms. This is the slow part of the function, so a progress bar is shown.
    logger.debug(f'Initiating input matrix generation...')
    with Progress('building input', (duration * 60)) as progress:
        input_data = stack([
            concatenate([
                progress(dl[channel].crop,
                         t,
                         start=to_gps(start + timedelta(seconds=t) - history),
                         end=to_gps(start + timedelta(seconds=t))).value
                for channel in channels
            ]) for t in range(0, int(duration * 60), 60)
        ])

    # verify input matrix dimensions.
    assert input_data.shape == (duration,
                                int(
                                    len(channels) * history.total_seconds() /
                                    60))
    logger.info('Completed input matrix generation.')

    # actually do the fit.
    logger.debug(f'Initiating KMeans({kwargs}) fit...')
    kmeans = KMeans(**kwargs).fit(input_data)
    logger.info(f'Completed KMeans({kwargs}) fit.')

    # cast the output labels to a TimeSeries so that cropping is easy later on.
    labels = TimeSeries(kmeans.labels_,
                        times=dl[channels[0]].crop(start=to_gps(start),
                                                   end=to_gps(stop)).times,
                        name='kmeans-labels')

    # put labels in data download dictionary for easy saving.
    dl[labels.name] = labels

    # write data download and labels to specified filename.
    cache_file = abspath(filename)
    if exists(cache_file):
        remove(cache_file)
    dl.write(cache_file)
    logger.info(f'Wrote cache to {filename}')
def cluster_plotter(channels,
                    start,
                    stop,
                    prefix='.',
                    label='kmeans-labels',
                    groups=None,
                    filename=DEFAULT_FILENAME,
                    dqflag='L1:DMT-ANALYSIS_READY:1',
                    xscale=None,
                    unit=None,
                    progressbar=True,
                    **kwargs):
    """
    Plots data with clusters labeled by color in the working directory, or a relative path given by prefix.
    Requires a .hdf5 file produced with a clustering function defined in this module to be in the working directory.
    **kwargs are forwarded to TimeSeries.plot().

    :param prefix: relative path to output images.
    :param label: name attribute of labels TimeSeries saved in filename.
    :param groups: groups of channels to plot in the same figure. See the example.
    :param dqflag: data quality flag for segments bar.
    :param xscale: gps x-axis scale to use.
    :param unit: override y-axis unit.
    :param progressbar: show progress bar.

    >>> from gwpy.time import tconvert, from_gps
    >>> from datetime import timedelta
    >>> from cluster import cluster_plotter
    >>>
    >>> channels = ['L1:ISI-GND_STS_ETMX_Z_BLRMS_1_3.mean,m-trend', 'L1:ISI-GND_STS_ETMY_Z_BLRMS_1_3.mean,m-trend']
    >>> groups = [[channels, ('ETMX', 'ETMY'), 'L1:ISI-GND_STS_BLRMS_1_3 Z-axis']] # plot on the same figure.
    >>>
    >>> stop = from_gps(60 * (int(tconvert('now')) // 60)) # gets nearest minute to now
    >>> start = stop - timedelta(days=1)  # cluster the past day
    >>> cluster_plotter(channels, start, stop, filename='my_kmeans.hdf5', groups=groups)

    """

    # some defaults.
    if not kwargs:
        kwargs['color'] = 'k'
        kwargs['alpha'] = 0.3
    if groups is None:
        groups = channels

    # read the data from the save file.
    data = TimeSeriesDict.read(filename,
                               channels + [label],
                               start=to_gps(start),
                               end=to_gps(stop))
    logger.info(f'Read {start} to {stop} from {filename}')

    # get segments for the duration specified. Note that this may require doing `ligo-proxy-init -p`.
    logger.debug(f'Getting segments for {dqflag} from {start} to {stop}...')
    dq = DataQualityFlag.query(dqflag, to_gps(start), to_gps(stop))
    logger.info(f'Got segments for {dqflag} from {start} to {stop}.')

    # plotting is slow, so show a nice progress bar.
    logger.debug('Initiating plotting routine...')
    with Progress('plotting', len(channels),
                  quiet=not progressbar) as progress:

        for p, (group, labels, title) in enumerate(groups):

            # plot the group in one figure.
            plt = Plot(*(data[channel] for channel in group),
                       separate=True,
                       sharex=True,
                       zorder=1,
                       **kwargs)

            # modify the axes one by one.
            axes = plt.get_axes()
            for i, ax in enumerate(axes):

                # namely, add a colored overlay that indicates clustering labels.
                ax.scatter(data[group[i]].times,
                           data[group[i]].value,
                           c=[colors[j] for j in data[label]],
                           edgecolor='',
                           s=4,
                           zorder=2)

                ax.set_ylabel(
                    f'{labels[i]} {data[group[i]].unit if unit is None else unit}'
                )
                setp(ax.get_xticklabels(), visible=False)

            # modify the figure as a whole.
            plt.add_segments_bar(dq, label='')
            if xscale is not None:
                plt.gca().set_xscale(xscale)
            plt.suptitle(title)

            # save to png.
            progress(plt.save, p, get_path(title, 'png', prefix=prefix))

    logger.info(f'Completed plotting for {start} to {stop} from {filename}')
def train(train_story,
          train_questions,
          train_qstory,
          memory,
          model,
          loss_function,
          general_config,
          USE_CUDA=False):
    FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

    train_config = general_config.train_config
    dictionary = general_config.dictionary
    nepochs = general_config.nepochs
    nhops = general_config.nhops
    batch_size = general_config.batch_size
    enable_time = general_config.enable_time
    randomize_time = general_config.randomize_time
    lrate_decay_step = general_config.lrate_decay_step

    train_range = general_config.train_range  # indices of training questions
    val_range = general_config.val_range  # indices of validation questions
    train_len = len(train_range)
    val_len = len(val_range)

    params = {
        "lrate": train_config["init_lrate"],
        "max_grad_norm": train_config["max_grad_norm"]
    }

    optimizer = optim.SGD(model.parameters(), lr=params["lrate"])
    for ep in range(nepochs):
        # Decrease learning rate after every decay step

        if (ep + 1) % lrate_decay_step == 0:
            params["lrate"] *= 0.5
            for param_group in optimizer.param_groups:
                param_group['lr'] = params["lrate"]

        total_err = 0.
        total_cost = 0.
        total_num = 0
        for _ in Progress(range(int(math.floor(train_len / batch_size)))):
            # Question batch
            # batch = train_range[np.random.randint(train_len, size=batch_size)]
            batch = train_range[torch.randint(train_len, size=(batch_size, ))]
            #batch = train_range

            input_data = Variable(
                torch.zeros((train_story.shape[0], batch_size),
                            dtype=torch.float32))
            input_data.requires_grad = False

            target_data = Variable(train_questions[2, batch])

            with torch.no_grad():
                memory[0].data[:] = dictionary["nil"]

            # Compose batch of training data
            with torch.no_grad():
                for b in range(batch_size):
                    # NOTE: +1 since train_questions[1, :] is the index of the sentence right before the training question.
                    d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                    train_questions[0, batch[b]]].detach()

                    # Pick a fixed number of latest sentences (before the question) from the story
                    offset = max(0, d.shape[1] - train_config["sz"])
                    d = d[:, offset:].detach()

                    # Training data for the 1st memory cell

                    memory[0].data[:d.shape[0], :d.shape[1], b] = d.detach()

                    if enable_time:
                        # Inject noise into time index (i.e. word index)
                        if randomize_time > 0:
                            # Random number of blanks (must be < total sentences before the training question?)
                            nblank = np.random.randint(
                                int(math.ceil(d.shape[1] * randomize_time)))
                            rt = np.random.permutation(d.shape[1] + nblank)

                            rt[rt >= train_config["sz"]] = train_config[
                                "sz"] - 1  # put the cap

                            # Add random time (must be > dictionary's length) into the time word (decreasing order)
                            nparray = np.sort(rt[:d.shape[1]])[::-1] + len(dictionary)
                            with torch.no_grad():
                                memory[0].data[-1, :d.shape[1],
                                               b] = torch.from_numpy(
                                                   nparray).detach()

                        else:

                            memory[0].data[
                                -1, :d.shape[1], b] = torch.from_numpy(
                                    np.arange(d.data.numpy().shape[1])[::-1] +
                                    len(dictionary))

                    input_data[:, b] = train_qstory[:, batch[b]].detach()

            for i in range(1, nhops):
                with torch.no_grad():
                    memory[i].data = memory[0].data

            #input_data.requires_grad_()
            model.zero_grad()
            # memory is indexed 0..nhops-1 elsewhere, so iterate over indices
            for i in range(nhops):
                memory[i].zero_grad()
                memory[i].mod_out.zero_grad()
                memory[i].mod_query.zero_grad()
            optimizer.zero_grad()
            out = model(input_data)
            loss = loss_function(out.view(out.shape[1], -1), target_data)
            total_cost += loss.item()
            y = out.max(0)[1]  # y = out.argmax(axis=0)
            total_err += torch.sum(
                y != target_data)  # total_err += np.sum(y != target_data)
            total_num += batch_size

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           params["max_grad_norm"],
                                           norm_type=2)
            optimizer.step()

            with torch.no_grad():
                for i in range(nhops):
                    memory[i].emb_query.weight[:, 0] = 0

        # Validation
        total_val_err = 0.
        total_val_cost = 0.
        total_val_num = 0

        #input_data.requires_grad_()

        for k in range(int(math.floor(val_len / batch_size))):
            batch = val_range[torch.arange(
                k * batch_size, (k + 1) * batch_size
            )]  # val_range[np.arange(k * batch_size, (k + 1) * batch_size)]
            input_data = torch.zeros(
                (train_story.shape[0], batch_size), dtype=torch.float32
            )  # input_data  = np.zeros((train_story.shape[0], batch_size), np.float32)
            target_data = train_questions[2, batch]

            memory[0].data[:] = dictionary["nil"]

            for b in range(batch_size):
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]

                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    tensor = torch.arange(d.shape[1])
                    idx = [i for i in range(tensor.size(0) - 1, -1, -1)]
                    idx = torch.LongTensor(idx)
                    inverted_tensor = tensor.index_select(
                        0, idx) + len(dictionary)
                    memory[0].data[
                        -1, :d.shape[1],
                        b] = inverted_tensor  # np.arange(d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model(input_data)
            loss = loss_function(out.view(out.shape[1], -1), target_data)
            total_val_cost += loss.item()

            y = out.max(0)[1]  # y = out.argmax(axis=0)
            total_val_err += torch.sum(
                y != target_data)  # total_err += np.sum(y != target_data)

            total_val_num += batch_size

        train_error = total_err.float() / total_num
        val_error = total_val_err.float() / total_val_num

        print("%d | train error: %g | val error: %g" %
              (ep + 1, train_error, val_error))
def representative_spectra(channels,
                           start,
                           stop,
                           rate,
                           label='kmeans-labels',
                           filename=DEFAULT_FILENAME,
                           prefix='.',
                           downloader=TimeSeriesDict.get,
                           cluster_numbers=None,
                           groups=None,
                           **kwargs):
    """
    Make representative spectra for each cluster, based on the median PSD of the minutes in that cluster.
    Only the raw minutes belonging to those clusters are downloaded, to save on downloading.
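
    A hypothetical call, mirroring the compute_kmeans example above (the
    channel list, rate and filename below are illustrative assumptions):

    >>> from gwpy.time import tconvert, from_gps
    >>> from datetime import timedelta
    >>> from cluster import representative_spectra
    >>>
    >>> channels = ['L1:ISI-GND_STS_ETMX_Z_BLRMS_1_3.mean,m-trend', 'L1:ISI-GND_STS_ETMY_Z_BLRMS_1_3.mean,m-trend']
    >>> groups = [[channels, ('ETMX', 'ETMY'), 'L1:ISI-GND_STS_BLRMS_1_3 Z-axis']]
    >>>
    >>> stop = from_gps(60 * (int(tconvert('now')) // 60))  # nearest minute to now
    >>> start = stop - timedelta(days=1)
    >>> representative_spectra(channels, start, stop, rate=256, filename='my_kmeans.hdf5', groups=groups)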
    """
    if groups is None:
        groups = channels

    # read the labels from the save file.
    labels = TimeSeries.read(filename,
                             label,
                             start=to_gps(start),
                             end=to_gps(stop))
    logger.info(f'Read labels {start} to {stop} from {filename}')

    if cluster_numbers is None:
        clusters = list(range(max(labels.value) + 1))

        cluster_counts = list(
            len(labels.value[labels.value == c]) for c in clusters)
        largest_cluster = cluster_counts.index(max(cluster_counts))
        clusters.remove(largest_cluster)

        logger.info(
            f'Largest cluster found to be Nº{largest_cluster} ({100 * max(cluster_counts) // len(labels.value)}%). Doing {clusters}.'
        )
        cluster_counts.remove(max(cluster_counts))
    else:
        clusters = cluster_numbers
        cluster_counts = list(
            len(labels.value[labels.value == c]) for c in clusters)

    t, v, d = labels.times, labels.value, diff(labels.value)

    pairs = list(
        zip([t[0]] + list(t[:-1][d != 0]),
            list(t[1:][d != 0]) + [t[-1]]))
    values = list(v[:-1][d != 0]) + [v[-1]]
    assert len(pairs) == len(values)  # need to include start-| and |-end
    # Consecutive label-change boundaries are zipped into (start, end) intervals,
    # one interval per run of identical labels: zip(start + l[1:], r[:-1] + stop).

    print(pairs)
    for pair in pairs:
        print(int(pair[1].value) - int(pair[0].value))
    print(values)

    # use h5py to make a mutable object pointing to a file on disk.
    save_file, filename = path2h5file(
        get_path(f'spectra-cache {start}', 'hdf5', prefix=prefix))
    logger.debug(f'Initiated hdf5 stream to {filename}')

    logger.info(f'Patching {filename}...')
    for i, (dl_start, end) in enumerate(pairs):
        if values[i] in clusters:
            if not data_exists(channels, to_gps(end).seconds, save_file):
                logger.debug(
                    f'Downloading Nº{values[i]} from {dl_start} to {end}...')
                try:
                    dl = downloader(channels,
                                    start=to_gps(dl_start) - LIGOTimeGPS(60),
                                    end=to_gps(end) + LIGOTimeGPS(seconds=1))
                    out = TimeSeriesDict()
                    for n in dl:
                        out[n] = dl[n].resample(**better_aa_opts(dl[n], rate))
                    write_to_disk(out, to_gps(dl_start).seconds, save_file)
                except RuntimeError:  # Cannot find all relevant data on any known server
                    logger.warning(
                        f"SKIPPING Nº{values[i]} from {dl_start} to {end} !!")

    logger.info('Reading data...')
    data = TimeSeriesDict.read(save_file, channels)

    logger.info('Starting PSD generation...')

    f = data[channels[0]].crop(
        start=to_gps(data[channels[0]].times[-1]) - LIGOTimeGPS(60),
        end=to_gps(data[channels[0]].times[-1])).psd().frequencies

    d = (to_gps(labels.times[-1]).seconds - to_gps(labels.times[1]).seconds)
    for i, cluster in enumerate(clusters):
        try:
            psds = {
                channel: FrequencySeries.read(filename, f'{cluster}-{channel}')
                for channel in channels
            }
            logger.info(f'Loaded Nº{cluster}.')

        except KeyError:

            logger.info(
                f'Doing Nº{cluster} ({100 * cluster_counts[i] / len(labels.value):.2f}% of data)...'
            )
            with Progress(f'psd Nº{cluster} ({i + 1}/{len(clusters)})',
                          len(channels) * d) as progress:
                psds = {
                    channel: FrequencySeries(median(stack([
                        progress(data[channel].crop,
                                 pc * d + (to_gps(time).seconds -
                                           to_gps(labels.times[1]).seconds),
                                 start=to_gps(time) - LIGOTimeGPS(60),
                                 end=to_gps(time)).psd().value
                        for c, time in zip(labels.value, labels.times)
                        if c == cluster
                    ]),
                                                    axis=0),
                                             frequencies=f,
                                             name=f'{cluster}-{channel}')
                    for pc, channel in enumerate(channels)
                }
            for name in psds.keys():
                psds[name].write(filename, **writing_opts)

        # plotting is slow, so show a nice progress bar.
        logger.debug('Initiating plotting routine...')
        with Progress('plotting', len(groups)) as progress:

            for p, (group, lbls, title) in enumerate(groups):
                # plot the group in one figure.
                plt = Plot(*(psds[channel] for channel in group),
                           separate=False,
                           sharex=True,
                           zorder=1,
                           **kwargs)
                # plt.gca().set_xlim((30,60))
                # modify the figure as a whole.
                # plt.add_segments_bar(dq, label='')
                plt.gca().set_xscale('log')
                plt.gca().set_yscale('log')
                plt.suptitle(title)
                plt.legend(lbls)

                # save to png.
                progress(
                    plt.save, p,
                    get_path(f'{cluster}-{title}',
                             'png',
                             prefix=f'{prefix}/{cluster}'))
def train(train_story,
          train_questions,
          train_qstory,
          memory,
          model,
          loss,
          general_config,
          train_logger,
          val_logger,
          global_batch_iter=0,
          best_val_cost=1000000.,
          best_val_err=1000000.):

    train_config = general_config.train_config
    dictionary = general_config.dictionary
    nepochs = general_config.nepochs
    nhops = general_config.nhops
    batch_size = general_config.batch_size
    enable_time = general_config.enable_time
    randomize_time = general_config.randomize_time
    lrate_decay_step = general_config.lrate_decay_step

    train_range = general_config.train_range  # indices of training questions
    val_range = general_config.val_range  # indices of validation questions
    train_len = len(train_range)
    val_len = len(val_range)

    display_inteval = general_config.display_inteval

    params = {
        "lrate": train_config["init_lrate"],
        "max_grad_norm": train_config["max_grad_norm"]
    }

    if randomize_time > 0:
        print('Using a Random Noise (RN) ratio of %.1f' % randomize_time)

    # Initialize the best model/memory so they are defined even if validation
    # never improves on the incoming best_val_cost.
    best_model = model
    best_memory = memory

    # train/val start
    for ep in range(nepochs):
        # Decrease learning rate after every decay step
        if (ep + 1) % lrate_decay_step == 0:
            params["lrate"] *= 0.5

        total_err = 0.
        total_cost = 0.
        total_num = 0
        batch_iter = 0
        for _ in Progress(range(int(math.floor(train_len / batch_size)))):
            # Question batch
            batch = train_range[np.random.randint(train_len, size=batch_size)]

            input_data = np.zeros((train_story.shape[0], batch_size),
                                  np.float32)  # words of training questions
            target_data = train_questions[2,
                                          batch]  # indices of training answers

            memory[0].data[:] = dictionary["nil"]

            # Compose batch of training data
            for b in range(batch_size):
                # NOTE: +1 since train_questions[1, :] is the index of the sentence right before the training question.
                # d is a batch of [word indices in sentence, sentence indices from batch] for this story
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]

                # Pick a fixed number of latest sentences (before the question) from the story
                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Training data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    # Inject noise into time index (i.e. word index)
                    if randomize_time > 0:
                        # Random number of blanks (must be < total sentences before the training question?)
                        nblank = np.random.randint(
                            int(math.ceil(d.shape[1] * randomize_time)))
                        rt = np.random.permutation(d.shape[1] + nblank)

                        rt[rt >= train_config["sz"]] = train_config[
                            "sz"] - 1  # put the cap

                        # Add random time (must be > dictionary's length) into the time word (decreasing order)
                        memory[0].data[-1, :d.shape[1], b] = np.sort(
                            rt[:d.shape[1]])[::-1] + len(dictionary)

                    else:
                        memory[0].data[-1, :d.shape[1], b] = \
                          np.arange(d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model.fprop(input_data)
            cost = loss.fprop(out, target_data)
            err = loss.get_error(out, target_data)
            total_cost += cost
            total_err += err
            total_num += batch_size

            grad = loss.bprop(out, target_data)
            model.bprop(input_data, grad)
            model.update(params)
            batch_iter += 1

            global_batch_iter += 1

            if batch_iter % display_inteval == 0:
                print("%d | %d | %g | loss: %g | err: %g" %
                      (ep, global_batch_iter, params['lrate'],
                       cost / batch_size, err / batch_size))
                sys.stdout.flush()
                train_logger.write('%d %d %f %f %f\n' %
                                   (ep, global_batch_iter, params['lrate'],
                                    cost / batch_size, err / batch_size))
                train_logger.flush()

            for i in range(nhops):
                memory[i].emb_query.weight.D[:, 0] = 0

        # Validation
        total_val_err = 0.
        total_val_cost = 0.
        total_val_num = 0
        for k in range(int(math.floor(val_len / batch_size))):
            batch = val_range[np.arange(k * batch_size, (k + 1) * batch_size)]
            input_data = np.zeros((train_story.shape[0], batch_size),
                                  np.float32)
            target_data = train_questions[2, batch]

            memory[0].data[:] = dictionary["nil"]

            for b in range(batch_size):
                d = train_story[:, :(1 + train_questions[1, batch[b]]),
                                train_questions[0, batch[b]]]

                offset = max(0, d.shape[1] - train_config["sz"])
                d = d[:, offset:]

                # Data for the 1st memory cell
                memory[0].data[:d.shape[0], :d.shape[1], b] = d

                if enable_time:
                    memory[0].data[-1, :d.shape[1], b] = np.arange(
                        d.shape[1])[::-1] + len(dictionary)

                input_data[:, b] = train_qstory[:, batch[b]]

            for i in range(1, nhops):
                memory[i].data = memory[0].data

            out = model.fprop(input_data)
            val_cost = loss.fprop(out, target_data)
            val_err = loss.get_error(out, target_data)
            total_val_cost += val_cost
            total_val_err += val_err
            total_val_num += batch_size

        current_val_cost = total_val_cost / total_val_num
        current_val_err = total_val_err / total_val_num
        print("%d | %d | val loss: %g | val err: %g" %
              (ep, global_batch_iter, current_val_cost, current_val_err))
        sys.stdout.flush()

        if best_val_cost > current_val_cost:
            print('current: %f, best: %f' % (current_val_cost, best_val_cost))
            best_model = model
            best_memory = memory
            best_val_cost = current_val_cost
            best_val_err = current_val_err
            print('Best val loss: %f, val err: %f' %
                  (best_val_cost, best_val_err))
            sys.stdout.flush()

        train_error = total_err / total_num
        val_error = total_val_err / total_val_num

        val_logger.write('%d %d %f %f %f\n' % \
          (ep, global_batch_iter, params['lrate'], current_val_cost, current_val_err))
        val_logger.flush()

    return train_logger,\
           val_logger,\
           best_model,\
           best_memory,\
           global_batch_iter,\
           best_val_cost,\
           best_val_err
def compute_all(channels,
                start,
                stop,
                history=timedelta(hours=2),
                filename=DEFAULT_FILENAME,
                **kwargs):
    # set up duration (minute-trend data has dt=1min, so reject intervals not on the minute).
    duration = (stop - start).total_seconds() / 60
    assert (stop - start).total_seconds() / 60 == (stop -
                                                   start).total_seconds() // 60
    duration = int(duration)
    logger.info(
        f'Clustering data from {start} to {stop} ({duration} minutes).')

    # download data using TimeSeries.get(), including history of point at t0.
    logger.debug(
        f'Initiating download from {start} to {stop} with history={history}...'
    )
    dl = TimeSeriesDict.get(channels,
                            start=to_gps(start - history),
                            end=to_gps(stop))
    logger.info(f'Downloaded from {start} to {stop} with history={history}.')

    if exists('input.npy'):
        input_data = np.load('input.npy')
        logger.info('Loaded input matrix.')
    else:
        # generate input matrix of the form [sample1;...;sampleN] with sampleK = [feature1,...,featureN]
        # for sklearn.cluster algorithms. This is the slow part of the function, so a progress bar is shown.
        logger.debug(f'Initiating input matrix generation...')
        with Progress('building input', (duration * 60)) as progress:
            input_data = stack([
                concatenate([
                    progress(dl[channel].crop,
                             t,
                             start=to_gps(start + timedelta(seconds=t) -
                                          history),
                             end=to_gps(start + timedelta(seconds=t))).value
                    for channel in channels
                ]) for t in range(0, int(duration * 60), 60)
            ])

        # verify input matrix dimensions.
        assert input_data.shape == (duration,
                                    int(
                                        len(channels) *
                                        history.total_seconds() / 60))
        np.save('input.npy', input_data)
        logger.info('Completed input matrix generation.')

    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 15,
        'min_samples': 20,
        'xi': 0.05,
        'min_cluster_size': 0.1
    }

    if exists('X.npy'):
        X = np.load('X.npy')
        logger.info('Loaded X')
    else:
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(input_data)
        np.save('X.npy', X)
        logger.info('Generated X')

    if exists('bandwidth.npy'):
        bandwidth = np.load('bandwidth.npy')
        logger.info('Loaded bandwidth')
    else:
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
        np.save('bandwidth.npy', bandwidth)
        logger.info('Generated bandwidth')

    if exists('connectivity.npy'):
        connectivity = np.load('connectivity.npy', allow_pickle=True)
        logger.info('Loaded connectivity')
    else:
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X,
                                        n_neighbors=params['n_neighbors'],
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        np.save('connectivity.npy', connectivity)
        logger.info('Generated connectivity')

    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    optics = cluster.OPTICS(min_samples=params['min_samples'],
                            xi=params['xi'],
                            min_cluster_size=params['min_cluster_size'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')

    clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('AffinityPropagation', affinity_propagation), ('MeanShift', ms),
        ('SpectralClustering', spectral), ('DBSCAN', dbscan),
        ('OPTICS', optics), ('Birch', birch), ('GaussianMixture', gmm)
        # ('Ward', ward),
        # ('AgglomerativeClustering', average_linkage),
    )

    for name, algorithm in clustering_algorithms:
        if exists(f'part-{name}-{filename}'):
            labels = TimeSeries.read(f'part-{name}-{filename}',
                                     f'{name}-labels')
            logger.debug(f'LOADED {name}.')
        else:
            logger.debug(f'doing {name}...')
            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                algorithm.fit(X)

            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)  # np.int is removed in newer NumPy
            else:
                y_pred = algorithm.predict(X)
            # cast the output labels to a TimeSeries so that cropping is easy later on.
            labels = TimeSeries(
                y_pred,
                times=dl[channels[0]].crop(start=to_gps(start),
                                           end=to_gps(stop)).times,
                name=f'{name}-labels')

            labels.write(f'part-{name}-{filename}')
        # put labels in data download dictionary for easy saving.
        dl[labels.name] = labels

    # write data download and labels to specified filename.
    cache_file = abspath(filename)
    if exists(cache_file):
        remove(cache_file)
    dl.write(cache_file)
    logger.info(f'Wrote cache to {filename}')
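
# A hypothetical call to compute_all, mirroring the compute_kmeans docstring
# example above (the channel names and the output filename are illustrative
# assumptions only):
def _example_compute_all():
    from datetime import timedelta
    from gwpy.time import tconvert, from_gps
    channels = ['L1:ISI-GND_STS_ETMX_Z_BLRMS_1_3.mean,m-trend',
                'L1:ISI-GND_STS_ETMY_Z_BLRMS_1_3.mean,m-trend']
    stop = from_gps(60 * (int(tconvert('now')) // 60))  # nearest minute to now
    start = stop - timedelta(days=1)                    # cluster the past day
    compute_all(channels, start, stop, filename='my_clusters.hdf5')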