Beispiel #1
0
    def _predict(self, tweet_reps):
        """Predict a class index for every entry of ``tweet_reps``, batch by batch.

        Batches are produced by ``self._next_batch``; the model is evaluated
        with a keep probability of 1.

        Args:
            tweet_reps: sized collection of tweet representations, passed on
                to ``self._next_batch`` together with the current position.

        Returns:
            A list holding one predicted class index (argmax over axis 1 of
            the model output) per entry of ``tweet_reps``.

        Raises:
            AssertionError: if the entries of a batch differ in length, or if
                the number of collected predictions does not match the input.
        """
        with_output = not self.PARAMS.suppress_output

        i = 0
        top = len(tweet_reps)
        predictions = []
        if with_output:
            label = 'predicting'
            status_update(i, top, label)

        # Build the prediction op once. Creating it inside the loop would add
        # new nodes to the TensorFlow graph for every single batch.
        predicted_classes = tf.argmax(self._model, 1)

        while i < top:
            # Number of not-yet-predicted entries covered by this batch.
            # NOTE(review): presumably _next_batch fills a short final batch
            # up to batch_size, which is why only the first s results are
            # kept - confirm against _next_batch.
            s = min(i + self.PARAMS.batch_size, top) - i

            i, xs = self._next_batch(tweet_reps, i)

            assert all(
                len(x) == len(xs[0]) for x in xs
            ), f'ERR: differently-sized tensors: min={min([len(x) for x in xs])}, max={max([len(x) for x in xs])}'

            preds = self._session.run(predicted_classes,
                                      feed_dict={
                                          self._x_input: xs,
                                          self._keep_prob: 1
                                      })
            # Trim in NumPy instead of adding a tf.slice op per batch.
            predictions.extend(preds[:s])
            if with_output:
                status_update(i, top, label)

        assert len(predictions) == len(tweet_reps)

        return predictions
Beispiel #2
0
    def _train(self, tweet_reps):
        """Run ``self.PARAMS.nof_iterations`` training epochs over ``tweet_reps``.

        Each epoch shuffles ``tweet_reps`` in place, feeds it to the train
        step batch by batch (batches come from ``self._next_batch``) and calls
        ``self._save_clf()`` after every batch. Unless output is suppressed,
        progress is reported and, every ``print_frequency`` epochs, the
        accuracy on the training data itself is printed.

        Args:
            tweet_reps: mutable sequence whose batches unzip into
                ``(input, label)`` pairs.
        """
        show_progress = not self.PARAMS.suppress_output
        total = len(tweet_reps)

        for epoch in range(self.PARAMS.nof_iterations):
            position = 0
            if show_progress:
                label = 'epoch {:>2}'.format(epoch + 1)
                status_update(position, total, label)

            nprand.shuffle(tweet_reps)
            while position < total:
                position, batch = self._next_batch(tweet_reps, position)
                inputs, targets = zip(*batch)

                feed = {
                    self._x_input: inputs,
                    self._y_input: targets,
                    self._keep_prob: self.PARAMS.dropout_keep_probability,
                }
                self._session.run(self._train_step, feed_dict=feed)
                self._save_clf()
                if show_progress:
                    status_update(position, total, label)

            # Accuracy reporting only happens when output is enabled.
            if not show_progress:
                continue
            if epoch % self.PARAMS.print_frequency == 0:
                accuracy = self._test(tweet_reps)
                print(f'accuracy(training set) = {accuracy}')
Beispiel #3
0
    def _test(self, tweet_reps):
        """Return the fraction of entries in ``tweet_reps`` that are classified correctly.

        Batches come from ``self._next_batch``; the model is evaluated with a
        keep probability of 1.

        Args:
            tweet_reps: sized collection whose batches unzip into
                ``(input, label)`` pairs.

        Returns:
            Number of correct predictions divided by ``len(tweet_reps)``.

        Raises:
            ZeroDivisionError: if ``tweet_reps`` is empty.
        """
        with_output = not self.PARAMS.suppress_output

        top = len(tweet_reps)

        i = 0
        nof_hits = 0
        if with_output:
            label = 'calculating accuracy'
            status_update(i, top, label)

        while i < top:
            # Number of not-yet-evaluated entries covered by this batch.
            # NOTE(review): presumably _next_batch fills a short final batch
            # up to batch_size, so only the first s results count - confirm.
            s = min(i + self.PARAMS.batch_size, top) - i
            i, batch = self._next_batch(tweet_reps, i)
            xs, ys = zip(*batch)

            # Fetch the existing tensor and count in NumPy. Building
            # reduce_sum/cast/slice ops here would grow the graph per batch.
            correct = self._session.run(self._correct_predictions,
                                        feed_dict={
                                            self._x_input: xs,
                                            self._y_input: ys,
                                            self._keep_prob: 1
                                        })
            nof_hits += float(correct[:s].sum())
            if with_output:
                status_update(i, top, label)

        return nof_hits / top
Beispiel #4
0
    def _predict(self, tweet_reps):
        """Predict a class index for each entry of ``tweet_reps`` in fixed-size batches.

        Every batch fed to the session holds exactly ``PARAMS.batch_size``
        entries: a short final batch is filled up with copies of its first
        entry, and the predictions for those fillers are dropped.

        Progress is only reported when output is not suppressed and
        ``PARAMS.use_padding`` is set.

        Args:
            tweet_reps: sliceable, sized collection of tweet representations.

        Returns:
            A list with one predicted class index (argmax over axis 1 of the
            model output) per entry of ``tweet_reps``, in input order.
        """
        nof_tweets = len(tweet_reps)
        predictions = []

        i = 0
        do_output = not PARAMS.suppress_output and PARAMS.use_padding
        if do_output:
            top = nof_tweets
            status_update(i, top)

        # Build the prediction op once. Creating it inside the loop would add
        # new nodes to the TensorFlow graph for every single batch.
        predicted_classes = tf.argmax(self._model, 1)

        while i < nof_tweets:
            k = min(i + PARAMS.batch_size, nof_tweets)
            batch = list(tweet_reps[i:k])

            # Fill a short batch up to the fixed batch size.
            missing = PARAMS.batch_size - len(batch)
            if missing > 0:
                batch += [batch[0]] * missing
                if self.debug:
                    assert len(batch) == PARAMS.batch_size

            preds = self._session.run(predicted_classes,
                                      feed_dict={
                                          self._x_input: batch,
                                          self._keep_prob: 1
                                      })

            # Keep only the predictions for real entries, not the fillers.
            predictions += preds[:k - i].tolist()
            i += PARAMS.batch_size

            if do_output:
                # i may overshoot the total after the last batch.
                status_update(i, max([i, top]))

        return predictions
Beispiel #5
0
    def _train_all_padded(self, tdict):
        """Train on all samples of ``tdict`` after padding them via ``self._pad``.

        For ``PARAMS.nof_iterations`` epochs the padded samples are shuffled
        and fed to the train step in batches of exactly ``PARAMS.batch_size``.
        A short final batch is topped up with randomly drawn samples. The
        variables in ``self._tf_variables`` are saved to ``self._save_as``
        after every epoch. Every ``PARAMS.print_frequency`` epochs the
        accuracy on the training data is printed unless output is suppressed.

        Args:
            tdict: training data in the form accepted by ``self._pad``; the
                padded result is indexed as a sequence of ``(x, y)`` pairs.
        """
        def random_selection(lst, amount=PARAMS.batch_size):
            """Draw ``amount`` random pairs from ``lst`` (with replacement), unzipped."""
            idxs = nprand.randint(0, len(lst), amount)
            return zip(*[lst[i] for i in idxs])

        padded = self._pad(tdict)
        # Length of the first padded input; used as the key for _test below.
        len_max = len(padded[0][0])

        if not PARAMS.suppress_output:
            print("training...")

        top = len(padded)

        with self._session.as_default():
            # Create the Saver once. Constructing one per epoch would add a
            # fresh set of save/restore ops to the graph every time.
            saver = tf.train.Saver(self._tf_variables)

            for epoch in range(PARAMS.nof_iterations):
                if epoch % PARAMS.print_frequency == 0 and not PARAMS.suppress_output:
                    acc = self._test({len_max: padded})
                    print('accuracy(training set) =', acc)

                label = f'epoch {epoch}'
                curr = 0
                if not PARAMS.suppress_output:
                    status_update(curr, top, label=label)

                nprand.shuffle(padded)

                xs, ys = zip(*padded)
                i = 0
                while i < len(xs):
                    k = min(i + PARAMS.batch_size, len(xs))
                    batch_xs = list(xs[i:k])
                    batch_ys = list(ys[i:k])

                    # Top up a short final batch with random samples so the
                    # batch always has the fixed size.
                    missing = PARAMS.batch_size - len(batch_xs)
                    if missing > 0:
                        miss_xs, miss_ys = random_selection(padded,
                                                            amount=missing)
                        batch_xs += miss_xs
                        batch_ys += miss_ys
                        if self.debug:
                            assert len(batch_xs) == PARAMS.batch_size
                            assert len(batch_ys) == PARAMS.batch_size

                    feed_dict = {
                        self._x_input: batch_xs,
                        self._y_input: batch_ys,
                        self._keep_prob: PARAMS.dropout_keep_probability
                    }
                    self._session.run(self._train_step, feed_dict=feed_dict)
                    i += PARAMS.batch_size

                    if not PARAMS.suppress_output:
                        status_update(i, top, label=label)

                if not PARAMS.suppress_output:
                    print('saving ...')
                saver.save(self._session,
                           self._save_as,
                           write_meta_graph=False)
Beispiel #6
0
    def predict(self, tweets):
        """Predict a class for every tweet and return the results in input order.

        Each tweet is converted with ``self._representation``. With
        ``PARAMS.use_padding`` all representations are zero-padded at the end
        to the longest one and predicted in a single ``self._predict`` call.
        Otherwise the representations are sorted by length, each run of
        equal-length representations is predicted separately, and the original
        order is restored afterwards.

        Args:
            tweets: iterable of tweets accepted by ``self._representation``.

        Returns:
            A list with one prediction per tweet; ``[]`` for empty input.

        Raises:
            AssertionError: if ``self._predict`` returns an unexpected number
                of predictions (non-padding mode only).
        """
        tweet_reps = [self._representation(t) for t in tweets]
        nof_tweets = len(tweet_reps)

        # Nothing to predict. Without this guard both branches below fail on
        # empty input (max of an empty list / unpacking an empty zip).
        if nof_tweets == 0:
            return []

        if self.debug:
            print(f'predicting {nof_tweets} tweets')

        if PARAMS.use_padding:
            if not PARAMS.suppress_output:
                print('predicting with padding')
            max_len = max(len(tr) for tr in tweet_reps)
            tweet_reps = [
                np.pad(tr, pad_width=(0, max_len - len(tr)), mode='constant')
                for tr in tweet_reps
            ]
            return self._predict(tweet_reps)

        predictions = []

        # The CNN expects all samples in a batch to have the same length, so
        # sort by length and remember where each representation came from.
        permutation, tweet_reps = zip(
            *sorted(enumerate(tweet_reps), key=lambda x: len(x[1])))
        i = 0
        j = 1
        if not PARAMS.suppress_output:
            status_update(i, nof_tweets, label="Predicting")
        while i < nof_tweets:
            # Extend [i, j) over the run of representations of equal length.
            curr_len = len(tweet_reps[i])
            while j < nof_tweets and len(tweet_reps[j]) == curr_len:
                j += 1

            curr_preds = self._predict(tweet_reps[i:j])
            predictions += curr_preds

            assert len(
                curr_preds
            ) == j - i, f"ERR: i,j = {i,j}; expected {j-i} predictions but got {len(curr_preds)}!\ncurr = {curr_preds}\nreps={tweet_reps[i:j]}"

            i = j
            j = i + 1
            if not PARAMS.suppress_output:
                status_update(i, nof_tweets, label="Predicting")

        assert len(
            predictions
        ) == nof_tweets, f"ERR: expected {nof_tweets} predictions but got {len(predictions)}"

        # Undo the length sort so predictions line up with the input tweets.
        inverse_permutation = np.argsort(permutation)

        return [predictions[i] for i in inverse_permutation]
Beispiel #7
0
    def _train_random(self, tdict):
        """Train on random batches drawn separately for each key of ``tdict``.

        In each of ``PARAMS.nof_iterations`` iterations one random batch of
        ``PARAMS.batch_size`` samples (drawn with replacement) is fed to the
        train step for every key of ``tdict``. The variables in
        ``self._tf_variables`` are saved to ``self._save_as`` after every
        iteration. Every ``PARAMS.print_frequency`` iterations the accuracy on
        one random batch of a randomly chosen key is evaluated and, unless
        output is suppressed, printed.

        Args:
            tdict: mapping whose values are indexable sequences of ``(x, y)``
                pairs. NOTE(review): the keys look like sample lengths, judging
                by the progress label - confirm against the caller.
        """
        def next_batch(ex_len, amount=PARAMS.batch_size):
            """Draw ``amount`` random pairs from ``tdict[ex_len]``, unzipped into xs and ys."""
            exs = tdict[ex_len]
            idxs = nprand.randint(0, len(exs), amount)
            return zip(*[exs[i] for i in idxs])

        # TRAINING
        with self._session.as_default():
            # Create the Saver once. Constructing one per iteration would add
            # a fresh set of save/restore ops to the graph every time.
            saver = tf.train.Saver(self._tf_variables)

            for it in range(PARAMS.nof_iterations):
                if it % PARAMS.print_frequency == 0:
                    # Spot-check accuracy on one random batch of a random key.
                    k = nprand.randint(0, len(tdict.keys()))
                    k = list(tdict.keys())[k]
                    xs, ys = next_batch(k)
                    feed_dict = {
                        self._x_input: xs,
                        self._y_input: ys,
                        self._keep_prob: 1
                    }
                    accuracy = self._accuracy.eval(feed_dict=feed_dict)
                    if not PARAMS.suppress_output:
                        print(
                            f'iteration {it}; acc(random sample) = {accuracy}')

                    if self.debug:
                        output = self._session.run(self._model,
                                                   feed_dict=feed_dict)
                        print('expected', ys)
                        print('got', output)

                top = len(tdict.keys()) - 1
                for curr, ex_len in enumerate(tdict.keys()):
                    if not PARAMS.suppress_output:
                        status_update(curr, top,
                                      label=f'training size {ex_len}')
                    xs, ys = next_batch(ex_len)
                    feed_dict = {
                        self._x_input: xs,
                        self._y_input: ys,
                        self._keep_prob: PARAMS.dropout_keep_probability
                    }
                    self._session.run(self._train_step, feed_dict=feed_dict)

                if not PARAMS.suppress_output:
                    print('saving ...')
                saver.save(self._session,
                           self._save_as,
                           write_meta_graph=False)
Beispiel #8
0
    def _test(self, tdict):
        """Return the fraction of samples in ``tdict`` that are classified correctly.

        Each value of ``tdict`` is shuffled in place and evaluated in batches
        of exactly ``PARAMS.batch_size``. A short final batch is filled up
        with copies of its first sample, and the results for those fillers
        are ignored.

        Args:
            tdict: mapping whose values are mutable sequences of ``(x, y)``
                pairs.

        Returns:
            Number of correct predictions divided by the total number of
            samples over all values of ``tdict``.

        Raises:
            ZeroDivisionError: if ``tdict`` contains no samples at all.
        """
        nof_hits = 0
        nof_samples = 0

        top = len(list(tdict.keys()))
        curr = 0
        label = 'calculating accuracy'
        if not PARAMS.suppress_output:
            status_update(curr, top, label=label)
        for tpl_lst in tdict.values():
            nprand.shuffle(tpl_lst)
            nof_samples += len(tpl_lst)
            xs, ys = zip(*tpl_lst)
            i = 0
            while i < len(xs):
                k = min(i + PARAMS.batch_size, len(xs))
                batch_xs = list(xs[i:k])
                batch_ys = list(ys[i:k])

                # Fill a short batch up to the fixed batch size.
                missing = PARAMS.batch_size - len(batch_xs)
                if missing > 0:
                    batch_xs += [batch_xs[0]] * missing
                    batch_ys += [batch_ys[0]] * missing
                    if self.debug:
                        assert len(batch_xs) == PARAMS.batch_size
                        assert len(batch_ys) == PARAMS.batch_size

                # Fetch the existing tensor and count in NumPy. Building
                # reduce_sum/cast/slice ops here would grow the graph per batch.
                correct = self._session.run(self._correct_predictions,
                                            feed_dict={
                                                self._x_input: batch_xs,
                                                self._y_input: batch_ys,
                                                self._keep_prob: 1
                                            })
                # Only the first k - i entries belong to real samples.
                nof_hits += float(correct[:k - i].sum())
                i += PARAMS.batch_size

            if not PARAMS.suppress_output:
                curr += 1
                status_update(curr, top, label=label)

        return nof_hits / nof_samples
def kmeans(inputMatrix, k, iterationCount=20):
    """Cluster the rows of ``inputMatrix`` into ``k`` groups with Lloyd's k-means.

    The initial centers are ``k`` distinct rows of ``inputMatrix`` chosen at
    random. Each iteration assigns every row to its nearest center (Euclidean
    distance, the lowest index wins ties) and then moves each center to the
    mean of the rows assigned to it. A center that ends up with no rows keeps
    its previous position instead of turning into NaN.

    Args:
        inputMatrix: 2-D NumPy array with one point per row.
        k: number of clusters; ``np.random.choice`` rejects values larger
            than the number of rows.
        iterationCount: number of assign/update rounds to run.

    Returns:
        Array of shape ``(k, dimensions)`` holding the final cluster centers.
    """
    numberOfPoints = inputMatrix.shape[0]
    dimensions = inputMatrix.shape[1]

    # Initial labels; they are overwritten during the first assignment pass.
    # Drawn before the seed rows so seeded runs pick the same seeds as before.
    clusterNumber = np.random.randint(0, k, numberOfPoints)

    centers = np.empty([k, dimensions])
    randomNumbers = np.random.choice(numberOfPoints, k, replace=False)
    for i in range(k):
        centers[i] = inputMatrix[randomNumbers[i]]

    for iteration in range(iterationCount):

        # -------------------- cluster assignment --------------------
        for i in range(numberOfPoints):
            if i % 1000 == 0:
                # NOTE(review): the total passed here can be smaller than the
                # running position for some sizes - check what status_update
                # expects.
                status_update(i+iteration*numberOfPoints,(numberOfPoints-1)*iterationCount-1)

            # Distance from this point to every center at once; argmin
            # returns the first minimum, matching a strict "<" scan.
            distances = np.linalg.norm(centers - inputMatrix[i], axis=1)
            clusterNumber[i] = np.argmin(distances)

        # -------------------- new mean computation --------------------
        sums = np.zeros([k, dimensions])
        np.add.at(sums, clusterNumber, inputMatrix)
        counts = np.bincount(clusterNumber, minlength=k)

        # Only move centers that own at least one point. Dividing by a zero
        # count would produce a NaN center that never recovers.
        occupied = counts > 0
        centers[occupied] = sums[occupied] / counts[occupied][:, None]

    return centers