def _predict(self, tweet_reps):
    with_output = not self.PARAMS.suppress_output
    i = 0
    top = len(tweet_reps)
    predictions = []
    if with_output:
        label = 'predicting'
        status_update(i, top, label)
    while i < top:
        s = min(i + self.PARAMS.batch_size, top) - i
        i, xs = self._next_batch(tweet_reps, i)
        assert all(len(x) == len(xs[0]) for x in xs), \
            f'ERR: differently-sized tensors: min={min(len(x) for x in xs)}, max={max(len(x) for x in xs)}'
        preds = self._session.run(
            tf.slice(tf.argmax(self._model, 1), [0], [s]),
            feed_dict={self._x_input: xs, self._keep_prob: 1})
        predictions.extend(preds)
        if with_output:
            status_update(i, top, label)
    assert len(predictions) == len(tweet_reps)
    return predictions

def _train(self, tweet_reps):
    with_output = not self.PARAMS.suppress_output
    top = len(tweet_reps)
    for e in range(self.PARAMS.nof_iterations):
        i = 0
        if with_output:
            label = 'epoch {:>2}'.format(e + 1)
            status_update(i, top, label)
        nprand.shuffle(tweet_reps)
        while i < top:
            i, batch = self._next_batch(tweet_reps, i)
            xs, ys = zip(*batch)
            self._session.run(
                self._train_step,
                feed_dict={
                    self._x_input: xs,
                    self._y_input: ys,
                    self._keep_prob: self.PARAMS.dropout_keep_probability
                })
            self._save_clf()
            if with_output:
                status_update(i, top, label)
        if with_output and e % self.PARAMS.print_frequency == 0:
            accuracy = self._test(tweet_reps)
            print(f'accuracy(training set) = {accuracy}')

def _test(self, tweet_reps):
    with_output = not self.PARAMS.suppress_output
    top = len(tweet_reps)
    i = 0
    nof_hits = 0
    if with_output:
        label = 'calculating accuracy'
        status_update(i, top, label)
    while i < top:
        s = min(i + self.PARAMS.batch_size, top) - i
        i, batch = self._next_batch(tweet_reps, i)
        xs, ys = zip(*batch)
        nof_hits += self._session.run(
            tf.reduce_sum(
                tf.cast(tf.slice(self._correct_predictions, [0], [s]),
                        tf.float32)),
            feed_dict={
                self._x_input: xs,
                self._y_input: ys,
                self._keep_prob: 1
            })
        if with_output:
            status_update(i, top, label)
    return nof_hits / top

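# The methods above rely on a module-level `status_update(curr, top, label='')`
# progress helper defined elsewhere in this project. A minimal sketch of the
# assumed contract (hypothetical name and rendering; the real helper may differ):
def _status_update_sketch(curr, top, label=''):
    """Print an in-place progress line such as `epoch  1 [####      ]  40.0%`."""
    width = 40
    frac = (curr / top) if top else 1.0
    bar = '#' * int(frac * width)
    print(f'\r{label} [{bar:<{width}}] {frac:6.1%}', end='', flush=True)
    if curr >= top:
        print()  # move to a fresh line once complete
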
def _predict(self, tweet_reps):
    nof_tweets = len(tweet_reps)
    predictions = []
    i = 0
    do_output = not PARAMS.suppress_output and PARAMS.use_padding
    if do_output:
        top = nof_tweets
        status_update(i, top)
    while i < nof_tweets:
        k = min(i + PARAMS.batch_size, nof_tweets)
        batch = list(tweet_reps[i:k])
        missing = PARAMS.batch_size - len(batch)
        if missing > 0:
            # fill a short final batch by repeating its first element;
            # the surplus predictions are sliced off below
            batch += [batch[0]] * missing
        if self.debug:
            assert len(batch) == PARAMS.batch_size
        preds = self._session.run(
            tf.slice(tf.argmax(self._model, 1), [0], [k - i]),
            feed_dict={self._x_input: batch, self._keep_prob: 1})
        predictions += preds.tolist()
        i += PARAMS.batch_size
        if do_output:
            status_update(i, max(i, top))
    return predictions

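# The short-batch idiom used in `_predict` above (and in `_test` further down):
# the graph is built for a fixed batch size, so the last batch is topped up
# with copies of its first element and the surplus outputs are cut off via
# tf.slice after the run. Standalone sketch (hypothetical helper name):
def _fill_batch(batch, batch_size):
    """Right-pad `batch` to `batch_size` by repeating its first element."""
    missing = batch_size - len(batch)
    return (batch + [batch[0]] * missing) if missing > 0 else batch
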
def _train_all_padded(self, tdict):
    def random_selection(lst, amount=PARAMS.batch_size):
        idxs = nprand.randint(0, len(lst), amount)
        return zip(*[lst[i] for i in idxs])

    padded = self._pad(tdict)
    len_max = len(padded[0][0])
    if not PARAMS.suppress_output:
        print('training...')
    top = len(padded)
    with self._session.as_default():
        for epoch in range(PARAMS.nof_iterations):
            if epoch % PARAMS.print_frequency == 0 and not PARAMS.suppress_output:
                acc = self._test({len_max: padded})
                print('accuracy(training set) =', acc)
            label = f'epoch {epoch}'
            curr = 0
            if not PARAMS.suppress_output:
                status_update(curr, top, label=label)
            nprand.shuffle(padded)
            xs, ys = zip(*padded)
            i = 0
            while i < len(xs):
                k = min(i + PARAMS.batch_size, len(xs))
                batch_xs = list(xs[i:k])
                batch_ys = list(ys[i:k])
                missing = PARAMS.batch_size - len(batch_xs)
                if missing > 0:
                    # top up a short final batch with random training examples
                    miss_xs, miss_ys = random_selection(padded, amount=missing)
                    batch_xs += miss_xs
                    batch_ys += miss_ys
                if self.debug:
                    assert len(batch_xs) == PARAMS.batch_size
                    assert len(batch_ys) == PARAMS.batch_size
                feed_dict = {
                    self._x_input: batch_xs,
                    self._y_input: batch_ys,
                    self._keep_prob: PARAMS.dropout_keep_probability
                }
                self._session.run(self._train_step, feed_dict=feed_dict)
                i += PARAMS.batch_size
                if not PARAMS.suppress_output:
                    status_update(i, top, label=label)
        if not PARAMS.suppress_output:
            print('saving ...')
        saver = tf.train.Saver(self._tf_variables)
        saver.save(self._session, self._save_as, write_meta_graph=False)

def predict(self, tweets):
    tweet_reps = [self._representation(t) for t in tweets]
    nof_tweets = len(tweet_reps)
    if self.debug:
        print(f'predicting {nof_tweets} tweets')
    if PARAMS.use_padding:
        if not PARAMS.suppress_output:
            print('predicting with padding')
        max_len = np.max([len(tr) for tr in tweet_reps])
        tweet_reps = [
            np.pad(tr, pad_width=(0, max_len - len(tr)), mode='constant')
            for tr in tweet_reps
        ]
        return self._predict(tweet_reps)
    else:
        predictions = []
        # The CNN expects all samples in a batch to have the same length,
        # so sort the representations by length and predict one
        # equal-length group at a time.
        permutation, tweet_reps = zip(
            *sorted(enumerate(tweet_reps), key=lambda x: len(x[1])))
        i = 0
        j = 1
        if not PARAMS.suppress_output:
            status_update(i, nof_tweets, label='Predicting')
        while i < nof_tweets:
            curr_len = len(tweet_reps[i])
            while j < nof_tweets and len(tweet_reps[j]) == curr_len:
                j += 1
            curr_preds = self._predict(tweet_reps[i:j])
            predictions += curr_preds
            assert len(curr_preds) == j - i, (
                f'ERR: i,j = {i,j}; expected {j-i} predictions but got {len(curr_preds)}!'
                f'\ncurr = {curr_preds}\nreps={tweet_reps[i:j]}')
            i = j
            j = i + 1
            if not PARAMS.suppress_output:
                status_update(i, nof_tweets, label='Predicting')
        assert len(predictions) == nof_tweets, \
            f'ERR: expected {nof_tweets} predictions but got {len(predictions)}'
        # undo the length sort so predictions line up with the input tweets
        inverse_permutation = np.argsort(permutation)
        return [predictions[i] for i in inverse_permutation]

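# The sort-and-restore idiom used in `predict` above, shown standalone on toy
# data (hypothetical demo function; uses this module's `np` import): sorting
# groups equal-length items, and np.argsort of the permutation restores the
# original order afterwards.
def _sort_restore_demo():
    reps = [[1, 2, 3], [4], [5, 6], [7]]
    permutation, sorted_reps = zip(*sorted(enumerate(reps), key=lambda x: len(x[1])))
    results = [len(r) for r in sorted_reps]  # stand-in for per-group predictions
    inverse = np.argsort(permutation)
    restored = [results[i] for i in inverse]
    assert restored == [len(r) for r in reps]  # back in input order
    return restored
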
def _train_random(self, tdict):
    def next_batch(ex_len, amount=PARAMS.batch_size):
        exs = tdict[ex_len]
        idxs = nprand.randint(0, len(exs), amount)
        return zip(*[exs[i] for i in idxs])

    # TRAINING
    with self._session.as_default():
        for it in range(PARAMS.nof_iterations):
            if it % PARAMS.print_frequency == 0:
                # evaluate on a random batch from one random length bucket
                k = nprand.randint(0, len(tdict.keys()))
                k = list(tdict.keys())[k]
                xs, ys = next_batch(k)
                feed_dict = {
                    self._x_input: xs,
                    self._y_input: ys,
                    self._keep_prob: 1
                }
                accuracy = self._accuracy.eval(feed_dict=feed_dict)
                if not PARAMS.suppress_output:
                    print(f'iteration {it}; acc(random sample) = {accuracy}')
                if self.debug:
                    output = self._session.run(self._model, feed_dict=feed_dict)
                    print('expected', ys)
                    print('got', output)
            top = len(list(tdict.keys())) - 1
            curr = 0
            for i in tdict.keys():
                if not PARAMS.suppress_output:
                    status_update(curr, top, label=f'training size {i}')
                curr += 1
                xs, ys = next_batch(i)
                feed_dict = {
                    self._x_input: xs,
                    self._y_input: ys,
                    self._keep_prob: PARAMS.dropout_keep_probability
                }
                self._session.run(self._train_step, feed_dict=feed_dict)
        if not PARAMS.suppress_output:
            print('saving ...')
        saver = tf.train.Saver(self._tf_variables)
        saver.save(self._session, self._save_as, write_meta_graph=False)

def _test(self, tdict):
    nof_hits = 0
    nof_samples = 0
    top = len(list(tdict.keys()))
    curr = 0
    label = 'calculating accuracy'
    if not PARAMS.suppress_output:
        status_update(curr, top, label=label)
    for tpl_lst in tdict.values():
        nprand.shuffle(tpl_lst)
        nof_samples += len(tpl_lst)
        xs, ys = zip(*tpl_lst)
        i = 0
        while i < len(xs):
            k = min(i + PARAMS.batch_size, len(xs))
            batch_xs = list(xs[i:k])
            batch_ys = list(ys[i:k])
            missing = PARAMS.batch_size - len(batch_xs)
            if missing > 0:
                batch_xs += [batch_xs[0]] * missing
                batch_ys += [batch_ys[0]] * missing
            if self.debug:
                assert len(batch_xs) == PARAMS.batch_size
                assert len(batch_ys) == PARAMS.batch_size
            nof_hits += self._session.run(
                tf.reduce_sum(
                    tf.cast(tf.slice(self._correct_predictions, [0], [k - i]),
                            tf.float32)),
                feed_dict={
                    self._x_input: batch_xs,
                    self._y_input: batch_ys,
                    self._keep_prob: 1
                })
            i += PARAMS.batch_size
        if not PARAMS.suppress_output:
            curr += 1
            status_update(curr, top, label=label)
    return nof_hits / nof_samples

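# Shape of `tdict` as consumed by `_train_all_padded`, `_train_random` and
# `_test` above: a dict mapping sequence length to a list of (x, y) pairs, so
# every batch drawn from one bucket is rectangular. A minimal construction
# sketch (hypothetical helper, not part of this project):
def _bucket_by_length(pairs):
    """Group (representation, one-hot label) pairs by representation length."""
    buckets = {}
    for x, y in pairs:
        buckets.setdefault(len(x), []).append((x, y))
    return buckets
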
def kmeans(inputMatrix, k, iterationCount=20):
    numberOfPoints = inputMatrix.shape[0]
    dimensions = inputMatrix.shape[1]

    # start with random assignments and k distinct random points as centers
    clusterNumber = np.random.randint(0, k, numberOfPoints)
    centers = np.empty([k, dimensions])
    randomNumbers = np.random.choice(numberOfPoints, k, replace=False)
    for i in range(k):
        centers[i] = inputMatrix[randomNumbers[i]]

    for iteration in range(iterationCount):
        error = 0
        # -------------------- cluster assignment --------------------
        for i in range(numberOfPoints):
            if i % 1000 == 0:
                status_update(i + iteration * numberOfPoints,
                              numberOfPoints * iterationCount - 1)
            assignPoint = inputMatrix[i]
            bestDistance = np.inf
            for j in range(k):
                distance = np.linalg.norm(assignPoint - centers[j])
                if distance < bestDistance:
                    bestDistance = distance
                    clusterNumber[i] = j
            # accumulate only the distance to the nearest center
            # (monitoring value; was previously summed over all centers)
            error += bestDistance
        # -------------------- new mean computation --------------------
        oldCenters = centers
        centers = np.zeros([k, dimensions])
        counts = np.zeros([k, 1])
        for i in range(numberOfPoints):
            assignedCenterNumber = clusterNumber[i]
            centers[assignedCenterNumber] += inputMatrix[i]
            counts[assignedCenterNumber] += 1
        for j in range(k):
            if counts[j] > 0:
                centers[j] = centers[j] / counts[j]
            else:
                # keep the previous center if the cluster received no points,
                # avoiding a division by zero
                centers[j] = oldCenters[j]
    return centers

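# Usage sketch (hypothetical data; assumes this module's `status_update`
# helper is available, since `kmeans` reports progress through it):
if __name__ == '__main__':
    demoPoints = np.random.rand(500, 2)  # 500 random 2-D points
    demoCenters = kmeans(demoPoints, k=4, iterationCount=10)
    print(demoCenters)  # one row per cluster center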